346 files changed, 83297 insertions, 0 deletions
diff --git a/src/common/AsyncOpTracker.cc b/src/common/AsyncOpTracker.cc
new file mode 100644
index 000000000..fb6439d38
--- /dev/null
+++ b/src/common/AsyncOpTracker.cc
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/AsyncOpTracker.h"
+#include "include/Context.h"
+
+AsyncOpTracker::AsyncOpTracker()
+{  // empty body: no work is performed at construction time
+}
+
+AsyncOpTracker::~AsyncOpTracker() {
+  std::lock_guard locker(m_lock);
+  ceph_assert(m_pending_ops == 0);  // asserts that no tracked op is still pending
+}
+
+void AsyncOpTracker::start_op() {  // count one more pending op, under m_lock
+  std::lock_guard locker(m_lock);
+  ++m_pending_ops;
+}
+
+void AsyncOpTracker::finish_op() {
+  Context *waiter = nullptr;  // stays null unless this was the last pending op
+  {
+    std::lock_guard guard(m_lock);
+    ceph_assert(m_pending_ops > 0);
+    if (--m_pending_ops > 0) {
+      return;  // other ops are still pending
+    }
+    std::swap(waiter, m_on_finish);  // take the registered waiter, if any
+  }
+  if (waiter != nullptr) {  // completed only after m_lock has been released
+    waiter->complete(0);
+  }
+}
+
+void AsyncOpTracker::wait_for_ops(Context *on_finish) {
+  {
+    std::lock_guard locker(m_lock);
+    ceph_assert(m_on_finish == nullptr);  // a waiter must not already be registered
+    if (m_pending_ops > 0) {
+      m_on_finish = on_finish;  // finish_op() completes it once the count reaches zero
+      return;
+    }
+  }
+  on_finish->complete(0);  // nothing pending: complete immediately, outside the lock
+}
+
+bool AsyncOpTracker::empty() {  // true when no ops are pending at the time of the call
+  std::lock_guard locker(m_lock);
+  return (m_pending_ops == 0);
+}
+
diff --git a/src/common/AsyncOpTracker.h b/src/common/AsyncOpTracker.h
new file mode 100644
index 000000000..dfa913ad4
--- /dev/null
+++ b/src/common/AsyncOpTracker.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ASYNC_OP_TRACKER_H
+#define CEPH_ASYNC_OP_TRACKER_H
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+
+class AsyncOpTracker {  // counts pending async ops and lets one waiter be notified when none remain
+public:
+  AsyncOpTracker();
+  ~AsyncOpTracker();
+
+  void start_op();   // register one pending op
+  void finish_op();  // mark one pending op as done
+
+  void wait_for_ops(Context *on_finish);  // on_finish is completed with 0 once no ops are pending
+
+  bool empty();  // true if no ops are pending
+
+private:
+  ceph::mutex m_lock = ceph::make_mutex("AsyncOpTracker::m_lock");
+  uint32_t m_pending_ops = 0;      // ops started but not yet finished
+  Context *m_on_finish = nullptr;  // waiter registered via wait_for_ops(), if any
+
+};
+
+class C_TrackedOp : public Context {  // Context wrapper that keeps one tracker op open until it finishes
+public:
+  C_TrackedOp(AsyncOpTracker& async_op_tracker, Context* on_finish)
+    : m_async_op_tracker(async_op_tracker), m_on_finish(on_finish) {
+    m_async_op_tracker.start_op();  // the op is registered at construction
+  }
+
+  void finish(int r) override {
+    if (m_on_finish != nullptr) {
+      m_on_finish->complete(r);  // forward the result to the wrapped context first
+    }
+    m_async_op_tracker.finish_op();  // then release the op on the tracker
+  }
+
+private:
+  AsyncOpTracker& m_async_op_tracker;
+  Context* m_on_finish;  // wrapped callback; may be null
+};
+
+#endif // CEPH_ASYNC_OP_TRACKER_H
diff --git a/src/common/AsyncReserver.h b/src/common/AsyncReserver.h
new file mode 100644
index 000000000..b80f9e7df
--- /dev/null
+++ b/src/common/AsyncReserver.h
@@ -0,0 +1,320 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef ASYNC_RESERVER_H
+#define ASYNC_RESERVER_H
+
+#include "common/Formatter.h"
+
+#define rdout(x) lgeneric_subdout(cct,reserver,x)
+
+/**
+ * Manages a configurable number of asynchronous reservations.
+ *
+ * Memory usage is linear with the number of items queued and
+ * linear with respect to the total number of priorities used
+ * over all time.
+ */
+template <typename T, typename F>
+class AsyncReserver {
+  CephContext *cct;
+  F *f;
+  unsigned max_allowed;
+  unsigned min_priority;
+  ceph::mutex lock = ceph::make_mutex("AsyncReserver::lock");
+
+  struct Reservation {
+    T item;
+    unsigned prio = 0;
+    Context *grant = nullptr;    // callback queued when the reservation is granted
+    Context *preempt = nullptr;  // callback queued on preemption; null if not preemptible
+    Reservation() {}
+    Reservation(T key, unsigned priority, Context *on_grant, Context *on_preempt = nullptr)
+      : item(key), prio(priority), grant(on_grant), preempt(on_preempt) {}
+    void dump(ceph::Formatter *f) const {
+      f->dump_stream("item") << item;
+      f->dump_unsigned("prio", prio);
+      f->dump_bool("can_preempt", preempt != nullptr);
+    }
+    friend std::ostream& operator<<(std::ostream& out, const Reservation& r) {
+      return out << r.item << "(prio " << r.prio
+		 << " grant " << r.grant << " preempt " << r.preempt << ")";
+    }
+  };
+
+  std::map<unsigned, std::list<Reservation>> queues;
+  std::map<T, std::pair<unsigned, typename std::list<Reservation>::iterator>> queue_pointers;
+  std::map<T,Reservation> in_progress;
+  std::set<std::pair<unsigned,T>> preempt_by_prio;  ///< in_progress that can be preempted
+
+  void preempt_one() {  // evict the lowest-priority preemptible in-progress reservation
+    ceph_assert(!preempt_by_prio.empty());
+    auto q = in_progress.find(preempt_by_prio.begin()->second);  // begin() is the smallest (prio, item) pair
+    ceph_assert(q != in_progress.end());
+    Reservation victim = q->second;  // copy, since the map entry is erased below
+    rdout(10) << __func__ << " preempt " << victim << dendl;
+    f->queue(victim.preempt);  // hand the preempt callback to f
+    victim.preempt = nullptr;
+    in_progress.erase(q);
+    preempt_by_prio.erase(preempt_by_prio.begin());
+  }
+
+  void do_queues() {  // preempt what no longer fits, then grant queued requests while there is room
+    rdout(20) << __func__ << ":\n";  // NOTE(review): the dump below writes into *_dout before dendl; presumably the macro pair scopes it to the enabled-log case -- confirm
+    ceph::JSONFormatter jf(true);
+    jf.open_object_section("queue");
+    _dump(&jf);
+    jf.close_section();
+    jf.flush(*_dout);
+    *_dout << dendl;
+
+    // in case min_priority was adjusted up or max_allowed was adjusted down
+    while (!preempt_by_prio.empty() &&
+	   (in_progress.size() > max_allowed ||
+	    preempt_by_prio.begin()->first < min_priority)) {
+      preempt_one();
+    }
+
+    while (!queues.empty()) {
+      // choose highest priority queue
+      auto it = queues.end();
+      --it;
+      ceph_assert(!it->second.empty());
+      if (it->first < min_priority) {
+	break;  // even the highest waiting priority is below the minimum
+      }
+      if (in_progress.size() >= max_allowed &&
+	  !preempt_by_prio.empty() &&
+	  it->first > preempt_by_prio.begin()->first) {
+	preempt_one();  // make room by evicting a lower-priority preemptible holder
+      }
+      if (in_progress.size() >= max_allowed) {
+	break; // no room
+      }
+      // grant
+      Reservation p = it->second.front();
+      rdout(10) << __func__ << " grant " << p << dendl;
+      queue_pointers.erase(p.item);
+      it->second.pop_front();
+      if (it->second.empty()) {
+	queues.erase(it);  // drop the now-empty priority bucket
+      }
+      f->queue(p.grant);
+      p.grant = nullptr;  // the copy stored in in_progress keeps no grant pointer
+      in_progress[p.item] = p;
+      if (p.preempt) {
+	preempt_by_prio.insert(std::make_pair(p.prio, p.item));
+      }
+    }
+  }
+public:
+  AsyncReserver(
+    CephContext *cct,
+    F *f,                       // grant/preempt callbacks are handed to f->queue()
+    unsigned max_allowed,       // cap on concurrently in-progress reservations
+    unsigned min_priority = 0)  // queued requests below this priority are not granted
+    : cct(cct),
+      f(f),
+      max_allowed(max_allowed),
+      min_priority(min_priority) {}
+
+  void set_max(unsigned max) {  // change the concurrency cap, then re-evaluate the queues
+    std::lock_guard l(lock);
+    max_allowed = max;
+    do_queues();
+  }
+
+  void set_min_priority(unsigned min) {  // change the minimum priority, then re-evaluate the queues
+    std::lock_guard l(lock);
+    min_priority = min;
+    do_queues();
+  }
+
+  /**
+   * Update the priority of a reservation
+   *
+   * Note, on_reserved may be called following update_priority.  Thus,
+   * the callback must be safe in that case.  Callback will be called
+   * with no locks held.  cancel_reservation must be called to release the
+   * reservation slot.
+   *
+   * Cases
+   * 1. Item is queued, re-queue with new priority
+   * 2. Item is queued, re-queue and preempt if new priority higher than an in progress item
+   * 3. Item is in progress, just adjust priority if no higher priority waiting
+   * 4. Item is in progress: adjust priority; if higher priority items are waiting, the item may be preempted
+   *
+   */
+  void update_priority(T item, unsigned newprio) {
+    std::lock_guard l(lock);
+    auto i = queue_pointers.find(item);
+    if (i != queue_pointers.end()) {
+      unsigned prio = i->second.first;
+      if (newprio == prio)
+        return;  // unchanged priority: nothing to do
+      Reservation r = *i->second.second;  // copy before the list node is erased
+      rdout(10) << __func__ << " update " << r << " (was queued)" << dendl;
+      // Like cancel_reservation() without preempting
+      queues[prio].erase(i->second.second);
+      if (queues[prio].empty()) {
+	queues.erase(prio);
+      }
+      queue_pointers.erase(i);
+
+      // Like request_reservation() to re-queue it but with new priority
+      ceph_assert(!queue_pointers.count(item) &&
+	   !in_progress.count(item));
+      r.prio = newprio;
+      queues[newprio].push_back(r);  // goes to the back of the new priority's queue
+      queue_pointers.insert(std::make_pair(item,
+				    std::make_pair(newprio,--(queues[newprio]).end())));
+    } else {
+      auto p = in_progress.find(item);
+      if (p != in_progress.end()) {
+        if (p->second.prio == newprio)
+          return;  // unchanged priority: nothing to do
+	rdout(10) << __func__ << " update " << p->second
+		  << " (in progress)" << dendl;
+        // Preempt if the priority went down and is now
+        // lower than the highest priority waiting
+	if (p->second.preempt) {
+	  if (newprio < p->second.prio && !queues.empty()) {
+            // choose highest priority queue
+            auto it = queues.end();
+            --it;
+            ceph_assert(!it->second.empty());
+            if (it->first > newprio) {  // only logs here; the preemption itself is left to do_queues()
+	      rdout(10) << __func__ << " update " << p->second
+		        << " lowered priority let do_queues() preempt it" << dendl;
+            }
+          }
+	  preempt_by_prio.erase(std::make_pair(p->second.prio, p->second.item));  // re-key under the new priority
+          p->second.prio = newprio;
+	  preempt_by_prio.insert(std::make_pair(p->second.prio, p->second.item));
+	} else {
+          p->second.prio = newprio;  // not preemptible: just record the new priority
+        }
+      } else {
+	rdout(10) << __func__ << " update " << item << " (not found)" << dendl;
+      }
+    }
+    do_queues();
+    return;
+  }
+
+  void dump(ceph::Formatter *f) {  // locked wrapper around _dump()
+    std::lock_guard l(lock);
+    _dump(f);
+  }
+  void _dump(ceph::Formatter *f) {  // emit limits, per-priority queues and in-progress items; takes no lock itself
+    f->dump_unsigned("max_allowed", max_allowed);
+    f->dump_unsigned("min_priority", min_priority);
+    f->open_array_section("queues");
+    for (auto& [prio, waiting] : queues) {
+      f->open_object_section("queue");
+      f->dump_unsigned("priority", prio);
+      f->open_array_section("items");
+      for (auto& res : waiting) {
+	f->dump_object("item", res);
+      }
+      f->close_section();  // items
+      f->close_section();  // queue
+    }
+    f->close_section();  // queues
+    f->open_array_section("in_progress");
+    for (auto& entry : in_progress) {
+      f->dump_object("item", entry.second);
+    }
+    f->close_section();  // in_progress
+  }
+
+  /**
+   * Requests a reservation
+   *
+   * Note, on_reserved may be called following cancel_reservation.  Thus,
+   * the callback must be safe in that case.  Callback will be called
+   * with no locks held.  cancel_reservation must be called to release the
+   * reservation slot.
+   */
+  void request_reservation(
+    T item,                   ///< [in] reservation key
+    Context *on_reserved,     ///< [in] callback to be called on reservation
+    unsigned prio,            ///< [in] priority
+    Context *on_preempt = 0   ///< [in] callback to be called if we are preempted (optional)
+    ) {
+    std::lock_guard l(lock);
+    Reservation r(item, prio, on_reserved, on_preempt);
+    rdout(10) << __func__ << " queue " << r << dendl;
+    ceph_assert(!queue_pointers.count(item) &&
+	   !in_progress.count(item));
+    queues[prio].push_back(r);  // append to the queue for this priority
+    queue_pointers.insert(std::make_pair(item,
+				    std::make_pair(prio,--(queues[prio]).end())));
+    do_queues();  // may queue the grant callback right away if there is room
+  }
+
+  /**
+   * Cancels reservation
+   *
+   * Frees the reservation under key for use.
+   * Note, after cancel_reservation, the reservation_callback may or
+   * may not still be called. 
+   */
+  void cancel_reservation(
+    T item                   ///< [in] key for reservation to cancel
+    ) {
+    std::lock_guard l(lock);
+    auto i = queue_pointers.find(item);
+    if (i != queue_pointers.end()) {
+      unsigned prio = i->second.first;
+      const Reservation& r = *i->second.second;  // refers into the list node; used before the erase below
+      rdout(10) << __func__ << " cancel " << r << " (was queued)" << dendl;
+      delete r.grant;    // still queued: neither callback was handed to f, so free both here
+      delete r.preempt;
+      queues[prio].erase(i->second.second);
+      if (queues[prio].empty()) {
+	queues.erase(prio);
+      }
+      queue_pointers.erase(i);
+    } else {
+      auto p = in_progress.find(item);
+      if (p != in_progress.end()) {
+	rdout(10) << __func__ << " cancel " << p->second
+		  << " (was in progress)" << dendl;
+	if (p->second.preempt) {
+	  preempt_by_prio.erase(std::make_pair(p->second.prio, p->second.item));
+	  delete p->second.preempt;  // the preempt callback was never queued, so free it
+	}
+	in_progress.erase(p);
+      } else {
+	rdout(10) << __func__ << " cancel " << item << " (not found)" << dendl;
+      }
+    }
+    do_queues();  // a slot may have been freed: try to grant waiting requests
+  }
+
+  /**
+   * Has reservations
+   *
+   * Return true if there are reservations in progress
+   */
+  bool has_reservation() {  // checks in_progress only; queued requests do not count
+    std::lock_guard l(lock);
+    return !in_progress.empty();
+  }
+  static const unsigned MAX_PRIORITY = (unsigned)-1;
+};
+
+#undef rdout
+#endif
diff --git a/src/common/BackTrace.cc b/src/common/BackTrace.cc
new file mode 100644
index 000000000..03105d64a
--- /dev/null
+++ b/src/common/BackTrace.cc
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <ostream>
+#include <cxxabi.h>
+#include <string.h>
+
+#include "BackTrace.h"
+#include "common/version.h"
+#include "common/Formatter.h"
+
+namespace ceph {
+
+void ClibBackTrace::print(std::ostream& out) const
+{
+  out << " " << pretty_version_to_str() << std::endl;  // version line first
+  for (size_t i = skip; i < size; i++) {  // the first `skip` frames are omitted
+    out << " " << (i-skip+1) << ": " << demangle(strings[i]) << std::endl;  // frames are numbered from 1
+  }
+}
+
+void ClibBackTrace::dump(Formatter *f) const
+{
+  f->open_array_section("backtrace");
+  for (size_t i = skip; i < size; i++) {
+    // one "frame" string per frame after the skipped ones; no frame number is emitted
+    f->dump_string("frame", demangle(strings[i]));
+  }
+  f->close_section();
+}
+
+std::string ClibBackTrace::demangle(const char* name)
+{
+  // find the parentheses and address offset surrounding the mangled name
+#ifdef __FreeBSD__
+  static constexpr char OPEN = '<';
+#else
+  static constexpr char OPEN = '(';
+#endif
+  const char* begin = nullptr;
+  const char* end = nullptr;
+  for (const char *j = name; *j; ++j) {  // the last OPEN and the last '+' in the line win
+    if (*j == OPEN) {
+      begin = j + 1;
+    } else if (*j == '+') {
+      end = j;
+    }
+  }
+  if (begin && end && begin < end) {
+    std::string mangled(begin, end);  // text between OPEN and '+'
+    int status;
+    // only demangle a C++ mangled name
+    if (mangled.compare(0, 2, "_Z") == 0) {
+      // let __cxa_demangle do the malloc
+      char* demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+      if (!status) {
+        std::string full_name{OPEN};  // result starts at OPEN; text before it is dropped
+        full_name += demangled;
+        full_name += end;  // keep the rest of the line from '+' onwards
+        // the result buffer was allocated by __cxa_demangle, so release it with free()
+        free(demangled);
+        return full_name;
+      }
+      // demangle failed, just pretend it's a C function with no args
+    }
+    // C function
+    return mangled + "()";
+  } else {
+    // didn't find the mangled name, just print the whole line
+    return name;
+  }
+}
+
+void PyBackTrace::dump(Formatter *f) const
+{
+  f->open_array_section("backtrace");
+  for (auto& i : strings) {
+    f->dump_string("frame", i);  // stored strings are emitted unchanged
+  }
+  f->close_section();
+}
+
+void PyBackTrace::print(std::ostream& out) const
+{
+  for (auto& i : strings) {
+    out << i << std::endl;  // one stored string per line
+  }
+}
+
+}
diff --git a/src/common/BackTrace.h b/src/common/BackTrace.h
new file mode 100644
index 000000000..d89b34d3e
--- /dev/null
+++ b/src/common/BackTrace.h
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_BACKTRACE_H
+#define CEPH_BACKTRACE_H
+
+#include "acconfig.h"
+#include <iosfwd>
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
+#include <stdlib.h>
+
+#include <list>
+#include <string>
+
+namespace ceph {
+
+class Formatter;
+
+struct BackTrace {  // abstract interface: a backtrace that can be printed or dumped
+  virtual ~BackTrace() {}
+  virtual void print(std::ostream& out) const = 0;  // write as text to out
+  virtual void dump(Formatter *f) const = 0;        // write through a Formatter
+};
+
+inline std::ostream& operator<<(std::ostream& out, const BackTrace& bt) {  // stream insertion forwards to print()
+  bt.print(out);
+  return out;
+}
+
+
+struct ClibBackTrace : public BackTrace {
+  const static int max = 32;  // maximum number of frames captured
+
+  int skip;          // leading frames omitted by print()/dump()
+  void *array[max]{};
+  size_t size;       // number of usable entries in strings
+  char **strings;    // returned by backtrace_symbols(); released with free() in the destructor
+  // Capture the current call stack; s is the number of leading frames to omit from output.
+  explicit ClibBackTrace(int s) {
+#ifdef HAVE_EXECINFO_H
+    skip = s;
+    size = backtrace(array, max);
+    strings = backtrace_symbols(array, size);
+    if (strings == nullptr) { size = 0; }  // symbolization failed: expose no frames rather than index NULL
+#else
+    skip = 0;
+    size = 0;
+    strings = nullptr;
+#endif
+  }
+  ~ClibBackTrace() {
+    free(strings);
+  }
+  ClibBackTrace(const ClibBackTrace& other) = delete;  // non-copyable: strings is freed exactly once
+  const ClibBackTrace& operator=(const ClibBackTrace& other) = delete;
+
+  void print(std::ostream& out) const override;
+  void dump(Formatter *f) const override;
+
+  static std::string demangle(const char* name);
+};
+
+
+struct PyBackTrace : public BackTrace {  // backtrace held as a list of ready-made strings
+  std::list<std::string> strings;
+
+  explicit PyBackTrace(std::list<std::string>& s) : strings(s) {}  // copies the list
+
+  void dump(Formatter *f) const override;
+  void print(std::ostream& out) const override;
+};
+
+
+}
+
+#endif
diff --git a/src/common/CDC.cc b/src/common/CDC.cc
new file mode 100644
index 000000000..8aabeaffc
--- /dev/null
+++ b/src/common/CDC.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <random>
+
+#include "CDC.h"
+#include "FastCDC.h"
+#include "FixedCDC.h"
+
+std::unique_ptr<CDC> CDC::create(
+  const std::string& type,
+  int bits,
+  int windowbits)
+{
+  std::unique_ptr<CDC> chunker;  // stays null for an unrecognized type
+  if (type == "fastcdc") {
+    chunker.reset(new FastCDC(bits, windowbits));
+  } else if (type == "fixed") {
+    chunker.reset(new FixedCDC(bits, windowbits));
+  }
+  return chunker;
+}
+
+void generate_buffer(int size, bufferlist *outbl, int seed)
+{
+  // two identically-seeded engines: engine2 picks segment lengths, engine produces the data
+  std::mt19937_64 engine(seed), engine2(seed);
+
+  // assemble from randomly-sized segments!
+  outbl->clear();
+  auto left = size;
+  while (left > 0) {  // a non-positive size yields an empty bufferlist instead of a wrapped length
+    size_t l = std::min<size_t>((engine2() & 0xffff0) + 16, left);
+    left -= l;
+    bufferptr p(l);
+    p.set_length(l);
+    char *b = p.c_str();
+    for (size_t i = 0; i < l / sizeof(uint64_t); ++i) {
+      ((ceph_le64 *)b)[i] = ceph_le64(engine());
+    }
+    for (size_t i = l - l % sizeof(uint64_t); i < l; ++i) b[i] = static_cast<char>(engine());  // tail the 8-byte loop cannot reach
+    outbl->append(p);
+  }
+}
+
diff --git a/src/common/CDC.h b/src/common/CDC.h
new file mode 100644
index 000000000..5c4273a08
--- /dev/null
+++ b/src/common/CDC.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <vector>
+#include <string>
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+class CDC {  // abstract interface for splitting a bufferlist into chunks
+public:
+  virtual ~CDC() = default;
+
+  /// calculate chunk boundaries as vector of (offset, length) pairs
+  virtual void calc_chunks(
+    const bufferlist& inputdata,
+    std::vector<std::pair<uint64_t, uint64_t>> *chunks) const = 0;
+
+  /// set target chunk size as a power of 2, and number of bits for hard min/max
+  virtual void set_target_bits(int bits, int windowbits = 2) = 0;
+
+  static std::unique_ptr<CDC> create(  // factory keyed by type name ("fastcdc" or "fixed"); null if unknown
+    const std::string& type,
+    int bits,
+    int windowbits = 0);
+};
+
+void generate_buffer(int size, bufferlist *outbl, int seed = 0);
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
new file mode 100644
index 000000000..1297df324
--- /dev/null
+++ b/src/common/CMakeLists.txt
@@ -0,0 +1,246 @@
+add_library(common_buffer_obj OBJECT
+  buffer.cc)
+
+add_library(common_texttable_obj OBJECT
+  TextTable.cc)
+
+add_library(common_prioritycache_obj OBJECT
+  PriorityCache.cc)
+add_dependencies(common_prioritycache_obj legacy-option-headers)
+
+if(WIN32)
+  add_library(dlfcn_win32 STATIC win32/dlfcn.cc win32/errno.cc)
+endif()
+
+add_subdirectory(options)
+
+set(common_srcs
+  AsyncOpTracker.cc
+  BackTrace.cc
+  ConfUtils.cc
+  Cycles.cc
+  CDC.cc
+  DecayCounter.cc
+  FastCDC.cc
+  Finisher.cc
+  FixedCDC.cc
+  Formatter.cc
+  Graylog.cc
+  HTMLFormatter.cc
+  HeartbeatMap.cc
+  LogClient.cc
+  LogEntry.cc
+  ostream_temp.cc
+  OutputDataSocket.cc
+  PluginRegistry.cc
+  Readahead.cc
+  RefCountedObj.cc
+  SloppyCRCMap.cc
+  Thread.cc
+  Throttle.cc
+  Timer.cc
+  TracepointProvider.cc
+  TrackedOp.cc
+  WorkQueue.cc
+  admin_socket.cc
+  admin_socket_client.cc
+  assert.cc
+  bit_str.cc
+  bloom_filter.cc
+  ceph_argparse.cc
+  ceph_context.cc
+  ceph_crypto.cc
+  ceph_frag.cc
+  ceph_fs.cc
+  ceph_hash.cc
+  ceph_json.cc
+  ceph_strings.cc
+  ceph_releases.cc
+  ceph_time.cc
+  cmdparse.cc
+  code_environment.cc
+  common_init.cc
+  compat.cc
+  config.cc
+  config_values.cc
+  dout.cc
+  entity_name.cc
+  environment.cc
+  errno.cc
+  escape.cc
+  fd.cc
+  fs_types.cc
+  hex.cc
+  histogram.cc
+  hobject.cc
+  hostname.cc
+  ipaddr.cc
+  iso_8601.cc
+  mempool.cc
+  mime.c
+  numa.cc
+  openssl_opts_handler.cc
+  options.cc
+  page.cc
+  perf_counters.cc
+  perf_counters_collection.cc
+  perf_counters_key.cc
+  perf_histogram.cc
+  pick_address.cc
+  random_string.cc
+  reverse.c
+  run_cmd.cc
+  scrub_types.cc
+  signal.cc
+  snap_types.cc
+  str_list.cc
+  str_map.cc
+  strtol.cc
+  types.cc
+  url_escape.cc
+  pretty_binary.cc
+  utf8.c
+  util.cc
+  version.cc)
+
+if(WITH_SYSTEMD)
+  list(APPEND common_srcs
+    Journald.cc)
+endif()
+
+if(WITH_CEPH_DEBUG_MUTEX)
+  list(APPEND common_srcs
+    lockdep.cc
+    mutex_debug.cc
+    condition_variable_debug.cc
+    shared_mutex_debug.cc)
+endif()
+
+if(WIN32)
+  if(MINGW)
+    set(CMAKE_MC_COMPILER x86_64-w64-mingw32-windmc)
+    set(CMAKE_RC_COMPILER x86_64-w64-mingw32-windres)
+  endif()
+
+  add_custom_command(
+    OUTPUT ${CMAKE_BINARY_DIR}/src/common/event_logging.h
+    COMMAND ${CMAKE_MC_COMPILER} -b -e h -h ${CMAKE_BINARY_DIR}/src/common/
+      -r ${CMAKE_BINARY_DIR}/src/common ${CMAKE_SOURCE_DIR}/src/common/win32/event_logging.mc
+    COMMAND ${CMAKE_RC_COMPILER} ${CMAKE_BINARY_DIR}/src/common/event_logging.rc
+      -o ${CMAKE_BINARY_DIR}/src/common/event_logging.o
+    COMMAND ${CMAKE_CXX_COMPILER} -o ${CMAKE_BINARY_DIR}/bin/event_logging.dll -shared
+      ${CMAKE_BINARY_DIR}/src/common/event_logging.o
+    DEPENDS ${CMAKE_SOURCE_DIR}/src/common/win32/event_logging.mc)
+
+  set_source_files_properties(${CMAKE_SOURCE_DIR}/src/common/win32/syslog.cc
+    APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/common/event_logging.h)
+
+  include_directories(SYSTEM "${CMAKE_BINARY_DIR}/src/common/")
+
+  list(APPEND common_srcs
+    win32/blkdev.cc
+    win32/dns_resolve.cc
+    win32/ifaddrs.cc
+    win32/registry.cc
+    win32/service.cc
+    win32/SubProcess.cc
+    win32/syslog.cc)
+else()
+  list(APPEND common_srcs
+    blkdev.cc
+    dns_resolve.cc
+    linux_version.c
+    SubProcess.cc)
+endif()
+
+set_source_files_properties(${CMAKE_SOURCE_DIR}/src/common/version.cc
+  APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h)
+
+if(HAS_VTA)
+  set_source_files_properties(
+    config.cc
+    options.cc
+    PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+
+if(FREEBSD)
+  list(APPEND common_srcs freebsd_errno.cc)
+elseif(APPLE)
+  list(APPEND common_srcs darwin_errno.cc)
+elseif(SUN)
+  list(APPEND common_srcs solaris_errno.cc)
+elseif(AIX)
+  list(APPEND common_srcs aix_errno.cc)
+elseif(WIN32)
+  list(APPEND common_srcs win32/errno.cc)
+  list(APPEND common_srcs win32/wstring.cc)
+endif()
+
+if(WITH_EVENTTRACE)
+  list(APPEND common_srcs EventTrace.cc)
+endif()
+
+add_library(common-common-objs OBJECT
+  ${common_srcs})
+# Let's not rely on the default system headers and point Cmake to the
+# retrieved OpenSSL location. This is especially important when cross
+# compiling (e.g. targeting Windows).
+target_include_directories(common-common-objs PRIVATE ${OPENSSL_INCLUDE_DIR})
+# for options.cc
+target_compile_definitions(common-common-objs PRIVATE
+  "CMAKE_INSTALL_LIBDIR=\"${CMAKE_INSTALL_LIBDIR}\""
+  "CEPH_INSTALL_FULL_PKGLIBDIR=\"${CEPH_INSTALL_FULL_PKGLIBDIR}\""
+  "CEPH_INSTALL_DATADIR=\"${CEPH_INSTALL_DATADIR}\""
+  $<TARGET_PROPERTY:fmt::fmt,INTERFACE_COMPILE_DEFINITIONS>)
+add_dependencies(common-common-objs legacy-option-headers)
+
+set(common_mountcephfs_srcs
+  armor.c
+  safe_io.c
+  module.c
+  addr_parsing.c)
+add_library(common_mountcephfs_objs OBJECT
+  ${common_mountcephfs_srcs})
+
+
+set(crc32_srcs
+  crc32c.cc
+  crc32c_intel_baseline.c
+  sctp_crc32.c)
+if(HAVE_INTEL)
+  list(APPEND crc32_srcs
+    crc32c_intel_fast.c)
+  if(HAVE_NASM_X64)
+    set(CMAKE_ASM_FLAGS "-i ${PROJECT_SOURCE_DIR}/src/isa-l/include/ ${CMAKE_ASM_FLAGS}")
+    list(APPEND crc32_srcs
+      ${PROJECT_SOURCE_DIR}/src/isa-l/crc/crc32_iscsi_00.asm
+      crc32c_intel_fast_zero_asm.s)
+  endif(HAVE_NASM_X64)
+elseif(HAVE_POWER8)
+  list(APPEND crc32_srcs
+    crc32c_ppc.c)
+  if(HAVE_PPC64LE)
+    list(APPEND crc32_srcs
+      crc32c_ppc_asm.S
+      crc32c_ppc_fast_zero_asm.S)
+  endif(HAVE_PPC64LE)
+elseif(HAVE_ARMV8_CRC)
+  list(APPEND crc32_srcs
+    crc32c_aarch64.c)
+endif(HAVE_INTEL)
+
+add_library(crc32 STATIC ${crc32_srcs})
+if(HAVE_ARMV8_CRC)
+  set_target_properties(crc32 PROPERTIES
+    COMPILE_FLAGS "${CMAKE_C_FLAGS} ${ARMV8_CRC_COMPILE_FLAGS}")
+endif()
+target_link_libraries(crc32
+  arch)
+
+add_library(common_utf8 STATIC utf8.c)
+
+if(HAVE_KEYUTILS)
+  set(parse_secret_srcs
+    secret.c)
+  add_library(parse_secret_objs OBJECT ${parse_secret_srcs})
+endif()
diff --git a/src/common/Checksummer.h b/src/common/Checksummer.h
new file mode 100644
index 000000000..a42f5b682
--- /dev/null
+++ b/src/common/Checksummer.h
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_CHECKSUMMER
+#define CEPH_OS_BLUESTORE_CHECKSUMMER
+
+#include "include/buffer.h"
+#include "include/byteorder.h"
+#include "include/ceph_assert.h"
+
+#include "xxHash/xxhash.h"
+
+class Checksummer {
+public:
+  enum CSumType {
+    CSUM_NONE = 1,	// intentionally 1, to align with OSDMonitor's pool_opts_t handling: it treats 0 as unset, while we need to distinguish "none" from "unset"
+    CSUM_XXHASH32 = 2,
+    CSUM_XXHASH64 = 3,
+    CSUM_CRC32C = 4,
+    CSUM_CRC32C_16 = 5, // low 16 bits of crc32c
+    CSUM_CRC32C_8 = 6,  // low 8 bits of crc32c
+    CSUM_MAX,           // one past the last valid type
+  };
+  static const char *get_csum_type_string(unsigned t) {  // CSumType value -> name
+    switch (t) {
+    case CSUM_NONE: return "none";
+    case CSUM_XXHASH32: return "xxhash32";
+    case CSUM_XXHASH64: return "xxhash64";
+    case CSUM_CRC32C: return "crc32c";
+    case CSUM_CRC32C_16: return "crc32c_16";
+    case CSUM_CRC32C_8: return "crc32c_8";
+    default: return "???";  // not a known CSumType
+    }
+  }
+  static int get_csum_string_type(const std::string &s) {  // name -> CSumType value (exact match)
+    if (s == "none")
+      return CSUM_NONE;
+    if (s == "xxhash32")
+      return CSUM_XXHASH32;
+    if (s == "xxhash64")
+      return CSUM_XXHASH64;
+    if (s == "crc32c")
+      return CSUM_CRC32C;
+    if (s == "crc32c_16")
+      return CSUM_CRC32C_16;
+    if (s == "crc32c_8")
+      return CSUM_CRC32C_8;
+    return -EINVAL;  // unknown name
+  }
+
+  static size_t get_csum_init_value_size(int csum_type) {  // sizeof the algorithm's init_value_t
+    switch (csum_type) {
+    case CSUM_NONE: return 0;
+    case CSUM_XXHASH32: return sizeof(xxhash32::init_value_t);
+    case CSUM_XXHASH64: return sizeof(xxhash64::init_value_t);
+    case CSUM_CRC32C: return sizeof(crc32c::init_value_t);
+    case CSUM_CRC32C_16: return sizeof(crc32c_16::init_value_t);
+    case CSUM_CRC32C_8: return sizeof(crc32c_8::init_value_t);
+    default: return 0;  // unknown type
+    }
+  }
+  static size_t get_csum_value_size(int csum_type) {  // bytes per stored checksum value
+    switch (csum_type) {
+    case CSUM_NONE: return 0;
+    case CSUM_XXHASH32: return 4;
+    case CSUM_XXHASH64: return 8;
+    case CSUM_CRC32C: return 4;
+    case CSUM_CRC32C_16: return 2;
+    case CSUM_CRC32C_8: return 1;
+    default: return 0;  // unknown type
+    }
+  }
+
+  struct crc32c {  // algorithm policy: full 32-bit crc32c
+    typedef uint32_t init_value_t;  // seed type, also the type calc() returns
+    typedef ceph_le32 value_t;      // type of each stored checksum
+
+    // we have no execution context/state.
+    typedef int state_t;
+    static void init(state_t *state) {
+    }
+    static void fini(state_t *state) {
+    }
+
+    static init_value_t calc(
+      state_t state,
+      init_value_t init_value,
+      size_t len,
+      ceph::buffer::list::const_iterator& p
+      ) {
+      return p.crc32c(len, init_value);  // presumably consumes len bytes at p (callers loop on it) -- confirm
+    }
+  };
+
+  struct crc32c_16 {  // algorithm policy: crc32c truncated to its low 16 bits
+    typedef uint32_t init_value_t;  // seed type, also the type calc() returns
+    typedef ceph_le16 value_t;      // type of each stored checksum
+
+    // we have no execution context/state.
+    typedef int state_t;
+    static void init(state_t *state) {
+    }
+    static void fini(state_t *state) {
+    }
+
+    static init_value_t calc(
+      state_t state,
+      init_value_t init_value,
+      size_t len,
+      ceph::buffer::list::const_iterator& p
+      ) {
+      return p.crc32c(len, init_value) & 0xffff;  // keep only the low 16 bits
+    }
+  };
+
+  struct crc32c_8 {  // algorithm policy: crc32c truncated to its low 8 bits
+    typedef uint32_t init_value_t;  // seed type, also the type calc() returns
+    typedef __u8 value_t;           // type of each stored checksum
+
+    // we have no execution context/state.
+    typedef int state_t;
+    static void init(state_t *state) {
+    }
+    static void fini(state_t *state) {
+    }
+
+    static init_value_t calc(
+      state_t state,
+      init_value_t init_value,
+      size_t len,
+      ceph::buffer::list::const_iterator& p
+      ) {
+      return p.crc32c(len, init_value) & 0xff;  // keep only the low 8 bits
+    }
+  };
+
+  struct xxhash32 {  // algorithm policy: XXH32 via the streaming API
+    typedef uint32_t init_value_t;  // seed type, also the type calc() returns
+    typedef ceph_le32 value_t;      // type of each stored checksum
+
+    typedef XXH32_state_t *state_t;
+    static void init(state_t *s) {
+      *s = XXH32_createState();  // NOTE(review): the result is not checked for NULL
+    }
+    static void fini(state_t *s) {
+      XXH32_freeState(*s);
+    }
+
+    static init_value_t calc(
+      state_t state,
+      init_value_t init_value,
+      size_t len,
+      ceph::buffer::list::const_iterator& p
+      ) {
+      XXH32_reset(state, init_value);  // each call starts a fresh hash seeded with init_value
+      while (len > 0) {
+	const char *data;
+	size_t l = p.get_ptr_and_advance(len, &data);  // l is how many bytes this piece supplies
+	XXH32_update(state, data, l);
+	len -= l;
+      }
+      return XXH32_digest(state);
+    }
+  };
+
+  struct xxhash64 {  // algorithm policy: XXH64 via the streaming API
+    typedef uint64_t init_value_t;  // seed type, also the type calc() returns
+    typedef ceph_le64 value_t;      // type of each stored checksum
+
+    typedef XXH64_state_t *state_t;
+    static void init(state_t *s) {
+      *s = XXH64_createState();  // NOTE(review): the result is not checked for NULL
+    }
+    static void fini(state_t *s) {
+      XXH64_freeState(*s);
+    }
+
+    static init_value_t calc(
+      state_t state,
+      init_value_t init_value,
+      size_t len,
+      ceph::buffer::list::const_iterator& p
+      ) {
+      XXH64_reset(state, init_value);  // each call starts a fresh hash seeded with init_value
+      while (len > 0) {
+	const char *data;
+	size_t l = p.get_ptr_and_advance(len, &data);  // l is how many bytes this piece supplies
+	XXH64_update(state, data, l);
+	len -= l;
+      }
+      return XXH64_digest(state);
+    }
+  };
+
+  template<class Alg>
+  static int calculate(  // convenience overload: same as the overload below with init value -1
+    size_t csum_block_size,
+    size_t offset,
+    size_t length,
+    const ceph::buffer::list &bl,
+    ceph::buffer::ptr* csum_data
+    ) {
+    return calculate<Alg>(-1, csum_block_size, offset, length, bl, csum_data);
+  }
+
+  template<class Alg>
+  static int calculate(  // checksum `length` bytes of bl in csum_block_size blocks into csum_data; returns 0
+      typename Alg::init_value_t init_value,
+      size_t csum_block_size,
+      size_t offset,
+      size_t length,
+      const ceph::buffer::list &bl,
+      ceph::buffer::ptr* csum_data) {
+    ceph_assert(length % csum_block_size == 0);  // whole blocks only
+    size_t blocks = length / csum_block_size;
+    ceph::buffer::list::const_iterator p = bl.begin();  // data is read from the start of bl
+    ceph_assert(bl.length() >= length);
+
+    typename Alg::state_t state;
+    Alg::init(&state);
+
+    ceph_assert(csum_data->length() >= (offset + length) / csum_block_size *
+	   sizeof(typename Alg::value_t));
+
+    typename Alg::value_t *pv =
+      reinterpret_cast<typename Alg::value_t*>(csum_data->c_str());
+    pv += offset / csum_block_size;  // offset selects the first checksum slot to write, not a position in bl
+    while (blocks--) {
+      *pv = Alg::calc(state, init_value, csum_block_size, p);
+      ++pv;
+    }
+    Alg::fini(&state);
+    return 0;
+  }
+
+  template<class Alg>
+  static int verify(  // recompute block checksums of bl and compare with csum_data; -1 if all match
+    size_t csum_block_size,
+    size_t offset,
+    size_t length,
+    const ceph::buffer::list &bl,
+    const ceph::buffer::ptr& csum_data,
+    uint64_t *bad_csum=0  // optional out: the computed checksum of the first mismatching block
+    ) {
+    ceph_assert(length % csum_block_size == 0);
+    ceph::buffer::list::const_iterator p = bl.begin();
+    ceph_assert(bl.length() >= length);
+
+    typename Alg::state_t state;
+    Alg::init(&state);
+
+    const typename Alg::value_t *pv =
+      reinterpret_cast<const typename Alg::value_t*>(csum_data.c_str());
+    pv += offset / csum_block_size;  // first stored checksum to compare against
+    size_t pos = offset;
+    while (length > 0) {
+      typename Alg::init_value_t v = Alg::calc(state, -1, csum_block_size, p);  // always seeded with -1
+      if (*pv != v) {
+	if (bad_csum) {
+	  *bad_csum = v;
+	}
+	Alg::fini(&state);
+	return pos;  // NOTE(review): pos is size_t but the return type is int; large offsets would be truncated
+      }
+      ++pv;
+      pos += csum_block_size;
+      length -= csum_block_size;
+    }
+    Alg::fini(&state);
+    return -1;  // no errors
+  }
+};
+
+#endif
diff --git a/src/common/Clock.h b/src/common/Clock.h
new file mode 100644
index 000000000..b47954ad1
--- /dev/null
+++ b/src/common/Clock.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CLOCK_H
+#define CEPH_CLOCK_H
+
+#include "include/utime.h"
+
+#include <time.h>
+
+/// Current wall-clock time: CLOCK_REALTIME on Linux, gettimeofday() elsewhere.
+static inline utime_t ceph_clock_now()
+{
+#if defined(__linux__)
+  struct timespec now_ts;
+  clock_gettime(CLOCK_REALTIME, &now_ts);
+  return utime_t(now_ts);
+#else
+  struct timeval now_tv;
+  gettimeofday(&now_tv, nullptr);
+  return utime_t(&now_tv);
+#endif
+}
+
+#endif
diff --git a/src/common/CommandTable.h b/src/common/CommandTable.h
new file mode 100644
index 000000000..53218d653
--- /dev/null
+++ b/src/common/CommandTable.h
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef COMMAND_TABLE_H_
+#define COMMAND_TABLE_H_
+
+#include "messages/MCommand.h"
+#include "messages/MMgrCommand.h"
+
+/// State of one command: connection, tid, arguments, input payload and the
+/// caller-provided completion and output pointers.
+class CommandOp
+{
+  public:
+  ConnectionRef con;
+  ceph_tid_t tid;
+
+  std::vector<std::string> cmd;
+  ceph::buffer::list inbl;
+  Context *on_finish;
+  ceph::buffer::list *outbl;
+  std::string *outs;
+
+  /// Build the message for this op: an MMgrCommand when mgr is true, an
+  /// MCommand otherwise; either way it carries cmd, inbl and tid.
+  MessageRef get_message(const uuid_d &fsid,
+                         bool mgr=false) const
+  {
+    auto populate = [this](auto msg) -> MessageRef {
+      msg->cmd = cmd;
+      msg->set_data(inbl);
+      msg->set_tid(tid);
+      return msg;
+    };
+    if (mgr)
+      return populate(ceph::make_message<MMgrCommand>(fsid));
+    return populate(ceph::make_message<MCommand>(fsid));
+  }
+
+  CommandOp(const ceph_tid_t t)
+    : tid(t), on_finish(nullptr), outbl(nullptr), outs(nullptr) {}
+  CommandOp() : CommandOp(0) {}
+};
+
+/**
+ * Hold client-side state for a collection of in-flight commands
+ * to a remote service.
+ */
+template<typename T>
+class CommandTable
+{
+protected:
+  ceph_tid_t last_tid;               ///< tid given to the next command
+  std::map<ceph_tid_t, T> commands;  ///< registered commands, keyed by tid
+
+public:
+  CommandTable() : last_tid(0) {}
+
+  ~CommandTable()
+  {
+    ceph_assert(commands.empty());  // all commands must be gone by now
+  }
+
+  /// Take the next tid, register a T constructed from it and return it.
+  T& start_command()
+  {
+    const ceph_tid_t new_tid = last_tid;
+    ++last_tid;
+    commands.insert(std::make_pair(new_tid, T(new_tid)));
+    return commands.at(new_tid);
+  }
+
+  const std::map<ceph_tid_t, T> &get_commands() const
+  {
+    return commands;
+  }
+
+  /// True if a command with this tid is registered.
+  bool exists(ceph_tid_t tid) const
+  {
+    return commands.find(tid) != commands.end();
+  }
+
+  /// Registered command for tid; std::map::at() throws if there is none.
+  T& get_command(ceph_tid_t tid)
+  {
+    return commands.at(tid);
+  }
+
+  void erase(ceph_tid_t tid)
+  {
+    commands.erase(tid);
+  }
+
+  void clear() {
+    commands.clear();
+  }
+};
+
+#endif
+
diff --git a/src/common/Cond.h b/src/common/Cond.h
new file mode 100644
index 000000000..f41d0bf40
--- /dev/null
+++ b/src/common/Cond.h
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_COND_H
+#define CEPH_COND_H
+
+#include "common/Clock.h"
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+
+/**
+ * context to signal a cond
+ *
+ * Generic context to signal a cond and store the return value.  We
+ * assume the caller is holding the appropriate lock.
+ */
+class C_Cond : public Context {
+  ceph::condition_variable& cond;  ///< condition variable notified by finish()
+  bool *done;                      ///< set to true once finish() has run
+  int *rval;                       ///< receives the result passed to finish()
+public:
+  /// Clears *d up front; d and r are dereferenced unconditionally later on.
+  C_Cond(ceph::condition_variable &c, bool *d, int *r)
+    : cond(c), done(d), rval(r) { *done = false; }
+  void finish(int r) override {
+    *done = true;
+    *rval = r;
+    cond.notify_all();
+  }
+};
+
+/**
+ * context to signal a cond, protected by a lock
+ *
+ * Generic context to signal a cond under a specific lock. We take the
+ * lock in the finish() callback, so the finish() caller must not
+ * already hold it.
+ */
+class C_SafeCond : public Context {
+  ceph::mutex& lock;               ///< mutex taken by finish()
+  ceph::condition_variable& cond;  ///< condition variable to notify
+  bool *done;                      ///< set to true by finish()
+  int *rval;                       ///< optional: receives the result
+public:
+  C_SafeCond(ceph::mutex& l, ceph::condition_variable& c, bool *d, int *r=0)
+    : lock(l), cond(c), done(d), rval(r) { *done = false; }
+  /// Takes lock, stores r (when requested), flags completion and notifies.
+  void finish(int r) override {
+    std::lock_guard guard{lock};
+    if (rval != nullptr) {
+      *rval = r;
+    }
+    *done = true;
+    cond.notify_all();
+  }
+};
+
+/**
+ * Context providing a simple wait() mechanism to wait for completion
+ *
+ * The context will not be deleted as part of complete and must live
+ * until wait() returns.
+ */
+class C_SaferCond : public Context {
+  ceph::mutex lock;               ///< protects done and rval
+  ceph::condition_variable cond;  ///< notified by complete()
+  bool done = false;              ///< true once complete() has been called
+  int rval = 0;                   ///< value handed to complete()
+public:
+  C_SaferCond() :
+    C_SaferCond("C_SaferCond")
+  {}
+  explicit C_SaferCond(const std::string &name)
+    : lock(ceph::make_mutex(name)) {}
+  void finish(int r) override { complete(r); }
+
+  /// Overridden so that the context is not deleted on completion: the call
+  /// only records r and wakes up every waiter.
+  void complete(int r) override {
+    std::lock_guard guard(lock);
+    done = true;
+    rval = r;
+    cond.notify_all();
+  }
+
+  /// Block until complete() has been called and return the value it was given
+  int wait() {
+    std::unique_lock guard{lock};
+    cond.wait(guard, [this] { return done; });
+    return rval;
+  }
+
+  /// Wait until \c secs expires or \c complete() is called
+  int wait_for(double secs) {
+    return wait_for(ceph::make_timespan(secs));
+  }
+
+  /// Same for a timespan: rval if completed in time, else (positive) ETIMEDOUT.
+  int wait_for(ceph::timespan secs) {
+    std::unique_lock guard{lock};
+    const bool completed =
+      done || cond.wait_for(guard, secs, [this] { return done; });
+    if (!completed) {
+      return ETIMEDOUT;
+    }
+    return rval;
+  }
+};
+
+#endif
diff --git a/src/common/ConfUtils.cc b/src/common/ConfUtils.cc
new file mode 100644
index 000000000..2f78fd02b
--- /dev/null
+++ b/src/common/ConfUtils.cc
@@ -0,0 +1,340 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+// #define BOOST_SPIRIT_DEBUG
+
+#include <algorithm>
+#include <cctype>
+#include <experimental/iterator>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <map>
+#include <sstream>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/trim_all.hpp>
+#include <boost/spirit/include/qi.hpp>
+#include <boost/phoenix.hpp>
+#include <boost/spirit/include/support_line_pos_iterator.hpp>
+
+#include "include/buffer.h"
+#include "common/errno.h"
+#include "common/utf8.h"
+#include "common/ConfUtils.h"
+
+namespace fs = std::filesystem;
+
+using std::ostringstream;
+using std::string;
+
+#define MAX_CONFIG_FILE_SZ 0x40000000
+
+// Store the key in normalized form and the value with any leading and
+// trailing whitespace removed.
+conf_line_t::conf_line_t(const std::string& key, const std::string& val)
+  : key{ConfFile::normalize_key_name(key)},
+    val{boost::algorithm::trim_copy_if(
+          val, [](unsigned char ch) { return std::isspace(ch); })}
+{
+}
+
+// Lines are ordered by key alone, so a std::set of lines holds at most one
+// line per key; the value takes no part in the comparison.
+bool conf_line_t::operator<(const conf_line_t &rhs) const
+{
+  const int order = key.compare(rhs.key);
+  return order < 0;
+}
+
+// Debug representation of a single key/value line.
+std::ostream &operator<<(std::ostream& oss, const conf_line_t &l)
+{
+  return oss << "conf_line_t(key = '" << l.key << "', val='" << l.val << "')";
+}
+
+// Later lines replace earlier ones that compare equal (i.e. have the same key).
+conf_section_t::conf_section_t(const std::string& heading,
+                               const std::vector<conf_line_t>& lines)
+  : heading{heading}
+{
+  for (const conf_line_t& entry : lines) {
+    if (auto [pos, is_new] = insert(entry); !is_new) {
+      erase(pos);
+      insert(entry);
+    }
+  }
+}
+
+///////////////////////// ConfFile //////////////////////////
+
+// Sections sharing a heading are merged into one; on a key collision the line
+// from the later section replaces the earlier one.
+ConfFile::ConfFile(const std::vector<conf_section_t>& sections)
+{
+  for (const conf_section_t& incoming : sections) {
+    auto [slot, is_new_section] = emplace(incoming.heading, incoming);
+    if (is_new_section)
+      continue;
+    conf_section_t& merged = slot->second;
+    for (const conf_line_t& entry : incoming) {
+      if (auto [pos, is_new_line] = merged.emplace(entry); !is_new_line) {
+        merged.erase(pos);
+        merged.insert(entry);
+      }
+    }
+  }
+}
+
+/* Parse the configuration file fname into this object (replacing its
+ * previous contents).  The whole file is loaded into memory and handed to
+ * parse_buffer(), which is shared with parse_bufferlist(); configuration
+ * files should be a few kilobytes at maximum, so this shouldn't be a problem.
+ * Diagnostics go to warnings, which may be null.  Returns 0 on success,
+ * -EINVAL for an oversized or unparsable file, or the negated error code
+ * reported by the filesystem when the file size cannot be determined.
+ */
+int ConfFile::parse_file(const std::string &fname,
+			 std::ostream *warnings)
+{
+  clear();
+  // like parse_bufferlist(): a null stream must not be dereferenced below
+  std::ostringstream discarded;
+  if (!warnings) {
+    warnings = &discarded;
+  }
+  try {
+    if (auto file_size = fs::file_size(fname); file_size > MAX_CONFIG_FILE_SZ) {
+      *warnings << __func__ << ": config file '" << fname
+		<< "' is " << file_size << " bytes, "
+		<< "but the maximum is " << MAX_CONFIG_FILE_SZ;
+      return -EINVAL;
+    }
+  } catch (const fs::filesystem_error& e) {
+    std::error_code ec;
+    // not a regular file (e.g. /dev/null)? then there is nothing to parse
+    if (fs::is_other(fname, ec) && !ec) {
+      return 0;
+    }
+    *warnings << __func__ << ": " << e.what();
+    return -e.code().value();
+  }
+  std::ifstream ifs{fname};
+  std::string buffer{std::istreambuf_iterator<char>(ifs),
+		     std::istreambuf_iterator<char>()};
+  return parse_buffer(buffer, warnings) ? 0 : -EINVAL;
+}
+
+namespace {
+
+namespace qi = boost::spirit::qi;
+namespace phoenix = boost::phoenix;
+
+template<typename Iterator, typename Skipper>
+struct IniGrammer : qi::grammar<Iterator, ConfFile(), Skipper>  // Spirit.Qi grammar whose result attribute is a ConfFile
+{
+  struct error_handler_t {  // writes a "parse error" message to os
+    std::ostream& os;
+    template<typename Iter>
+    auto operator()(Iter first, Iter last, Iter where,
+		    const boost::spirit::info& what) const {
+      auto line_start = boost::spirit::get_line_start(first, where);
+      os << "parse error: expected '" << what
+	 << "' in line " << boost::spirit::get_line(where)
+	 << " at position " << boost::spirit::get_column(line_start, where) << "\n";
+      return qi::fail;
+    }
+  };
+  IniGrammer(Iterator begin, std::ostream& err)  // NOTE(review): 'begin' is not referenced in this constructor
+    : IniGrammer::base_type{conf_file},  // conf_file is the start rule
+      report_error{error_handler_t{err}}  // parse errors are written to err
+  {
+    using qi::_1;
+    using qi::_2;
+    using qi::_val;
+    using qi::char_;
+    using qi::eoi;
+    using qi::eol;
+    using qi::blank;
+    using qi::lexeme;
+    using qi::lit;
+    using qi::raw;
+
+    blanks = *blank;  // zero or more blank characters
+    comment_start = lit('#') | lit(';');  // '#' or ';' starts a comment
+    continue_marker = lit('\\') >> eol;  // backslash directly followed by end of line
+
+    text_char %=
+      (lit('\\') >> (char_ - eol)) |  // a backslash followed by any non-eol character
+      (char_ - (comment_start | eol));  // or any character that is neither a comment start nor eol
+
+    key %= raw[+(text_char - char_("=[ ")) % +blank];  // words without '=', '[' or ' ', separated by blanks, kept as raw text
+    quoted_value %=
+      lexeme[lit('"') >> *(text_char - '"') > '"'] |  // "..."; '>' makes the closing quote mandatory
+      lexeme[lit('\'') >> *(text_char - '\'') > '\''];  // '...'
+    unquoted_value %= *text_char;
+    comment = *blank >> comment_start > *(char_ - eol);  // optional blanks, comment start, rest of the line
+    empty_line = -(blanks|comment) >> eol;  // optional blanks or comment, then end of line
+    value %= quoted_value | unquoted_value;
+    key_val =  // key = value followed by at least one empty_line; builds a conf_line_t
+      (blanks >> key >> blanks >> '=' > blanks > value > +empty_line)
+      [_val = phoenix::construct<conf_line_t>(_1, _2)];
+
+    heading %= lit('[') > +(text_char - ']') > ']' > +empty_line;  // [name] followed by at least one empty_line
+    section =  // heading plus its key/value lines; builds a conf_section_t
+      (heading >> *(key_val - heading) >> *eol)
+      [_val = phoenix::construct<conf_section_t>(_1, _2)];
+    conf_file =  // either a lone key=val or any number of sections, then end of input
+      (key_val [_val = phoenix::construct<ConfFile>(_1)]
+       |
+       (*eol >> (*section)[_val = phoenix::construct<ConfFile>(_1)])
+      ) > eoi;
+
+    empty_line.name("empty_line");  // names given to the rules via qi::rule::name()
+    key.name("key");
+    quoted_value.name("quoted value");
+    unquoted_value.name("unquoted value");
+    key_val.name("key=val");
+    heading.name("section name");
+    section.name("section");
+
+    qi::on_error<qi::fail>(  // on an error in conf_file: call report_error and fail the parse
+      conf_file,
+      report_error(qi::_1, qi::_2, qi::_3, qi::_4));
+
+    BOOST_SPIRIT_DEBUG_NODE(heading);  // NOTE(review): presumably only active with BOOST_SPIRIT_DEBUG defined - confirm
+    BOOST_SPIRIT_DEBUG_NODE(section);
+    BOOST_SPIRIT_DEBUG_NODE(key);
+    BOOST_SPIRIT_DEBUG_NODE(quoted_value);
+    BOOST_SPIRIT_DEBUG_NODE(unquoted_value);
+    BOOST_SPIRIT_DEBUG_NODE(key_val);
+    BOOST_SPIRIT_DEBUG_NODE(conf_file);
+  }
+
+  qi::rule<Iterator> blanks;
+  qi::rule<Iterator> empty_line;
+  qi::rule<Iterator> comment_start;
+  qi::rule<Iterator> continue_marker;  // also used by parse_buffer() to build the skipper
+  qi::rule<Iterator, char()> text_char;
+  qi::rule<Iterator, std::string(), Skipper> key;
+  qi::rule<Iterator, std::string(), Skipper> quoted_value;
+  qi::rule<Iterator, std::string(), Skipper> unquoted_value;
+  qi::rule<Iterator> comment;  // also used by parse_buffer() to build the skipper
+  qi::rule<Iterator, std::string(), Skipper> value;
+  qi::rule<Iterator, conf_line_t(), Skipper> key_val;
+  qi::rule<Iterator, std::string(), Skipper> heading;
+  qi::rule<Iterator, conf_section_t(), Skipper> section;
+  qi::rule<Iterator, ConfFile(), Skipper> conf_file;  // start rule
+  boost::phoenix::function<error_handler_t> report_error;  // lazy wrapper around error_handler_t
+};
+}
+
+// Parse INI-style text into *this; err (must be non-null) receives diagnostics.
+bool ConfFile::parse_buffer(std::string_view buf, std::ostream* err)
+{
+  assert(err);
+#ifdef _WIN32
+  // the config parsing fails unless the buffer ends with a new line: add one
+  std::string text = std::string(buf) + "\n";
+#else
+  std::string_view text = buf;
+#endif
+  if (const int bad_pos = check_utf8(text.data(), text.size()); bad_pos > 0) {
+    const auto bad_line =
+      std::count(text.begin(), std::next(text.begin(), bad_pos), '\n') + 1;
+    *err << "parse error: invalid UTF-8 found at line " << bad_line;
+    return false;
+  }
+  using pos_iter_t = boost::spirit::line_pos_iterator<decltype(text.begin())>;
+  using skipper_t = qi::rule<pos_iter_t>;
+  pos_iter_t cursor{text.begin()};
+  IniGrammer<pos_iter_t, skipper_t> grammar{cursor, *err};
+  skipper_t skipper = grammar.continue_marker | grammar.comment;
+  return qi::phrase_parse(cursor, pos_iter_t{text.end()}, grammar, skipper, *this);
+}
+
+// Parse the contents of bl; a null warnings stream is replaced by a local sink.
+int ConfFile::parse_bufferlist(ceph::bufferlist *bl,
+                               std::ostream *warnings)
+{
+  clear();
+  ostringstream sink;
+  std::ostream& out = warnings ? *warnings : sink;
+  const std::string_view text{bl->c_str(), bl->length()};
+  return parse_buffer(text, &out) ? 0 : -EINVAL;
+}
+
+// Look up key in section_name: 0 with val set on success, -ENOENT otherwise.
+int ConfFile::read(std::string_view section_name,
+                   std::string_view key,
+                   std::string &val) const
+{
+  const auto sec = base_type::find(section_name);
+  if (sec == end())
+    return -ENOENT;
+  const conf_section_t& lines = sec->second;
+  const auto hit = lines.find(conf_line_t{normalize_key_name(key), {}});
+  if (hit == lines.end())
+    return -ENOENT;
+  val = hit->val;
+  return 0;
+}
+
+/* Normalize a key name: strip leading/trailing whitespace and store inner
+ * whitespace as underscores.  This normal form lets common/config.cc
+ * stringify the field names of md_config_t and get keys in normal form.
+ */
+std::string ConfFile::normalize_key_name(std::string_view key)
+{
+  std::string k{key};
+  // isspace() on a negative char (non-ASCII byte) is undefined: go through
+  // unsigned char, exactly like the conf_line_t constructor does
+  boost::algorithm::trim_fill_if(
+    k, "_", [](unsigned char c) { return std::isspace(c) != 0; });
+  return k;
+}
+
+void ConfFile::check_old_style_section_names(const std::vector<std::string>& prefixes,
+					     std::ostream& os)
+{
+  // Old-style name: starts with a prefix and continues with anything but '.'
+  std::vector<std::string> old_style_section_names;
+  for (auto& [name, section] : *this) {
+    for (auto& prefix : prefixes) {
+      const auto len = prefix.size();  // use the real prefix length, not 3
+      if (name.size() > len && name.compare(0, len, prefix) == 0 && name[len] != '.') {
+	old_style_section_names.push_back(name);
+	break;  // report each section only once
+      }
+    }
+  }
+  if (old_style_section_names.empty()) return;
+  os << "ERROR! old-style section name(s) found: ";
+  std::copy(old_style_section_names.begin(), old_style_section_names.end(),
+            std::experimental::make_ostream_joiner(os, ", "));
+  os << ". Please use the new style section names that include a period.";
+}
+
+// Dump every section as "[name]" followed by its non-empty keys, one per line.
+std::ostream &operator<<(std::ostream &oss, const ConfFile &cf)
+{
+  for (const auto& entry : cf) {
+    oss << "[" << entry.first << "]\n";
+    for (const conf_line_t& line : entry.second) {
+      if (line.key.empty()) continue;
+      oss << "\t" << line.key << " = \"" << line.val << "\"\n";
+    }
+  }
+  return oss;
+}
diff --git a/src/common/ConfUtils.h b/src/common/ConfUtils.h
new file mode 100644
index 000000000..5252b6202
--- /dev/null
+++ b/src/common/ConfUtils.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFUTILS_H
+#define CEPH_CONFUTILS_H
+
+#include <deque>
+#include <map>
+#include <set>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "include/buffer_fwd.h"
+
+/*
+ * Ceph configuration file support.
+ *
+ * This class loads an INI-style configuration from a file or bufferlist, and
+ * holds it in memory. In general, an INI configuration file is composed of
+ * sections, which contain key/value pairs. You can put comments on the end of
+ * lines by using either a hash mark (#) or the semicolon (;).
+ *
+ * You can get information out of ConfFile by calling read() or by examining
+ * individual sections.
+ *
+ * This class could be extended to support modifying configuration files and
+ * writing them back out without too much difficulty. Currently, this is not
+ * implemented, and the file is read-only.
+ */
+struct conf_line_t  {  // one "key = value" line of a section
+  conf_line_t() = default;
+  conf_line_t(const std::string& key, const std::string& val);  // stores a normalized key and a whitespace-trimmed value
+  bool operator<(const conf_line_t& rhs) const;  // compares keys only
+  std::string key;  // key name (normalized by the two-argument constructor)
+  std::string val;  // value (trimmed by the two-argument constructor)
+};
+
+std::ostream &operator<<(std::ostream& oss, const conf_line_t& line);
+
+class conf_section_t : public std::set<conf_line_t> {  // lines of one section; the set keeps one line per key
+public:
+  conf_section_t() = default;
+  conf_section_t(const std::string& heading,
+		 const std::vector<conf_line_t>& lines);  // for lines with equal keys the last one is kept
+  std::string heading;  // name of the section
+  friend std::ostream& operator<<(std::ostream& os, const conf_section_t&);
+};
+
+class ConfFile : public std::map<std::string, conf_section_t, std::less<>> {  // maps section name -> section
+  using base_type = std::map<std::string, conf_section_t, std::less<>>;
+public:
+  ConfFile()
+    : ConfFile{std::vector<conf_section_t>{}}
+  {}
+  ConfFile(const conf_line_t& line)  // a lone line is placed in a section named "global"
+    : ConfFile{{conf_section_t{"global", {line}}}}
+  {}
+  ConfFile(const std::vector<conf_section_t>& sections);  // sections with the same heading are merged
+  int parse_file(const std::string &fname, std::ostream *warnings);  // 0 on success, negative error code otherwise
+  int parse_bufferlist(ceph::bufferlist *bl, std::ostream *warnings);  // 0 or -EINVAL; warnings may be null
+  bool parse_buffer(std::string_view buf, std::ostream* warning);  // warning must not be null
+  int read(std::string_view section, std::string_view key,
+	   std::string &val) const;  // 0 with val set, or -ENOENT
+  static std::string normalize_key_name(std::string_view key);
+  // print warnings to os if any old-style section name is found
+  //
+  // consider a section name as old-style name if it starts with any of the
+  // given prefixes, but does not follow with a "."
+  void check_old_style_section_names(const std::vector<std::string>& prefixes,
+				     std::ostream& os);
+
+};
+
+std::ostream &operator<<(std::ostream& oss, const ConfFile& cf);
+
+#endif
diff --git a/src/common/ContextCompletion.cc b/src/common/ContextCompletion.cc
new file mode 100644
index 000000000..a4f816834
--- /dev/null
+++ b/src/common/ContextCompletion.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "common/ContextCompletion.h"
+
+namespace ceph
+{
+
+// Starts in the "building" state with no operations in flight and m_ret == 0.
+ContextCompletion::ContextCompletion(Context *ctx, bool ignore_enoent)
+  : m_ctx(ctx), m_ignore_enoent(ignore_enoent),
+    m_ret(0), m_building(true), m_current_ops(0)
+{}
+
+// Leave the building state; if no operation is pending any more, complete the
+// wrapped context with m_ret and destroy this object.
+void ContextCompletion::finish_adding_requests() {
+  std::unique_lock locker(m_lock);
+  m_building = false;
+  const bool all_done = (m_current_ops == 0);
+  locker.unlock();  // the context is completed without m_lock held
+  if (!all_done)
+    return;
+  m_ctx->complete(m_ret);
+  delete this;
+}
+
+void ContextCompletion::start_op() {
+  std::lock_guard locker(m_lock);  // the counter is only touched under m_lock
+  m_current_ops += 1;              // one more operation is now in flight
+}
+
+// Account for one finished operation.  The first error seen is kept in m_ret
+// (-ENOENT is skipped when m_ignore_enoent is set); once the last operation
+// finishes after building has ended, the wrapped context is completed.
+void ContextCompletion::finish_op(int r) {
+  std::unique_lock locker(m_lock);
+  const bool ignorable = m_ignore_enoent && r == -ENOENT;
+  if (r < 0 && m_ret == 0 && !ignorable) {
+    m_ret = r;
+  }
+  const bool all_done = (--m_current_ops == 0) && !m_building;
+  locker.unlock();
+  if (!all_done)
+    return;
+  m_ctx->complete(m_ret);
+  delete this;
+}
+
+} // namespace ceph
diff --git a/src/common/ContextCompletion.h b/src/common/ContextCompletion.h
new file mode 100644
index 000000000..86c51b2b8
--- /dev/null
+++ b/src/common/ContextCompletion.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_ASYNC_COMPLETION_H
+#define CEPH_ASYNC_COMPLETION_H
+
+#include "include/Context.h"
+
+namespace ceph {
+
+class ContextCompletion {  // completes a Context once a group of operations has finished
+public:
+  ContextCompletion(Context *ctx, bool ignore_enoent);
+
+  void finish_adding_requests();  // ends the building phase; may complete ctx and delete this
+
+  void start_op();  // registers one more in-flight operation
+  void finish_op(int r);  // records r; may complete ctx and delete this
+
+private:
+  ceph::mutex m_lock = ceph::make_mutex("ContextCompletion::m_lock");
+  Context *m_ctx;  // completed with m_ret when everything is done
+  bool m_ignore_enoent;  // when true, -ENOENT results are not stored in m_ret
+  int m_ret;  // first error recorded by finish_op(), 0 if none
+  bool m_building;  // true until finish_adding_requests() is called
+  uint64_t m_current_ops;  // operations started but not yet finished
+};
+
+/// Context standing for one operation of a ContextCompletion: the operation
+/// is registered with start_op() on construction and its result is passed to
+/// finish_op() from finish().
+class C_ContextCompletion : public Context {
+  ContextCompletion &m_context_completion;
+
+public:
+  C_ContextCompletion(ContextCompletion &context_completion)
+    : m_context_completion(context_completion)
+  {
+    context_completion.start_op();
+  }
+
+  void finish(int r) override { m_context_completion.finish_op(r); }
+};
+
+} // namespace ceph
+
+#endif // CEPH_ASYNC_COMPLETION_H
diff --git a/src/common/Continuation.h b/src/common/Continuation.h
new file mode 100644
index 000000000..731830d19
--- /dev/null
+++ b/src/common/Continuation.h
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/Context.h"
+
+/**
+ * The Continuation interface is designed to help easily create multi-step
+ * operations that share data without having to pass it around or create
+ * custom Context classes for each step. To write a Continuation:
+ * 1) create a child class with a function for each stage.
+ * 2) Put all your shared data members into the class.
+ * 3) In the constructor, register each function stage with set_callback().
+ * 4) Whenever you need to provide a Context callback that activates the next
+ * stage, call get_callback(stage_number). If you need to proceed to another
+ * stage immediately, call immediate(stage, retcode) and return its result.
+ *
+ * To use a class:
+ * 1) Construct the child class on the heap.
+ * 2) Call begin().
+ * 3) The destructor will be called once one of your functions returns true to
+ * indicate it is done.
+ *
+ * Please note that while you can skip stages and get multiple Callback
+ * objects at once, you *cannot* have any stage report that the Continuation
+ * is completed while any other stage Callbacks are outstanding. It's best to
+ * be serial unless you want to maintain your own metadata about which stages
+ * are still pending.
+ *
+ * In fact, there are only two situations in which a stage should return
+ * true while others are running:
+ * 1) A Callback was issued and completed in the same thread,
+ * 2) you called immediate(stage) and it is returning true.
+ */
+
+class Continuation {
+  std::set<int> stages_in_flight;  // stages scheduled via get_callback(), immediate() or begin() that have not finished yet
+  std::set<int> stages_processing;  // stages whose function is executing right now
+  int rval;  // value passed to on_finish by _done()
+  Context *on_finish;  // completed in _done(), then reset to NULL
+  bool reported_done;  // becomes true once any stage function has returned true
+
+  class Callback : public Context {  // Context that runs one stage of the Continuation when finished
+    Continuation *continuation;
+    int stage_to_activate;
+  public:
+    Callback(Continuation *c, int stage) :
+      continuation(c),
+      stage_to_activate(stage) {}
+    void finish(int r) override {
+      continuation->continue_function(r, stage_to_activate);
+    }
+  };
+
+protected:
+  typedef bool (Continuation::*stagePtr)(int r);  // a stage function: takes the return code, returns true when done
+  /**
+   * Continue immediately to the given stage. It will be executed
+   * immediately, in the given thread.
+   * @pre You are in a callback function.
+   * @param stage The stage to execute
+   * @param r The return code that will be provided to the next stage
+   * @return The value returned by the stage function
+   */
+  bool immediate(int stage, int r) {
+    ceph_assert(!stages_in_flight.count(stage));
+    ceph_assert(!stages_processing.count(stage));
+    stages_in_flight.insert(stage);
+    stages_processing.insert(stage);
+    return _continue_function(r, stage);
+  }
+
+  /**
+   * Obtain a Context * that when complete()ed calls back into the given stage.
+   * @pre You are in a callback function.
+   * @param stage The stage this Context should activate
+   */
+  Context *get_callback(int stage) {
+    stages_in_flight.insert(stage);  // the stage counts as in flight from now on
+    return new Callback(this, stage);
+  }
+
+  /**
+   * Set the return code that is passed to the finally-activated Context.
+   * @param new_rval The return code to use.
+   */
+  void set_rval(int new_rval) { rval = new_rval; }
+  int get_rval() { return rval; }
+
+  /**
+   * Register member functions as associated with a given stage. Start
+   * your stage IDs at 0 and make that one the setup phase.
+   * @pre There are no other functions associated with the stage.
+   * @param stage The stage to associate this function with
+   * @param func The function to use
+   */
+  void set_callback(int stage, stagePtr func) {
+    ceph_assert(callbacks.find(stage) == callbacks.end());
+    callbacks[stage] = func;
+  }
+  
+  /**
+   * Called when the Continuation is done, as determined by a stage returning
+   * true and us having finished all the currently-processing ones.
+   */
+   virtual void _done() {
+     on_finish->complete(rval);
+     on_finish = NULL;  // the destructor asserts that on_finish has been reset
+     return;
+   }
+
+private:
+  std::map<int, Continuation::stagePtr> callbacks;  // stage id -> stage function, filled by set_callback()
+
+  bool _continue_function(int r, int n) {  // run stage n with return code r; returns what the stage function returned
+    std::set<int>::iterator in_flight_iter = stages_in_flight.find(n);
+    ceph_assert(in_flight_iter != stages_in_flight.end());  // the stage must have been scheduled
+    ceph_assert(callbacks.count(n));  // and must have a registered function
+    stagePtr p = callbacks[n];
+
+    [[maybe_unused]] auto [processing_iter, inserted] =
+      stages_processing.insert(n);  // not inserted again if immediate() already did
+
+    bool done = (this->*p)(r);
+    if (done)
+      reported_done = true;
+
+    stages_processing.erase(processing_iter);  // the stage is neither processing
+    stages_in_flight.erase(in_flight_iter);  // nor in flight any more
+    return done;
+  }
+
+  void continue_function(int r, int stage) {  // entry point used by Callback and begin()
+    bool done = _continue_function(r, stage);
+
+    assert (!done ||
+            stages_in_flight.size() == stages_processing.size());
+
+    if ((done || reported_done) && stages_processing.empty()) {  // done and nothing is executing
+      _done();
+      delete this;
+    }
+  }
+
+
+
+public:
+  /**
+   * Construct a new Continuation object. Call this from your child class,
+   * obviously.
+   *
+   * @param c The Context which should be complete()ed when this Continuation
+   * is done.
+   */
+  Continuation(Context *c) :
+    rval(0), on_finish(c), reported_done(false) {}
+  /**
+   * Clean up.
+   */
+  virtual ~Continuation() { ceph_assert(on_finish == NULL); }
+  /**
+   * Begin running the Continuation.
+   */
+  void begin() { stages_in_flight.insert(0); continue_function(0, 0); }  // stage 0 is the entry stage
+};
diff --git a/src/common/Cycles.cc b/src/common/Cycles.cc
new file mode 100644
index 000000000..2ebd24699
--- /dev/null
+++ b/src/common/Cycles.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+/* Copyright (c) 2011-2014 Stanford University
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "debug.h"
+#include "Cycles.h"
+
+double Cycles::cycles_per_sec = 0;
+
+/**
+ * Perform once-only overall initialization for the Cycles class, such
+ * as calibrating the clock frequency.  This method must be called
+ * before using the Cycles module.
+ *
+ * It is not initialized by default because the timing loops cause
+ * general process startup times to balloon
+ * (http://tracker.ceph.com/issues/15225).
+ */
+void Cycles::init()
+{
+  if (cycles_per_sec != 0) // already calibrated
+    return;
+
+  // Skip initialization if rdtsc is not implemented
+  if (rdtsc() == 0)
+    return;
+
+  // Compute the frequency of the fine-grained CPU timer: to do this,
+  // take parallel time readings using both rdtsc and gettimeofday.
+  // After 10ms have elapsed, take the ratio between these readings.
+
+  struct timeval start_time, stop_time;
+  uint64_t micros;
+  double old_cycles;
+
+  // There is one tricky aspect, which is that we could get interrupted
+  // between calling gettimeofday and reading the cycle counter, in which
+  // case we won't have corresponding readings.  To handle this (unlikely)
+  // case, compute the overall result repeatedly, and wait until we get
+  // two successive calculations that are within 0.1% of each other.
+  old_cycles = 0;
+  while (1) {
+    if (gettimeofday(&start_time, NULL) != 0) {
+      ceph_abort_msg("couldn't read clock");
+    }
+    uint64_t start_cycles = rdtsc();
+    while (1) { // spin until more than 10000us have passed since start_time
+      if (gettimeofday(&stop_time, NULL) != 0) {
+        ceph_abort_msg("couldn't read clock");
+      }
+      uint64_t stop_cycles = rdtsc();
+      micros = (stop_time.tv_usec - start_time.tv_usec) +
+          (stop_time.tv_sec - start_time.tv_sec)*1000000; // NOTE(review): stored unsigned; assumes the clock did not step backwards
+      if (micros > 10000) {
+        cycles_per_sec = static_cast<double>(stop_cycles - start_cycles);
+        cycles_per_sec = 1000000.0*cycles_per_sec/ static_cast<double>(micros); // cycles per microsecond, scaled to one second
+        break;
+      }
+    }
+    double delta = cycles_per_sec/1000.0; // 0.1% of the new estimate
+    if ((old_cycles > (cycles_per_sec - delta)) &&
+        (old_cycles < (cycles_per_sec + delta))) {
+      return; // two successive estimates agree: keep cycles_per_sec
+    }
+    old_cycles = cycles_per_sec;
+  }
+}
+
+/**
+ * Return the calibrated number of CPU cycles per second (0 before calibration).
+ */
+double Cycles::per_second()
+{
+  return get_cycles_per_sec();
+}
+
+/**
+ * Given an elapsed time measured in cycles, return a floating-point number
+ * giving the corresponding time in seconds.
+ * \param cycles
+ *      Difference between the results of two calls to rdtsc.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The time in seconds corresponding to cycles.
+ */
+double Cycles::to_seconds(uint64_t cycles, double cycles_per_sec)
+{
+  // A frequency of 0 selects the locally calibrated counter frequency.
+  const double hz = (cycles_per_sec == 0) ? get_cycles_per_sec() : cycles_per_sec;
+  return static_cast<double>(cycles)/hz;
+}
+
+/**
+ * Given a time in seconds, return the number of cycles that it
+ * corresponds to.
+ * \param seconds
+ *      Time in seconds.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The approximate number of cycles corresponding to #seconds.
+ */
+uint64_t Cycles::from_seconds(double seconds, double cycles_per_sec)
+{
+  // A frequency of 0 selects the locally calibrated counter frequency.
+  const double hz = (cycles_per_sec == 0) ? get_cycles_per_sec() : cycles_per_sec;
+  return static_cast<uint64_t>(seconds*hz + 0.5); // add 0.5, then truncate
+}
+
+/**
+ * Given an elapsed time measured in cycles, return an integer
+ * giving the corresponding time in microseconds. Note: to_seconds()
+ * is faster than this method.
+ * \param cycles
+ *      Difference between the results of two calls to rdtsc.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The time in microseconds: to_nanoseconds() divided by 1000, truncated.
+ */
+uint64_t Cycles::to_microseconds(uint64_t cycles, double cycles_per_sec)
+{
+  return to_nanoseconds(cycles, cycles_per_sec) / 1000;
+}
+
+/**
+ * Given an elapsed time measured in cycles, return an integer
+ * giving the corresponding time in nanoseconds. Note: to_seconds()
+ * is faster than this method.
+ * \param cycles
+ *      Difference between the results of two calls to rdtsc.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The time in nanoseconds corresponding to cycles (rounded).
+ */
+uint64_t Cycles::to_nanoseconds(uint64_t cycles, double cycles_per_sec)
+{
+  // A frequency of 0 selects the locally calibrated counter frequency.
+  const double hz = (cycles_per_sec == 0) ? get_cycles_per_sec() : cycles_per_sec;
+  return static_cast<uint64_t>(1e09*static_cast<double>(cycles)/hz + 0.5);
+}
+
+/**
+ * Given a number of nanoseconds, return an approximate number of
+ * cycles for an equivalent time length.
+ * \param ns
+ *      Number of nanoseconds.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The approximate number of cycles for the same time length.
+ */
+uint64_t
+Cycles::from_nanoseconds(uint64_t ns, double cycles_per_sec)
+{
+  // A frequency of 0 selects the locally calibrated counter frequency.
+  const double hz = (cycles_per_sec == 0) ? get_cycles_per_sec() : cycles_per_sec;
+  return static_cast<uint64_t>(static_cast<double>(ns)*hz/1e09 + 0.5);
+}
+
+/**
+ * Busy wait for a given number of microseconds.
+ * Callers should use this method in most reasonable cases as opposed to
+ * usleep for accurate measurements. Calling usleep may put the processor
+ * in a low power mode/sleep state which reduces the clock frequency.
+ * So, each time the process/thread wakes up from usleep, it takes some time
+ * to ramp up to maximum frequency. Thus measurements often incur higher
+ * latencies.
+ * \param us
+ *      Number of microseconds.
+ */
+void
+Cycles::sleep(uint64_t us)
+{
+  const uint64_t deadline = Cycles::rdtsc() + Cycles::from_nanoseconds(1000*us);
+  while (Cycles::rdtsc() < deadline) {}
+}
diff --git a/src/common/Cycles.h b/src/common/Cycles.h
new file mode 100644
index 000000000..b546479c2
--- /dev/null
+++ b/src/common/Cycles.h
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+/* Copyright (c) 2011-2014 Stanford University
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#ifndef CEPH_CYCLES_H
+#define CEPH_CYCLES_H
+
+#include <cstdint>
+
+/**
+ * This class provides static methods that read the fine-grain CPU
+ * cycle counter and translate between cycle-level times and absolute
+ * times.
+ */
+class Cycles {
+ public:
+  static void init();
+
+  /**
+   * Return the current value of the fine-grain CPU cycle counter
+   * (accessed via the RDTSC instruction on x86; other architectures
+   * use the counter instruction selected by the #if chain below).
+   */
+  static __inline __attribute__((always_inline)) uint64_t rdtsc() {
+#if defined(__i386__)
+    int64_t ret;
+    __asm__ volatile ("rdtsc" : "=A" (ret) );
+    return ret;
+#elif defined(__x86_64__) || defined(__amd64__)
+    uint32_t lo, hi;
+    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
+    return (((uint64_t)hi << 32) | lo); // combine the two 32-bit halves
+#elif defined(__aarch64__)
+    //
+    // arch/arm64/include/asm/arch_timer.h
+    //
+    // static inline u64 arch_counter_get_cntvct(void)
+    // {
+    //         u64 cval;
+    //
+    //         isb();
+    //         asm volatile("mrs %0, cntvct_el0" : "=r" (cval));
+    //
+    //         return cval;
+    // }
+    //
+    // https://github.com/cloudius-systems/osv/blob/master/arch/aarch64/arm-clock.cc
+    uint64_t cntvct;
+    asm volatile ("isb; mrs %0, cntvct_el0; isb; " : "=r" (cntvct) :: "memory");
+    return cntvct;
+#elif defined(__powerpc__) || defined (__powerpc64__)
+    // Based on:
+    // https://github.com/randombit/botan/blob/net.randombit.botan/src/lib/entropy/hres_timer/hres_timer.cpp
+    uint32_t lo = 0, hi = 0;
+    asm volatile("mftbu %0; mftb %1" : "=r" (hi), "=r" (lo));
+    return (((uint64_t)hi << 32) | lo); // combine the two 32-bit halves
+#elif defined(__s390__)
+    uint64_t tsc;
+    asm volatile("stck %0" : "=Q" (tsc) : : "cc");
+    return tsc;
+#else
+#warning No high-precision counter available for your OS/arch
+    return 0; // Cycles::init() treats a 0 reading as "counter not implemented"
+#endif
+  }
+
+  static double per_second();
+  static double to_seconds(uint64_t cycles, double cycles_per_sec = 0);
+  static uint64_t from_seconds(double seconds, double cycles_per_sec = 0);
+  static uint64_t to_microseconds(uint64_t cycles, double cycles_per_sec = 0);
+  static uint64_t to_nanoseconds(uint64_t cycles, double cycles_per_sec = 0);
+  static uint64_t from_nanoseconds(uint64_t ns, double cycles_per_sec = 0);
+  static void sleep(uint64_t us);
+
+private:
+  Cycles(); // private: the class is only used through its static methods
+
+  /// Conversion factor between cycles and the seconds; computed by
+  /// Cycles::init.
+  static double cycles_per_sec;
+
+  /**
+   * Returns the conversion factor between cycles and seconds. The body shown
+   * here simply returns cycles_per_sec (no mock value is consulted).
+   */
+  static __inline __attribute__((always_inline)) double get_cycles_per_sec() {
+    return cycles_per_sec;
+  }
+};
+
+#endif  // CEPH_CYCLES_H
diff --git a/src/common/DecayCounter.cc b/src/common/DecayCounter.cc
new file mode 100644
index 000000000..4e9e68cc1
--- /dev/null
+++ b/src/common/DecayCounter.cc
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "DecayCounter.h"
+#include "Formatter.h"
+
+#include "include/encoding.h"
+
+void DecayCounter::encode(ceph::buffer::list& bl) const
+{
+  decay(); // bring val up to date first (val and last_decay are mutable)
+  ENCODE_START(5, 4, bl); // NOTE(review): presumably struct version 5, compat 4 - confirm against include/encoding.h
+  encode(val, bl); // the value is the only field written
+  ENCODE_FINISH(bl);
+}
+
+void DecayCounter::decode(ceph::buffer::list::const_iterator &p)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, p);
+  if (struct_v < 2) {
+    double k = 0.0; // field only present before v2: decoded and discarded
+    decode(k, p);
+  }
+  if (struct_v < 3) {
+    double k = 0.0; // field only present before v3: decoded and discarded
+    decode(k, p);
+  }
+  decode(val, p);
+  if (struct_v < 5) {
+    double delta, _; // pre-v5 encodings carry two more doubles after val
+    decode(delta, p);
+    val += delta; // fold the old delta into the value
+    decode(_, p); /* velocity */
+  }
+  last_decay = clock::now(); // decay restarts from the moment of decoding
+  DECODE_FINISH(p);
+}
+
+void DecayCounter::dump(ceph::Formatter *f) const
+{
+  decay(); // report the value as of now
+  f->dump_float("value", val);
+  f->dump_float("halflife", rate.get_halflife()); // log(.5)/k; note k is 0 for a default DecayRate
+}
+
+void DecayCounter::generate_test_instances(std::list<DecayCounter*>& ls)
+{
+  // First instance carries a non-zero value; the second keeps the defaults.
+  DecayCounter *with_value = new DecayCounter();
+  with_value->val = 3.0;
+  ls.push_back(with_value);
+  ls.push_back(new DecayCounter());
+}
+
+void DecayCounter::decay(double delta) const
+{
+  const auto now = clock::now();
+  const std::chrono::duration<double> elapsed(now - last_decay);
+
+  // Scale the stored value by exp(k * elapsed seconds), where rate.k is
+  // ln(.5)/half_life, then add the caller's delta.
+  const double decayed = val * exp(elapsed.count() * rate.k) + delta;
+
+  // Results below .01 (including negative ones) collapse to exactly zero.
+  val = (decayed < .01) ? 0.0 : decayed;
+
+  last_decay = now;
+}
diff --git a/src/common/DecayCounter.h b/src/common/DecayCounter.h
new file mode 100644
index 000000000..9455ecc5a
--- /dev/null
+++ b/src/common/DecayCounter.h
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_DECAYCOUNTER_H
+#define CEPH_DECAYCOUNTER_H
+
+#include "include/buffer.h"
+#include "common/Formatter.h"
+#include "common/StackStringStream.h"
+#include "common/ceph_time.h"
+
+#include <cmath>
+#include <list>
+#include <sstream>
+
+/**
+ *
+ * TODO: normalize value based on some function of half_life, 
+ *  so that it can be interpreted as an approximation of a
+ *  moving average of N seconds.  currently, changing half-life
+ *  skews the scale of the value, even at steady state.  
+ *
+ */
+
+class DecayRate { // decay constant k = ln(.5)/half_life, consumed by DecayCounter
+public:
+  friend class DecayCounter;
+
+  DecayRate() = default;    // k stays 0, so exp(el * k) == 1: no decay
+  // cppcheck-suppress noExplicitConstructor
+  DecayRate(double hl) { set_halflife(hl); }
+  DecayRate(const DecayRate &dr) = default; // memberwise copy of k
+
+  void set_halflife(double hl) {
+    k = log(.5) / hl;
+  }
+  double get_halflife() const {
+    return log(.5) / k;     // not finite while k == 0 (default-constructed)
+  }
+
+private:
+  double k = 0;             // k = ln(.5)/half_life
+};
+
+class DecayCounter { // a value that decays exponentially over time at a DecayRate
+public:
+  using time = ceph::coarse_mono_time;
+  using clock = ceph::coarse_mono_clock;
+
+  DecayCounter() : DecayCounter(DecayRate()) {} // a default DecayRate has k == 0
+  explicit DecayCounter(const DecayRate &rate) : last_decay(clock::now()), rate(rate) {}
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& p);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<DecayCounter*>& ls);
+
+  /**
+   * reading
+   */
+
+  double get() const { // decays up to now, then returns the value
+    decay();
+    return val;
+  }
+
+  double get_last() const { // value as of the last decay; no new decay is applied
+    return val;
+  }
+
+  time get_last_decay() const { // time at which val was last brought up to date
+    return last_decay;
+  }
+
+  /**
+   * adjusting
+   */
+
+  double hit(double v = 1.0) { // decay, add v, return the new value
+    decay(v);
+    return val;
+  }
+  void adjust(double v = 1.0) { // same as hit() but returns nothing
+    decay(v);
+  }
+
+  void scale(double f) { // multiplies the stored value as-is; no decay is applied first
+    val *= f;
+  }
+
+  /**
+   * decay etc.
+   */
+
+  void reset() { // zero the value and restart the decay clock
+    last_decay = clock::now();
+    val = 0;
+  }
+
+protected:
+  void decay(double delta) const; // const: it only updates the mutable members below
+  void decay() const {decay(0.0);}
+
+private:
+  mutable double val = 0.0;           // value
+  mutable time last_decay = clock::zero();   // time of last decay
+  DecayRate rate;
+};
+
+inline void encode(const DecayCounter &c, ceph::buffer::list &bl) {
+  c.encode(bl); // free-function form that forwards to the member
+}
+inline void decode(DecayCounter &c, ceph::buffer::list::const_iterator &p) {
+  c.decode(p); // free-function form that forwards to the member
+}
+
+inline std::ostream& operator<<(std::ostream& out, const DecayCounter& d) {
+  // Format into a scratch stream so out's own flags and precision are untouched.
+  CachedStackStringStream scratch;
+  scratch->precision(2);
+  *scratch << "[C " << std::scientific << d.get() << "]";
+  return out << scratch->strv();
+}
+
+#endif
diff --git a/src/common/EventTrace.cc b/src/common/EventTrace.cc
new file mode 100644
index 000000000..04c61e938
--- /dev/null
+++ b/src/common/EventTrace.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Anjaneya Chagam <anjaneya.chagam@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/EventTrace.h"
+#include "common/TracepointProvider.h"
+#include "messages/MOSDOpReply.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/eventtrace.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+TracepointProvider::Traits event_tracepoint_traits("libeventtrace_tp.so", "event_tracing");
+bool EventTrace::tpinit = false;
+
+void EventTrace::init_tp(CephContext *_ctx)
+{
+  if (unlikely(!_ctx))
+    return;
+  // One-shot: tpinit is a plain static bool set after the first initialize().
+  if (tpinit)
+    return;
+  TracepointProvider::initialize<event_tracepoint_traits>(_ctx);
+  tpinit = true;
+}
+
+void EventTrace::set_message_attrs(const Message *m, std::string& oid, std::string& context, bool incl_oid)
+{
+  // Only OSD op / op-reply messages are handled; otherwise oid and context are left untouched.
+  if (m && (m->get_type() == CEPH_MSG_OSD_OP || m->get_type() == CEPH_MSG_OSD_OPREPLY)) {
+    if (incl_oid) {
+      if (m->get_type() == CEPH_MSG_OSD_OP)
+        oid = ((MOSDOp *)m)->get_oid().name;
+      else
+        oid = ((MOSDOpReply *)m)->get_oid().name;
+    }
+    // context = source!source_addr!tid!seq!type
+    std::ostringstream buf;
+    buf << m->get_source() << "!" << m->get_source_addr() << "!"
+        << m->get_tid() << "!" << m->get_seq() << "!" << m->get_type();
+    context = buf.str();
+  }
+}
+
+EventTrace::EventTrace(CephContext *_ctx, const char *_file, const char *_func, int _line) :
+  ctx(_ctx),
+  file(_file),
+  func(_func),
+  line(_line)
+{
+  if (unlikely(!ctx)) // no context: nothing is logged or traced, and last_ts is not set here
+    return;
+  last_ts = ceph_clock_now(); // baseline for log_event_latency()
+  init_tp(ctx);
+  // Record function entry in the eventtrace debug log and as a tracepoint.
+  lsubdout(ctx, eventtrace, LOG_LEVEL) << "ENTRY (" <<  func << ") " << file << ":" << line << dendl;
+  tracepoint(eventtrace, func_enter, file.c_str(), func.c_str(), line);
+}
+
+EventTrace::~EventTrace()
+{
+  if (unlikely(!ctx)) // constructed without a context: stay silent
+    return;
+  lsubdout(ctx, eventtrace, LOG_LEVEL) << "EXIT (" << func << ") " << file << dendl; // counterpart of the constructor's ENTRY record
+  tracepoint(eventtrace, func_exit, file.c_str(), func.c_str());
+}
+
+void EventTrace::log_event_latency(const char *event)
+{ // Report the time since the previous call (or since construction) as an elapsed event.
+  utime_t now = ceph_clock_now();
+  double usecs = (now.to_nsec()-last_ts.to_nsec())/1000.0; // floating-point divide keeps the sub-microsecond part
+  OID_ELAPSED("", usecs, event); // expands to nothing unless WITH_EVENTTRACE is defined
+  last_ts = now; // the next call measures from here
+}
+
+void EventTrace::trace_oid_event(const char *oid, const char *event, const char *context,
+  const char *file, const char *func, int line)
+{
+  if (unlikely(!g_ceph_context)) // no global context: drop the event
+    return;
+  init_tp(g_ceph_context); // one-time tracepoint provider setup
+  tracepoint(eventtrace, oid_event, oid, event, context, file, func, line); // empty macro when WITH_LTTNG is not defined
+}
+
+void EventTrace::trace_oid_event(const Message *m, const char *event, const char *file,
+  const char *func, int line, bool incl_oid)
+{
+  std::string oid, context; // both stay empty unless m is an OSD op / op reply
+  set_message_attrs(m, oid, context, incl_oid);
+  trace_oid_event(oid.c_str(), event, context.c_str(), file, func, line);
+}
+
+void EventTrace::trace_oid_elapsed(const char *oid, const char *event, const char *context,
+  double elapsed, const char *file, const char *func, int line)
+{
+  if (unlikely(!g_ceph_context)) // no global context: drop the event
+    return;
+  init_tp(g_ceph_context); // one-time tracepoint provider setup
+  tracepoint(eventtrace, oid_elapsed, oid, event, context, elapsed, file, func, line); // empty macro when WITH_LTTNG is not defined
+}
+
+void EventTrace::trace_oid_elapsed(const Message *m, const char *event, double elapsed,
+  const char *file, const char *func, int line, bool incl_oid)
+{
+  std::string oid, context; // both stay empty unless m is an OSD op / op reply
+  set_message_attrs(m, oid, context, incl_oid);
+  trace_oid_elapsed(oid.c_str(), event, context.c_str(), elapsed, file, func, line);
+}
diff --git a/src/common/EventTrace.h b/src/common/EventTrace.h
new file mode 100644
index 000000000..426a8d763
--- /dev/null
+++ b/src/common/EventTrace.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Anjaneya Chagam <anjaneya.chagam@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef _EventTrace_h_
+#define _EventTrace_h_
+
+#include "msg/Message.h"
+
+#if defined(WITH_EVENTTRACE)
+// Tracing enabled: each macro forwards to EventTrace and records the call site.
+#define OID_EVENT_TRACE(oid, event) \
+  EventTrace::trace_oid_event(oid, event, "", __FILE__, __func__, __LINE__)
+#define OID_EVENT_TRACE_WITH_MSG(msg, event, incl_oid) \
+  EventTrace::trace_oid_event(msg, event, __FILE__, __func__, __LINE__, incl_oid)
+#define OID_ELAPSED(oid, elapsed, event) \
+  EventTrace::trace_oid_elapsed(oid, event, "", elapsed, __FILE__, __func__, __LINE__)
+#define OID_ELAPSED_WITH_MSG(m, elapsed, event, incl_oid) \
+  EventTrace::trace_oid_elapsed(m, event, elapsed, __FILE__, __func__, __LINE__, incl_oid)
+#define FUNCTRACE(cct) EventTrace _t1(cct, __FILE__, __func__, __LINE__)
+#define OID_ELAPSED_FUNC_EVENT(event) _t1.log_event_latency(event)
+// (OID_ELAPSED_FUNC_EVENT uses the _t1 object that FUNCTRACE declares.)
+#else
+// Tracing disabled: every macro expands to nothing.
+#define OID_EVENT_TRACE(oid, event)
+#define OID_EVENT_TRACE_WITH_MSG(msg, event, incl_oid)
+#define OID_ELAPSED(oid, elapsed, event)
+#define OID_ELAPSED_WITH_MSG(m, elapsed, event, incl_oid)
+#define FUNCTRACE(cct)
+#define OID_ELAPSED_FUNC_EVENT(event)
+
+#endif
+// Debug level passed to lsubdout() for EventTrace's ENTRY/EXIT log lines.
+#define LOG_LEVEL 30
+
+class EventTrace { // logs function entry on construction and exit on destruction
+private:
+  CephContext *ctx;   // when null, the constructor and destructor do nothing
+  std::string file;   // call site captured by the constructor
+  std::string func;
+  int line;
+  utime_t last_ts;    // start of the interval reported by log_event_latency()
+
+  static bool tpinit; // set once init_tp() has initialized the tracepoint provider
+
+  static void init_tp(CephContext *_ctx);
+  static void set_message_attrs(const Message *m, std::string& oid, std::string& context, bool incl_oid);
+
+public:
+  // Per-function tracing; normally created through the FUNCTRACE macro.
+  EventTrace(CephContext *_ctx, const char *_file, const char *_func, int line);
+  ~EventTrace();
+  void log_event_latency(const char *tag);
+  // Emit an oid_event tracepoint; the Message overload derives oid/context from m.
+  static void trace_oid_event(const char *oid, const char *event, const char *context,
+    const char *file, const char *func, int line);
+  static void trace_oid_event(const Message *m, const char *event, const char *file,
+    const char *func, int line, bool incl_oid);
+  // Emit an oid_elapsed tracepoint carrying an elapsed value.
+  static void trace_oid_elapsed(const char *oid, const char *event, const char *context,
+    double elapsed, const char *file, const char *func, int line);
+  static void trace_oid_elapsed(const Message *m, const char *event, double elapsed,
+    const char *file, const char *func, int line, bool incl_oid);
+
+};
+#endif
diff --git a/src/common/FastCDC.cc b/src/common/FastCDC.cc
new file mode 100644
index 000000000..941fc873c
--- /dev/null
+++ b/src/common/FastCDC.cc
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <random>
+
+#include "FastCDC.h"
+
+
+// Unlike FastCDC described in the paper, if we are close to the
+// target, use the target mask.  If we are very small or very large,
+// use an adjusted mask--like the paper.  This tries to keep more
+// cut points using the same mask, and fewer using the small or large
+// masks.
+
+// How many more/fewer bits to set in the small/large masks.
+//
+// This is the "normalization level" or "NC level" in the FastCDC
+// paper.
+#define TARGET_WINDOW_MASK_BITS  2
+
+// How big the 'target window' is (in which we use the target mask).
+//
+// In the FastCDC paper, this is always 0: there is no "target
+// window," and either small_mask (maskS) or large_mask (maskL) is
+// used--never target_mask (maskA).
+#define TARGET_WINDOW_BITS       1
+
+// How many bits larger/smaller than target for hard limits on chunk
+// size.
+//
+// We assume the min and max sizes are always this many bits
+// larger/smaller than the target.  (Note that the FastCDC paper 8KB
+// example has a min of 2KB (2 bits smaller) and max of 64 KB (3 bits
+// larger), although it is not clear why they chose those values.)
+#define SIZE_WINDOW_BITS         2
+
+void FastCDC::_setup(int target, int size_window_bits)
+{
+  target_bits = target;
+
+  if (!size_window_bits) {
+    size_window_bits = SIZE_WINDOW_BITS; // 0 selects the default window
+  }
+  min_bits = target - size_window_bits;
+  max_bits = target + size_window_bits;
+
+  std::mt19937_64 engine; // default-constructed, so the same sequence is generated on every call
+
+  // prefill table
+  for (unsigned i = 0; i < 256; ++i) {
+    table[i] = engine();
+  }
+
+  // set masks: add distinct random bits to m one at a time, snapshotting m at three bit counts
+  int did = 0;
+  uint64_t m = 0;
+  while (did < target_bits + TARGET_WINDOW_MASK_BITS) {
+    uint64_t bit = 1ull << (engine() & 63);
+    if (m & bit) {
+      continue;	// this bit is already set
+    }
+    m |= bit;
+    ++did;
+    if (did == target_bits - TARGET_WINDOW_MASK_BITS) {
+      large_mask = m; // fewest bits set
+    } else if (did == target_bits) {
+      target_mask = m; // exactly target_bits bits set
+    } else if (did == target_bits + TARGET_WINDOW_MASK_BITS) {
+      small_mask = m; // most bits set (a superset of the other two masks)
+    }
+  }
+}
+
+static inline bool _scan( // returns false if a cut point was hit before max, true otherwise
+  // these are our cursor/position...
+  bufferlist::buffers_t::const_iterator *p,
+  const char **pp, const char **pe,
+  size_t& pos,
+  size_t max,   // absolute position at which to stop scanning
+  uint64_t& fp, // fingerprint
+  uint64_t mask, const uint64_t *table)
+{
+  while (pos < max) {
+    if (*pp == *pe) { // current buffer exhausted: move on to the next one
+      ++(*p);
+      *pp = (*p)->c_str();
+      *pe = *pp + (*p)->length();
+    }
+    const char *te = std::min(*pe, *pp + max - pos); // stop at the buffer end or at max, whichever comes first
+    for (; *pp < te; ++(*pp), ++pos) {
+      if ((fp & mask) == mask) { // every mask bit is set in fp: cut here, before consuming this byte
+	return false;
+      }
+      fp = (fp << 1) ^ table[*(unsigned char*)*pp]; // roll the fingerprint forward by one byte
+    }
+    if (pos >= max) {
+      return true;
+    }
+  }
+  return true;
+}
+
+void FastCDC::calc_chunks(
+  const bufferlist& bl,
+  std::vector<std::pair<uint64_t, uint64_t>> *chunks) const
+{
+  if (bl.length() == 0) {
+    return;
+  }
+  auto p = bl.buffers().begin();
+  const char *pp = p->c_str();
+  const char *pe = pp + p->length();
+  // Each pass of the loop below appends one (offset, length) pair to *chunks.
+  size_t pos = 0;
+  size_t len = bl.length();
+  while (pos < len) {
+    size_t cstart = pos;
+    uint64_t fp = 0;
+    // All "1 << bits" sizes below are computed in size_t so large bit counts cannot overflow an int.
+    // are we left with a min-sized (or smaller) chunk?
+    if (len - pos <= (size_t{1} << min_bits)) {
+      chunks->push_back(std::pair<uint64_t,uint64_t>(pos, len - pos));
+      break;
+    }
+
+    // skip forward to the min chunk size cut point (minus the window, so
+    // we can initialize the rolling fingerprint).
+    size_t skip = (size_t{1} << min_bits) - window;
+    pos += skip;
+    while (skip) {
+      size_t s = std::min<size_t>(pe - pp, skip);
+      skip -= s;
+      pp += s;
+      if (pp == pe) {
+	++p;
+	pp = p->c_str();
+	pe = pp + p->length();
+      }
+    }
+
+    // first fill the window
+    size_t max = pos + window;
+    while (pos < max) {
+      if (pp == pe) {
+	++p;
+	pp = p->c_str();
+	pe = pp + p->length();
+      }
+      const char *te = std::min(pe, pp + (max - pos));
+      for (; pp < te; ++pp, ++pos) {
+	fp = (fp << 1) ^ table[*(unsigned char*)pp];
+      }
+    }
+    ceph_assert(pos < len);
+
+    // find an end marker; the && chain stops at the first _scan that finds a cut
+    if (
+      // for the first "small" region
+      _scan(&p, &pp, &pe, pos,
+	    std::min(len, cstart + (size_t{1} << (target_bits - TARGET_WINDOW_BITS))),
+	    fp, small_mask, table) &&
+      // for the middle range (close to our target)
+      (TARGET_WINDOW_BITS == 0 ||
+       _scan(&p, &pp, &pe, pos,
+	     std::min(len, cstart + (size_t{1} << (target_bits + TARGET_WINDOW_BITS))),
+	     fp, target_mask, table)) &&
+      // we're past target, use large_mask!
+      _scan(&p, &pp, &pe, pos,
+	    std::min(len,
+		     cstart + (size_t{1} << max_bits)),
+	    fp, large_mask, table))
+      ;
+
+    chunks->push_back(std::pair<uint64_t,uint64_t>(cstart, pos - cstart));
+  }
+}
diff --git a/src/common/FastCDC.h b/src/common/FastCDC.h
new file mode 100644
index 000000000..b9156f551
--- /dev/null
+++ b/src/common/FastCDC.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "CDC.h"
+
+// Based on this paper:
+//   https://www.usenix.org/system/files/conference/atc16/atc16-paper-xia.pdf
+//
+// Changes:
+//   - window size fixed at 64 bytes (to match our word size)
+//   - use XOR instead of +
+//   - match mask instead of 0
+//   - use target mask when close to target size (instead of
+//     small/large mask).  The idea here is to try to use a consistent (target)
+//     mask for most cut points if we can, and only resort to small/large mask
+//     when we are (very) small or (very) large.
+
+// Note about the target_bits: The goal is an average chunk size of 1
+// << target_bits.  However, in reality the average is ~1.25x that
+// because of the hard minimum chunk size.
+
+class FastCDC : public CDC {
+private:
+  int target_bits;  ///< target chunk size bits (1 << target_bits)
+  int min_bits;     ///< hard minimum chunk size bits (1 << min_bits)
+  int max_bits;     ///< hard maximum chunk size bits (1 << max_bits)
+  // Masks start at 0 so none is ever read indeterminate if _setup() skips one.
+  uint64_t target_mask = 0;  ///< maskA in the paper (target_bits set)
+  uint64_t small_mask = 0;   ///< maskS in the paper (more bits set)
+  uint64_t large_mask = 0;   ///< maskL in the paper (fewer bits set)
+
+  /// lookup table with pseudorandom values for each byte
+  uint64_t table[256];
+
+  /// window size in bytes (a class-wide constant, so instances stay assignable)
+  static constexpr size_t window = sizeof(uint64_t)*8; // bits in uint64_t
+
+  void _setup(int target, int window_bits);
+
+public:
+  FastCDC(int target = 18, int window_bits = 0) {
+    _setup(target, window_bits);
+  }
+
+  void set_target_bits(int target, int window_bits) override {
+    _setup(target, window_bits);
+  }
+
+  void calc_chunks(
+    const bufferlist& bl,
+    std::vector<std::pair<uint64_t, uint64_t>> *chunks) const override;
+};
diff --git a/src/common/Finisher.cc b/src/common/Finisher.cc
new file mode 100644
index 000000000..974241f57
--- /dev/null
+++ b/src/common/Finisher.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Finisher.h"
+
+#define dout_subsys ceph_subsys_finisher
+#undef dout_prefix
+#define dout_prefix *_dout << "finisher(" << this << ") "
+
+/// Launch the worker thread under the configured thread name.
+void Finisher::start()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  const char *worker_name = thread_name.c_str();
+  finisher_thread.create(worker_name);
+}
+
+/// Ask the worker thread to exit and block until it has done so.
+void Finisher::stop()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  {
+    std::unique_lock ul(finisher_lock);
+    finisher_stop = true;
+    // There is no new work, but the worker still has to wake up in order to
+    // notice the stop request.
+    finisher_cond.notify_all();
+  }
+  // The lock is released before joining so the worker can make progress.
+  finisher_thread.join();
+  ldout(cct, 10) << __func__ << " finish" << dendl;
+}
+
+/// Block until the queue is empty and no batch is being executed.
+void Finisher::wait_for_empty()
+{
+  std::unique_lock ul(finisher_lock);
+  for (;;) {
+    const bool idle = finisher_queue.empty() && !finisher_running;
+    if (idle) {
+      break;
+    }
+    ldout(cct, 10) << "wait_for_empty waiting" << dendl;
+    // Tell the worker that somebody wants to be woken once it runs dry.
+    finisher_empty_wait = true;
+    finisher_empty_cond.wait(ul);
+  }
+  ldout(cct, 10) << "wait_for_empty empty" << dendl;
+  finisher_empty_wait = false;
+}
+
+/// Worker thread body: drain finisher_queue in batches and complete every
+/// queued context until finisher_stop is set.
+///
+/// finisher_lock is held throughout, except while a batch of contexts is
+/// being completed.  Always returns 0.
+void *Finisher::finisher_thread_entry()
+{
+  std::unique_lock ul(finisher_lock);
+  ldout(cct, 10) << "finisher_thread start" << dendl;
+
+  // Only used for the perf counters, i.e. when a logger is present.
+  utime_t start;
+  uint64_t count = 0;
+  while (!finisher_stop) {
+    /// Every time we are woken up, we process the queue until it is empty.
+    while (!finisher_queue.empty()) {
+      // To reduce lock contention, we swap out the queue to process.
+      // This way other threads can submit new contexts to complete
+      // while we are working.
+      in_progress_queue.swap(finisher_queue);
+      // Set while still holding the lock so wait_for_empty() does not
+      // mistake the (now empty) finisher_queue for an idle finisher.
+      finisher_running = true;
+      ul.unlock();
+      ldout(cct, 10) << "finisher_thread doing " << in_progress_queue << dendl;
+
+      if (logger) {
+	start = ceph_clock_now();
+	count = in_progress_queue.size();
+      }
+
+      // Now actually process the contexts.
+      for (auto p : in_progress_queue) {
+	p.first->complete(p.second);
+      }
+      ldout(cct, 10) << "finisher_thread done with " << in_progress_queue
+                     << dendl;
+      in_progress_queue.clear();
+      if (logger) {
+	logger->dec(l_finisher_queue_len, count);
+	logger->tinc(l_finisher_complete_lat, ceph_clock_now() - start);
+      }
+
+      ul.lock();
+      finisher_running = false;
+    }
+    ldout(cct, 10) << "finisher_thread empty" << dendl;
+    // Wake wait_for_empty() callers, if any registered interest.
+    if (unlikely(finisher_empty_wait))
+      finisher_empty_cond.notify_all();
+    if (finisher_stop)
+      break;
+    
+    ldout(cct, 10) << "finisher_thread sleeping" << dendl;
+    finisher_cond.wait(ul);
+  }
+  // If we are exiting, wake any thread still blocked in wait_for_empty(),
+  // otherwise it would never unblock.  (stop() itself waits via join().)
+  finisher_empty_cond.notify_all();
+
+  ldout(cct, 10) << "finisher_thread stop" << dendl;
+  // Clear the stop request before returning.
+  // NOTE(review): presumably so the finisher can be start()ed again -- confirm.
+  finisher_stop = false;
+  return 0;
+}
+
diff --git a/src/common/Finisher.h b/src/common/Finisher.h
new file mode 100644
index 000000000..f1060b0e4
--- /dev/null
+++ b/src/common/Finisher.h
@@ -0,0 +1,239 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_FINISHER_H
+#define CEPH_FINISHER_H
+
+#include "include/Context.h"
+#include "include/common_fwd.h"
+#include "common/Thread.h"
+#include "common/ceph_mutex.h"
+#include "common/perf_counters.h"
+#include "common/Cond.h"
+
+
+/// Finisher performance counter IDs.
+/// l_finisher_first and l_finisher_last bound the ID range handed to
+/// PerfCountersBuilder by the named Finisher constructor.
+enum {
+  l_finisher_first = 997082,
+  l_finisher_queue_len,     ///< "queue_len" counter
+  l_finisher_complete_lat,  ///< "complete_latency" time average
+  l_finisher_last
+};
+
+/** @brief Asynchronous cleanup class.
+ * Finisher asynchronously completes Contexts, which are simple classes
+ * representing callbacks, in a dedicated worker thread. Enqueuing
+ * contexts to complete is thread-safe.
+ */
+class Finisher {
+  CephContext *cct;
+  ceph::mutex finisher_lock; ///< Protects access to queues and finisher_running.
+  ceph::condition_variable finisher_cond; ///< Signaled when there is something to process.
+  ceph::condition_variable finisher_empty_cond; ///< Signaled when the finisher has nothing more to process.
+  bool         finisher_stop; ///< Set when the finisher should stop.
+  bool         finisher_running; ///< True when the finisher is currently executing contexts.
+  bool         finisher_empty_wait; ///< True when a thread is waiting for the finisher to become empty.
+
+  /// Contexts waiting to be completed, each paired with the result code
+  /// that will be passed to complete().
+  std::vector<std::pair<Context*,int>> finisher_queue;
+  /// Batch currently being completed by the worker thread.
+  std::vector<std::pair<Context*,int>> in_progress_queue;
+
+  std::string thread_name;
+
+  /// Performance counter for the finisher's queue length.
+  /// Only active for named finishers.
+  PerfCounters *logger;
+
+  void *finisher_thread_entry();
+
+  struct FinisherThread : public Thread {
+    Finisher *fin;
+    explicit FinisherThread(Finisher *f) : fin(f) {}
+    void* entry() override { return fin->finisher_thread_entry(); }
+  } finisher_thread;
+
+  /// Shared implementation of the container overloads of queue(): enqueue
+  /// every context in @p ls with result code 0, then empty @p ls.
+  template <typename Container>
+  void queue_container(Container& ls) {
+    {
+      std::unique_lock ul(finisher_lock);
+      // Notifying before the push is fine: the worker cannot look at the
+      // queue until we release finisher_lock.
+      if (finisher_queue.empty()) {
+	finisher_cond.notify_all();
+      }
+      for (auto i : ls) {
+	finisher_queue.push_back(std::make_pair(i, 0));
+      }
+      if (logger)
+	logger->inc(l_finisher_queue_len, ls.size());
+    }
+    ls.clear();
+  }
+
+ public:
+  /// Add a context to complete, optionally specifying a parameter for the complete function.
+  void queue(Context *c, int r = 0) {
+    std::unique_lock ul(finisher_lock);
+    bool was_empty = finisher_queue.empty();
+    finisher_queue.push_back(std::make_pair(c, r));
+    if (was_empty) {
+      finisher_cond.notify_one();
+    }
+    if (logger)
+      logger->inc(l_finisher_queue_len);
+  }
+
+  /// Add every context in @p ls (to be completed with 0); @p ls is emptied.
+  void queue(std::list<Context*>& ls) {
+    queue_container(ls);
+  }
+  /// Add every context in @p ls (to be completed with 0); @p ls is emptied.
+  void queue(std::deque<Context*>& ls) {
+    queue_container(ls);
+  }
+  /// Add every context in @p ls (to be completed with 0); @p ls is emptied.
+  void queue(std::vector<Context*>& ls) {
+    queue_container(ls);
+  }
+
+  /// Start the worker thread.
+  void start();
+
+  /** @brief Stop the worker thread.
+   *
+   * Does not wait until all outstanding contexts are completed.
+   * To ensure that everything finishes, you should first shut down
+   * all sources that can add contexts to this finisher and call
+   * wait_for_empty() before calling stop(). */
+  void stop();
+
+  /** @brief Blocks until the finisher has nothing left to process.
+   * This function will also return when a concurrent call to stop()
+   * finishes, but this class should never be used in this way. */
+  void wait_for_empty();
+
+  /// Construct an anonymous Finisher.
+  /// Anonymous finishers do not log their queue length.
+  explicit Finisher(CephContext *cct_) :
+    cct(cct_), finisher_lock(ceph::make_mutex("Finisher::finisher_lock")),
+    finisher_stop(false), finisher_running(false), finisher_empty_wait(false),
+    thread_name("fn_anonymous"), logger(0),
+    finisher_thread(this) {}
+
+  /// Construct a named Finisher that logs its queue length.
+  /// @param name used for the mutex name and the "finisher-<name>" counters
+  /// @param tn   name given to the worker thread
+  Finisher(CephContext *cct_, std::string name, std::string tn) :
+    cct(cct_), finisher_lock(ceph::make_mutex("Finisher::" + name)),
+    finisher_stop(false), finisher_running(false), finisher_empty_wait(false),
+    thread_name(tn), logger(0),
+    finisher_thread(this) {
+    PerfCountersBuilder b(cct, std::string("finisher-") + name,
+			  l_finisher_first, l_finisher_last);
+    b.add_u64(l_finisher_queue_len, "queue_len");
+    b.add_time_avg(l_finisher_complete_lat, "complete_latency");
+    logger = b.create_perf_counters();
+    cct->get_perfcounters_collection()->add(logger);
+    logger->set(l_finisher_queue_len, 0);
+    logger->set(l_finisher_complete_lat, 0);
+  }
+
+  /// Unregister and free the perf counters, if this finisher has any.
+  ~Finisher() {
+    if (logger && cct) {
+      cct->get_perfcounters_collection()->remove(logger);
+      delete logger;
+    }
+  }
+};
+
+/// Context wrapper that hands the wrapped context to a Finisher instead of
+/// completing it inline.  Owns the wrapped context until finish() runs.
+class C_OnFinisher : public Context {
+  Context *con;   ///< wrapped context; nullptr once handed to the finisher
+  Finisher *fin;  ///< finisher that will complete the wrapped context
+public:
+  C_OnFinisher(Context *c, Finisher *f) : con(c), fin(f) {
+    ceph_assert(fin != NULL);
+    ceph_assert(con != NULL);
+  }
+
+  ~C_OnFinisher() override {
+    // Still owned only if finish() never ran; deleting nullptr is a no-op.
+    delete con;
+    con = nullptr;
+  }
+
+  /// Queue the wrapped context on the finisher with result @p r.
+  void finish(int r) override {
+    Context *wrapped = con;
+    fin->queue(wrapped, r);
+    con = nullptr;
+  }
+};
+
+/// Thread-safe list of pending Contexts.
+///
+/// The list itself is guarded by an internal mutex.  The externally owned
+/// mutex/condition variable pair is used only to wake whoever consumes the
+/// queue when it goes from empty to non-empty.
+class ContextQueue {
+  std::list<Context *> q;
+  std::mutex q_mutex;              ///< protects q
+  ceph::mutex& mutex;              ///< caller's mutex, held while notifying
+  ceph::condition_variable& cond;  ///< caller's condition variable
+  std::atomic_bool q_empty = true; ///< lock-free mirror of q.empty()
+public:
+  ContextQueue(ceph::mutex& mut,
+	       ceph::condition_variable& con)
+    : mutex(mut), cond(con) {}
+
+  /// Move every context from @p ls to the back of the queue; @p ls is left
+  /// empty.  Waiters are notified if the queue was empty beforehand.
+  void queue(std::list<Context *>& ls) {
+    bool was_empty = false;
+    {
+      std::scoped_lock l(q_mutex);
+      was_empty = q.empty();
+      // splice relinks the nodes in O(1) instead of copying them and then
+      // clearing the source list.
+      q.splice(q.end(), ls);
+      q_empty = q.empty();
+    }
+
+    if (was_empty) {
+      std::scoped_lock l{mutex};
+      cond.notify_all();
+    }
+  }
+
+  /// Replace the contents of @p ls with everything currently queued and
+  /// leave the queue empty.
+  void move_to(std::list<Context *>& ls) {
+    ls.clear();
+    std::scoped_lock l(q_mutex);
+    if (!q.empty()) {
+      q.swap(ls);
+    }
+    q_empty = true;
+  }
+
+  /// Snapshot of whether the queue is empty, read without taking q_mutex.
+  bool empty() {
+    return q_empty;
+  }
+};
+
+#endif
diff --git a/src/common/FixedCDC.cc b/src/common/FixedCDC.cc
new file mode 100644
index 000000000..6e87f070e
--- /dev/null
+++ b/src/common/FixedCDC.cc
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma
+
+#include "FixedCDC.h"
+
+/// Split @p bl into consecutive chunk_size-byte pieces and append one
+/// (offset, length) pair per piece to @p chunks.  The last piece may be
+/// shorter; an empty buffer produces no chunks.
+void FixedCDC::calc_chunks(
+  const bufferlist& bl,
+  std::vector<std::pair<uint64_t, uint64_t>> *chunks) const
+{
+  const size_t total = bl.length();
+  size_t offset = 0;
+  while (offset < total) {
+    const size_t remaining = total - offset;
+    chunks->emplace_back(offset, std::min(chunk_size, remaining));
+    offset += chunk_size;
+  }
+}
diff --git a/src/common/FixedCDC.h b/src/common/FixedCDC.h
new file mode 100644
index 000000000..a19a1859b
--- /dev/null
+++ b/src/common/FixedCDC.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "CDC.h"
+
+/// CDC implementation that cuts the input into fixed-size chunks of
+/// 1 << target bytes; window_bits is accepted for interface compatibility
+/// but not used.
+class FixedCDC : public CDC {
+private:
+  size_t chunk_size;  ///< chunk length in bytes
+
+public:
+  FixedCDC(int target = 18, int window_bits = 0) {
+    set_target_bits(target, window_bits);
+  };
+
+  /// Set the chunk size to 1 << @p target bytes.
+  void set_target_bits(int target, int window_bits) override {
+    // Shift in size_t rather than unsigned long: the latter is only 32 bits
+    // wide on LLP64 platforms, which would make targets >= 32 undefined.
+    chunk_size = static_cast<size_t>(1) << target;
+  }
+  /// Append one (offset, length) pair per fixed-size chunk of @p bl.
+  void calc_chunks(
+    const bufferlist& bl,
+    std::vector<std::pair<uint64_t, uint64_t>> *chunks) const override;
+};
diff --git a/src/common/Formatter.cc b/src/common/Formatter.cc
new file mode 100644
index 000000000..f121afa07
--- /dev/null
+++ b/src/common/Formatter.cc
@@ -0,0 +1,968 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#define LARGE_SIZE 1024
+
+#include "HTMLFormatter.h"
+#include "common/escape.h"
+#include "include/buffer.h"
+
+#include <fmt/format.h>
+#include <algorithm>
+#include <set>
+#include <limits>
+
+// -----------------------
+namespace ceph {
+
+/// Render @p num as a fixed-point decimal with @p scale fractional digits,
+/// e.g. (1234, 2) -> "12.34".  The value is zero-padded so that at least one
+/// digit precedes the decimal point.
+std::string
+fixed_u_to_string(uint64_t num, int scale)
+{
+	std::ostringstream padded;
+	padded.fill('0');
+	padded.width(scale + 1);
+	padded << num;
+
+	const std::string digits = padded.str();
+	const int len = digits.size();
+	const std::string whole = digits.substr(0, len - scale);
+	const std::string fraction = digits.substr(len - scale);
+	return whole + "." + fraction;
+}
+
+/// Signed counterpart of fixed_u_to_string(): render @p num as a fixed-point
+/// decimal with @p scale fractional digits, e.g. (-1234, 2) -> "-12.34".
+std::string
+fixed_to_string(int64_t num, int scale)
+{
+	const bool neg = num < 0;
+	// Take the absolute value in unsigned arithmetic: negating INT64_MIN as
+	// a signed value overflows, which is undefined behaviour.
+	uint64_t magnitude = static_cast<uint64_t>(num);
+	if (neg)
+		magnitude = 0 - magnitude;
+
+	std::ostringstream t;
+	t.fill('0');
+	t.width(scale + 1);
+	t << magnitude;
+
+	const std::string digits = t.str();
+	const int len = digits.size();
+	return (neg ? "-" : "") + digits.substr(0, len - scale) + "." +
+		digits.substr(len - scale);
+}
+
+/*
+ * FormatterAttrs(const char *attr, ...)
+ *
+ * Requires a list of attrs followed by NULL. The attrs should be char *
+ * pairs, first one is the name, second one is the value. E.g.,
+ *
+ * FormatterAttrs("name1", "value1", "name2", "value2", NULL);
+ */
+FormatterAttrs::FormatterAttrs(const char *attr, ...)
+{
+  va_list ap;
+  va_start(ap, attr);
+
+  // The arguments alternate name, value, name, value, ... and the list ends
+  // at the first NULL in either position.
+  const char *key = attr;
+  for (;;) {
+    const char *value = va_arg(ap, char *);
+    if (value == nullptr)
+      break;
+    attrs.emplace_back(key, value);
+
+    key = va_arg(ap, char *);
+    if (key == nullptr)
+      break;
+  }
+
+  va_end(ap);
+}
+
+/// Base-class implementation: ignores its arguments and does nothing.
+/// (XMLFormatter defines its own write_bin_data.)
+void Formatter::write_bin_data(const char*, int){}
+
+/// Nothing to initialise in the base class.
+Formatter::Formatter() { }
+
+/// Nothing to release in the base class.
+Formatter::~Formatter() { }
+
+/// Create a formatter by type name.
+///
+/// @param type          requested type; an empty string selects default_type
+/// @param default_type  type used when @p type is empty
+/// @param fallback      type tried when the requested one is unknown;
+///                      empty means "no fallback"
+/// @return a newly allocated formatter owned by the caller, or a null
+///         pointer when neither the request nor the fallback is recognised
+Formatter *Formatter::create(std::string_view type,
+			     std::string_view default_type,
+			     std::string_view fallback)
+{
+  const std::string_view wanted = type.empty() ? default_type : type;
+
+  if (wanted == "json") {
+    return new JSONFormatter(false);
+  }
+  if (wanted == "json-pretty") {
+    return new JSONFormatter(true);
+  }
+  if (wanted == "xml") {
+    return new XMLFormatter(false);
+  }
+  if (wanted == "xml-pretty") {
+    return new XMLFormatter(true);
+  }
+  if (wanted == "table") {
+    return new TableFormatter();
+  }
+  if (wanted == "table-kv") {
+    return new TableFormatter(true);
+  }
+  if (wanted == "html") {
+    return new HTMLFormatter(false);
+  }
+  if (wanted == "html-pretty") {
+    return new HTMLFormatter(true);
+  }
+
+  // Unknown type: retry once with the fallback, which itself gets neither a
+  // default nor a further fallback.
+  if (!fallback.empty()) {
+    return create(fallback, "", "");
+  }
+  return nullptr;
+}
+
+
+/// Flush the formatter into a temporary stream and append the resulting
+/// text to @p bl.
+void Formatter::flush(bufferlist &bl)
+{
+  std::stringstream rendered;
+  flush(rendered);
+  const std::string text = rendered.str();
+  bl.append(text);
+}
+
+/// printf-style dump of a quoted value without a namespace.
+void Formatter::dump_format(std::string_view name, const char *fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  dump_format_va(name, nullptr, /* quoted */ true, fmt, args);
+  va_end(args);
+}
+
+/// printf-style dump of a quoted value tagged with namespace @p ns.
+void Formatter::dump_format_ns(std::string_view name, const char *ns, const char *fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  dump_format_va(name, ns, /* quoted */ true, fmt, args);
+  va_end(args);
+}
+
+/// printf-style dump of an unquoted value without a namespace.
+void Formatter::dump_format_unquoted(std::string_view name, const char *fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  dump_format_va(name, nullptr, /* quoted */ false, fmt, args);
+  va_end(args);
+}
+
+// -----------------------
+
+/// @param p  true for pretty-printed (indented, multi-line) output
+JSONFormatter::JSONFormatter(bool p)
+: m_pretty(p), m_is_pending_string(false)
+{
+  // start with an empty stack and empty output buffers
+  reset();
+}
+
+/// Write everything formatted so far to @p os and empty the output buffer.
+/// A trailing newline is added when line breaks are enabled.
+void JSONFormatter::flush(std::ostream& os)
+{
+  // a value started with dump_stream() must be emitted first
+  finish_pending_string();
+
+  const std::string rendered = m_ss.str();
+  os << rendered;
+  if (m_line_break_enabled) {
+    os << "\n";
+  }
+
+  m_ss.str("");
+  m_ss.clear();
+}
+
+/// Discard all state: open sections, buffered output and any value that was
+/// started with dump_stream() but not yet emitted.
+void JSONFormatter::reset()
+{
+  m_stack.clear();
+  m_ss.clear();
+  m_ss.str("");
+  m_pending_string.clear();
+  m_pending_string.str("");
+  // Forget the pending dump_stream() value as well; otherwise the next
+  // finish_pending_string() would emit a stale, empty entry under the old
+  // name into the fresh document.
+  m_pending_name.clear();
+  m_is_pending_string = false;
+}
+
+/// Emit the separator that precedes the next item of @p entry: a comma when
+/// the section already has items and, in pretty mode, a newline followed by
+/// the indentation for the current nesting depth.
+void JSONFormatter::print_comma(json_formatter_stack_entry_d& entry)
+{
+  if (entry.size) {
+    m_ss << ",";
+  }
+  if (!m_pretty) {
+    return;
+  }
+
+  m_ss << "\n";
+  for (size_t depth = m_stack.size(); depth > 1; --depth) {
+    m_ss << "    ";
+  }
+  // array items carry no name, so they get their extra indent here
+  if (entry.is_array) {
+    m_ss << "    ";
+  }
+}
+
+/// Write @p s to the output surrounded by double quotes, passing the
+/// contents through json_stream_escaper.
+void JSONFormatter::print_quoted_string(std::string_view s)
+{
+  m_ss << '\"' << json_stream_escaper(s) << '\"';
+}
+
+/// Start a new item in the innermost open section: emit the separator and,
+/// unless that section is an array, the quoted key followed by a colon.
+/// Does nothing besides flushing a pending string when no section is open.
+void JSONFormatter::print_name(std::string_view name)
+{
+  finish_pending_string();
+  if (m_stack.empty()) {
+    return;
+  }
+
+  json_formatter_stack_entry_d& parent = m_stack.back();
+  print_comma(parent);
+  if (!parent.is_array) {
+    if (m_pretty) {
+      m_ss << "    ";
+    }
+    // note: the key is written as-is, without JSON escaping
+    m_ss << "\"" << name << "\"";
+    m_ss << (m_pretty ? ": " : ":");
+  }
+  ++parent.size;
+}
+
+/// Open an object or array section called @p name.  When @p ns is given the
+/// key becomes "<name> <ns>".  The handle_open_section() hook may take over
+/// the call entirely.
+void JSONFormatter::open_section(std::string_view name, const char *ns, bool is_array)
+{
+  if (handle_open_section(name, ns, is_array)) {
+    return;
+  }
+
+  if (ns == nullptr) {
+    print_name(name);
+  } else {
+    std::ostringstream qualified;
+    qualified << name << " " << ns;
+    print_name(qualified.str().c_str());
+  }
+  m_ss << (is_array ? '[' : '{');
+
+  json_formatter_stack_entry_d opened;
+  opened.is_array = is_array;
+  m_stack.push_back(opened);
+}
+
+/// Open an array section without a namespace.
+void JSONFormatter::open_array_section(std::string_view name)
+{
+  open_section(name, nullptr, true);
+}
+
+/// Open an array section tagged with namespace @p ns.
+void JSONFormatter::open_array_section_in_ns(std::string_view name, const char *ns)
+{
+  open_section(name, ns, true);
+}
+
+/// Open an object section without a namespace.
+void JSONFormatter::open_object_section(std::string_view name)
+{
+  open_section(name, nullptr, false);
+}
+
+/// Open an object section tagged with namespace @p ns.
+void JSONFormatter::open_object_section_in_ns(std::string_view name, const char *ns)
+{
+  open_section(name, ns, false);
+}
+
+/// Close the innermost open section, emitting the matching bracket.  The
+/// handle_close_section() hook may take over the call entirely.  Asserts
+/// that a section is open.
+void JSONFormatter::close_section()
+{
+  if (handle_close_section()) {
+    return;
+  }
+  ceph_assert(!m_stack.empty());
+  finish_pending_string();
+
+  const bool closing_array = m_stack.back().is_array;
+  const bool has_items = m_stack.back().size != 0;
+
+  // In pretty mode a non-empty section gets its closing bracket on a line
+  // of its own, indented to the depth of the section.
+  if (m_pretty && has_items) {
+    m_ss << "\n";
+    for (size_t depth = m_stack.size(); depth > 1; --depth) {
+      m_ss << "    ";
+    }
+  }
+  m_ss << (closing_array ? ']' : '}');
+  m_stack.pop_back();
+
+  // the outermost section ends the document
+  if (m_pretty && m_stack.empty()) {
+    m_ss << "\n";
+  }
+}
+
+/// Emit the value that was started with dump_stream(), if there is one, as
+/// a quoted string under the remembered name, then empty the pending buffer.
+void JSONFormatter::finish_pending_string()
+{
+  if (m_is_pending_string) {
+    // clear the flag first: add_value() ends up in print_name(), which
+    // calls back into this function
+    m_is_pending_string = false;
+    add_value(m_pending_name.c_str(), m_pending_string.str(), true);
+    m_pending_string.str("");
+  }
+}
+
+/// Format @p val with operator<< at the type's max_digits10 precision and
+/// add it as an unquoted value.
+template <class T>
+void JSONFormatter::add_value(std::string_view name, T val)
+{
+  std::ostringstream rendered;
+  rendered.precision(std::numeric_limits<T>::max_digits10);
+  rendered << val;
+  add_value(name, rendered.str(), false);
+}
+
+/// Add @p val under @p name, either verbatim or as an escaped, quoted
+/// string.  The handle_value() hook may take over the call entirely.
+void JSONFormatter::add_value(std::string_view name, std::string_view val, bool quoted)
+{
+  if (handle_value(name, val, quoted)) {
+    return;
+  }
+
+  print_name(name);
+  if (quoted) {
+    print_quoted_string(val);
+  } else {
+    m_ss << val;
+  }
+}
+
+/// Add the literal null (unquoted) under @p name.
+void JSONFormatter::dump_null(std::string_view name)
+{
+  add_value(name, "null");
+}
+
+/// Add an unsigned integer as an unquoted value.
+void JSONFormatter::dump_unsigned(std::string_view name, uint64_t u)
+{
+  add_value(name, u);
+}
+
+/// Add a signed integer as an unquoted value.
+void JSONFormatter::dump_int(std::string_view name, int64_t s)
+{
+  add_value(name, s);
+}
+
+/// Add a floating point number as an unquoted value.
+void JSONFormatter::dump_float(std::string_view name, double d)
+{
+  add_value(name, d);
+}
+
+/// Add @p s as a quoted, escaped string.
+void JSONFormatter::dump_string(std::string_view name, std::string_view s)
+{
+  add_value(name, s, true);
+}
+
+/// Start a string value whose contents the caller writes to the returned
+/// stream.  The value is emitted (quoted) by the next call to
+/// finish_pending_string(); a previously pending value is emitted first.
+std::ostream& JSONFormatter::dump_stream(std::string_view name)
+{
+  finish_pending_string();
+  m_pending_name = name;
+  m_is_pending_string = true;
+  return m_pending_string;
+}
+
+/// printf-style value: format @p fmt / @p ap and add the text under @p name,
+/// quoted or not as requested.  @p ns is not used by the JSON formatter.
+///
+/// Results that fit in LARGE_SIZE bytes are formatted on the stack; longer
+/// ones are formatted a second time into a heap buffer instead of being
+/// truncated.  If formatting fails, an empty value is added.
+void JSONFormatter::dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  // vsnprintf consumes `ap`, so keep a copy for a possible second pass
+  va_list ap_retry;
+  va_copy(ap_retry, ap);
+
+  char buf[LARGE_SIZE];
+  std::string big;
+  const char *text = buf;
+
+  const int len = vsnprintf(buf, LARGE_SIZE, fmt, ap);
+  if (len < 0) {
+    // encoding error: the contents of buf are unspecified
+    text = "";
+  } else if (len >= LARGE_SIZE) {
+    // the return value is the length the complete output needs
+    big.resize(static_cast<size_t>(len) + 1);
+    vsnprintf(big.data(), big.size(), fmt, ap_retry);
+    text = big.c_str();
+  }
+  va_end(ap_retry);
+
+  add_value(name, text, quoted);
+}
+
+/// Number of bytes currently buffered for output (excluding any pending
+/// dump_stream() value, which has not been added to the buffer yet).
+int JSONFormatter::get_len() const
+{
+  return m_ss.str().size();
+}
+
+/// Append @p data to the output verbatim: no quoting, escaping or
+/// separators.
+void JSONFormatter::write_raw_data(const char *data)
+{
+  m_ss << data;
+}
+
+/// XML declaration written by output_header().
+const char *XMLFormatter::XML_1_DTD =
+  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+
+/// @param pretty       emit newlines and indentation
+/// @param lowercased   lower-case element names (see to_lower_underscore)
+/// @param underscored  replace spaces in element names with underscores
+XMLFormatter::XMLFormatter(bool pretty, bool lowercased, bool underscored)
+: m_pretty(pretty),
+  m_lowercased(lowercased),
+  m_underscored(underscored)
+{
+  reset();
+}
+
+/// Write everything formatted so far to @p os and empty the output buffer.
+void XMLFormatter::flush(std::ostream& os)
+{
+  finish_pending_string();
+
+  const std::string rendered = m_ss.str();
+  os << rendered;
+
+  // Pretty output ends with a newline, but only if there was any output at
+  // all: an otherwise empty body (e.g. on HTTP redirects) must stay empty.
+  // An explicitly enabled line break is always honoured.
+  const bool pretty_with_output = m_pretty && !rendered.empty();
+  if (pretty_with_output || m_line_break_enabled) {
+    os << "\n";
+  }
+
+  m_ss.str("");
+  m_ss.clear();
+}
+
+/// Discard all state: open sections, buffered output, any pending
+/// dump_stream() value and the "header written" flag.
+void XMLFormatter::reset()
+{
+  // structural state
+  m_sections.clear();
+  m_header_done = false;
+
+  // output buffer
+  m_ss.str("");
+  m_ss.clear();
+
+  // pending dump_stream() value
+  m_pending_string_name.clear();
+  m_pending_string.str("");
+  m_pending_string.clear();
+}
+
+/// Write the XML declaration, at most once until the next reset().
+void XMLFormatter::output_header()
+{
+  if (m_header_done) {
+    return;
+  }
+  m_header_done = true;
+
+  write_raw_data(XMLFormatter::XML_1_DTD);
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+/// Close every section that is still open, innermost first.
+void XMLFormatter::output_footer()
+{
+  while(!m_sections.empty()) {
+    close_section();
+  }
+}
+
+// Object and array sections produce the same XML: all six variants below
+// forward to open_section_in_ns() and differ only in whether a namespace
+// and/or attributes are supplied.
+
+/// Open an element without namespace or attributes.
+void XMLFormatter::open_object_section(std::string_view name)
+{
+  open_section_in_ns(name, NULL, NULL);
+}
+
+/// Open an element carrying @p attrs.
+void XMLFormatter::open_object_section_with_attrs(std::string_view name, const FormatterAttrs& attrs)
+{
+  open_section_in_ns(name, NULL, &attrs);
+}
+
+/// Open an element with an xmlns attribute set to @p ns.
+void XMLFormatter::open_object_section_in_ns(std::string_view name, const char *ns)
+{
+  open_section_in_ns(name, ns, NULL);
+}
+
+/// Open an element without namespace or attributes.
+void XMLFormatter::open_array_section(std::string_view name)
+{
+  open_section_in_ns(name, NULL, NULL);
+}
+
+/// Open an element carrying @p attrs.
+void XMLFormatter::open_array_section_with_attrs(std::string_view name, const FormatterAttrs& attrs)
+{
+  open_section_in_ns(name, NULL, &attrs);
+}
+
+/// Open an element with an xmlns attribute set to @p ns.
+void XMLFormatter::open_array_section_in_ns(std::string_view name, const char *ns)
+{
+  open_section_in_ns(name, ns, NULL);
+}
+
+/// Return @p name with every character passed through
+/// to_lower_underscore(), i.e. adjusted to the configured naming style.
+std::string XMLFormatter::get_xml_name(std::string_view name) const
+{
+  std::string xml_name(name);
+  for (char& ch : xml_name) {
+    ch = to_lower_underscore(ch);
+  }
+  return xml_name;
+}
+
+/// Close the innermost open element.  Asserts that one is open.
+void XMLFormatter::close_section()
+{
+  ceph_assert(!m_sections.empty());
+  finish_pending_string();
+
+  auto section = get_xml_name(m_sections.back());
+  // pop before print_spaces() so the closing tag is indented to the depth
+  // of its opening tag
+  m_sections.pop_back();
+  print_spaces();
+  m_ss << "</" << section << ">";
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+/// Write <name>val</name>, formatting @p val with operator<<.
+/// The value is not escaped.
+template <class T>
+void XMLFormatter::add_value(std::string_view name, T val)
+{
+  auto e = get_xml_name(name);
+  print_spaces();
+  // note: this sets the precision on the shared output stream, so it stays
+  // in effect for later insertions until changed again
+  m_ss.precision(std::numeric_limits<T>::max_digits10);
+  m_ss << "<" << e << ">" << val << "</" << e << ">";
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+/// Write an empty element marked with xsi:nil="true".
+void XMLFormatter::dump_null(std::string_view name)
+{
+  print_spaces();
+  m_ss << "<" << get_xml_name(name) << " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:nil=\"true\" />";
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+/// Write an unsigned integer element.
+void XMLFormatter::dump_unsigned(std::string_view name, uint64_t u)
+{
+  add_value(name, u);
+}
+
+/// Write a signed integer element.
+void XMLFormatter::dump_int(std::string_view name, int64_t s)
+{
+  add_value(name, s);
+}
+
+/// Write a floating point element.
+void XMLFormatter::dump_float(std::string_view name, double d)
+{
+  add_value(name, d);
+}
+
+/// Write <name>s</name> with @p s passed through xml_stream_escaper.
+void XMLFormatter::dump_string(std::string_view name, std::string_view s)
+{
+  auto e = get_xml_name(name);
+  print_spaces();
+  m_ss << "<" << e << ">" << xml_stream_escaper(s) << "</" << e << ">";
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+/// Like dump_string(), with @p attrs rendered inside the opening tag.
+/// The element text is escaped; the attribute text comes from
+/// get_attrs_str(), which writes names and values verbatim.
+void XMLFormatter::dump_string_with_attrs(std::string_view name, std::string_view s, const FormatterAttrs& attrs)
+{
+  auto e = get_xml_name(name);
+  std::string attrs_str;
+  get_attrs_str(&attrs, attrs_str);
+  print_spaces();
+  m_ss << "<" << e << attrs_str << ">" << xml_stream_escaper(s) << "</" << e << ">";
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+/// Start an element whose text the caller writes to the returned stream.
+/// The opening tag is written immediately; the escaped text and the closing
+/// tag follow on the next finish_pending_string().
+///
+/// NOTE(review): unlike the other dump_* methods the tag is written as
+/// given, without going through get_xml_name() -- confirm this is intended
+/// for lowercased/underscored formatters.
+std::ostream& XMLFormatter::dump_stream(std::string_view name)
+{
+  print_spaces();
+  m_pending_string_name = name;
+  m_ss << "<" << m_pending_string_name << ">";
+  return m_pending_string;
+}
+
+/// printf-style element: format @p fmt / @p ap, escape the result and write
+/// it as <name>text</name>, with an xmlns attribute when @p ns is given.
+/// @p quoted is not used by the XML formatter.
+///
+/// Results that fit in LARGE_SIZE bytes are formatted on the stack; longer
+/// ones are formatted a second time into a heap buffer.  If formatting
+/// fails, the element is written with empty text.
+void XMLFormatter::dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  // vsnprintf consumes `ap`, so keep a copy for a possible second pass
+  va_list ap_retry;
+  va_copy(ap_retry, ap);
+
+  char buf[LARGE_SIZE];
+  std::string big;
+  std::string_view text;
+
+  // The return value is the length the complete output needs, which can
+  // exceed the buffer (or be negative on error), so it must not be used as
+  // the length of buf without checking.
+  const int len = vsnprintf(buf, LARGE_SIZE, fmt, ap);
+  if (len >= LARGE_SIZE) {
+    big.resize(static_cast<size_t>(len) + 1);
+    vsnprintf(big.data(), big.size(), fmt, ap_retry);
+    big.resize(len);
+    text = big;
+  } else if (len > 0) {
+    text = std::string_view(buf, len);
+  }
+  va_end(ap_retry);
+
+  auto e = get_xml_name(name);
+
+  print_spaces();
+  if (ns) {
+    m_ss << "<" << e << " xmlns=\"" << ns << "\">" << xml_stream_escaper(text) << "</" << e << ">";
+  } else {
+    m_ss << "<" << e << ">" << xml_stream_escaper(text) << "</" << e << ">";
+  }
+
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+/// Number of bytes currently buffered for output.
+int XMLFormatter::get_len() const
+{
+  return m_ss.str().size();
+}
+
+/// Append @p data to the output verbatim: no escaping or indentation.
+void XMLFormatter::write_raw_data(const char *data)
+{
+  m_ss << data;
+}
+
+/// Append @p buf_len raw bytes from @p buff to the output buffer, writing
+/// through the stream buffer directly so that embedded NUL bytes survive.
+void XMLFormatter::write_bin_data(const char* buff, int buf_len)
+{
+  std::stringbuf *pbuf = m_ss.rdbuf();
+  pbuf->sputn(buff, buf_len);
+  // NOTE(review): this moves the *read* position to absolute offset
+  // buf_len; the purpose is not evident from this file -- confirm.
+  m_ss.seekg(buf_len);
+}
+
+/// Render @p attrs as ` name="value"` pairs (each with a leading space) into
+/// @p attrs_str.  Names and values are written verbatim, without escaping.
+void XMLFormatter::get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str)
+{
+  std::stringstream attrs_ss;
+  for (const auto& [attr_name, attr_value] : attrs->attrs) {
+    attrs_ss << " " << attr_name << "=" << "\"" << attr_value << "\"";
+  }
+  attrs_str = attrs_ss.str();
+}
+
+/// Write the opening tag for @p name, with optional attributes and optional
+/// xmlns attribute, and remember the section so close_section() can write
+/// the matching closing tag.
+void XMLFormatter::open_section_in_ns(std::string_view name, const char *ns, const FormatterAttrs *attrs)
+{
+  print_spaces();
+
+  std::string attrs_str;
+  if (attrs != nullptr) {
+    get_attrs_str(attrs, attrs_str);
+  }
+
+  const auto tag = get_xml_name(name);
+  m_ss << "<" << tag << attrs_str;
+  if (ns != nullptr) {
+    m_ss << " xmlns=\"" << ns << "\"";
+  }
+  m_ss << ">";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+
+  // the untransformed name is stored; close_section() transforms it again
+  m_sections.push_back(std::string(name));
+}
+
+/// Complete the element started by dump_stream(), if any: write the escaped
+/// text collected so far followed by the closing tag.
+void XMLFormatter::finish_pending_string()
+{
+  if (m_pending_string_name.empty()) {
+    return;
+  }
+
+  m_ss << xml_stream_escaper(m_pending_string.str());
+  m_ss << "</" << m_pending_string_name << ">";
+
+  m_pending_string_name.clear();
+  m_pending_string.str(std::string());
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+/// Finish any pending dump_stream() element, then, in pretty mode, indent
+/// by one space per open section.
+void XMLFormatter::print_spaces()
+{
+  finish_pending_string();
+  if (!m_pretty) {
+    return;
+  }
+  m_ss << std::string(m_sections.size(), ' ');
+}
+
+/// Map one character of an element name to the configured naming style:
+/// a space becomes '_' when underscoring is enabled, otherwise the character
+/// is lower-cased when lower-casing is enabled, otherwise it is unchanged.
+char XMLFormatter::to_lower_underscore(char c) const
+{
+  if (m_underscored && c == ' ') {
+    return '_';
+  } else if (m_lowercased) {
+    // std::tolower requires a value representable as unsigned char (or
+    // EOF); passing a negative plain char is undefined behaviour.
+    return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+  }
+  return c;
+}
+
+/// @param keyval  true to print rows as key::name="value" pairs instead of
+///                an ASCII table (see flush())
+TableFormatter::TableFormatter(bool keyval) : m_keyval(keyval)
+{
+  reset();
+}
+
+/// Print all collected rows to @p os and discard them.
+///
+/// In table mode the output is an ASCII table: a header (when the column
+/// layout changed since the previous flush), one line per row and a closing
+/// border.  In key/value mode each cell is printed as key::name="value".
+/// Column widths are remembered in m_column_size across calls.
+void TableFormatter::flush(std::ostream& os)
+{
+  finish_pending_string();
+  std::vector<size_t> column_size = m_column_size;
+  std::vector<std::string> column_name = m_column_name;
+
+  std::set<int> need_header_set;
+
+  // Size the per-column bookkeeping for the widest row, so that indexing by
+  // column below stays in bounds even if the rows differ in length.
+  size_t num_columns = 0;
+  for (const auto& row : m_vec) {
+    num_columns = std::max(num_columns, row.size());
+  }
+  if (num_columns > 0) {
+    column_size.resize(num_columns);
+    column_name.resize(num_columns);
+  }
+
+  // auto-sizing columns
+  for (size_t i = 0; i < m_vec.size(); i++) {
+    for (size_t j = 0; j < m_vec[i].size(); j++) {
+      const auto& cell = m_vec[i][j];
+      if (i > 0) {
+        // a column that the previous row does not have counts as a change
+        if (j >= m_vec[i - 1].size() || m_vec[i - 1][j] != cell) {
+          // changing row labels require to show the header
+          need_header_set.insert(i);
+          // indexed by column: column_name has one slot per column
+          column_name[j] = cell.first;
+        }
+      } else {
+        column_name[j] = cell.first;
+      }
+
+      column_size[j] = std::max(column_size[j], cell.second.length());
+      column_size[j] = std::max(column_size[j], cell.first.length());
+    }
+  }
+
+  // a header is needed whenever the column layout differs from last time
+  bool need_header = false;
+  if ((column_size.size() == m_column_size.size())) {
+    for (size_t i = 0; i < column_size.size(); i++) {
+      if (column_size[i] != m_column_size[i]) {
+        need_header = true;
+        break;
+      }
+    }
+  } else {
+    need_header = true;
+  }
+
+  if (need_header) {
+    // first row always needs a header if there wasn't one before
+    need_header_set.insert(0);
+  }
+
+  m_column_size = column_size;
+
+  // "+-----+-----+" line matching the columns of the given row
+  const auto print_border = [&](const auto& row) {
+    os << "+";
+    for (size_t j = 0; j < row.size(); j++) {
+      for (size_t v = 0; v < m_column_size[j] + 3; v++)
+        os << "-";
+      os << "+";
+    }
+    os << "\n";
+  };
+
+  for (size_t i = 0; i < m_vec.size(); i++) {
+    auto& row = m_vec[i];
+
+    if (i == 0 && need_header_set.count(i) && !m_keyval) {
+      // print the header
+      print_border(row);
+      os << "|";
+      for (size_t j = 0; j < row.size(); j++) {
+        os << fmt::format(" {:<{}}|",
+                          row[j].first, m_column_size[j] + 2);
+      }
+      os << "\n";
+      print_border(row);
+    }
+
+    // print body
+    if (!m_keyval)
+      os << "|";
+    for (size_t j = 0; j < row.size(); j++) {
+      if (m_keyval) {
+        os << "key::";
+        os << row[j].first;
+        os << "=";
+        os << "\"";
+        os << row[j].second;
+        os << "\" ";
+      } else {
+        os << " ";
+        os << fmt::format("{:<{}}|", row[j].second, m_column_size[j] + 2);
+      }
+    }
+    os << "\n";
+
+    if (!m_keyval && i == (m_vec.size() - 1)) {
+      // print trailer
+      print_border(row);
+    }
+    row.clear();
+  }
+  m_vec.clear();
+}
+
+/// Clear the scratch stream, the per-section counters, the remembered
+/// column widths and the open-section count.
+///
+/// NOTE(review): rows already collected (m_vec), the section name stack
+/// (m_section) and a pending dump_stream() name (m_pending_name) are not
+/// touched here -- confirm whether that is intended.
+void TableFormatter::reset()
+{
+  m_ss.clear();
+  m_ss.str("");
+  m_section_cnt.clear();
+  m_column_size.clear();
+  m_section_open = 0;
+}
+
+// The table formatter does not distinguish objects from arrays and ignores
+// namespaces and attributes: all six variants below open a plain section.
+
+/// Open a section named @p name.
+void TableFormatter::open_object_section(std::string_view name)
+{
+  open_section_in_ns(name, NULL, NULL);
+}
+
+/// Open a section named @p name; @p attrs is ignored.
+void TableFormatter::open_object_section_with_attrs(std::string_view name, const FormatterAttrs& attrs)
+{
+  open_section_in_ns(name, NULL, NULL);
+}
+
+/// Open a section named @p name; @p ns is ignored.
+void TableFormatter::open_object_section_in_ns(std::string_view name, const char *ns)
+{
+  open_section_in_ns(name, NULL, NULL);
+}
+
+/// Open a section named @p name.
+void TableFormatter::open_array_section(std::string_view name)
+{
+  open_section_in_ns(name, NULL, NULL);
+}
+
+/// Open a section named @p name; @p attrs is ignored.
+void TableFormatter::open_array_section_with_attrs(std::string_view name, const FormatterAttrs& attrs)
+{
+  open_section_in_ns(name, NULL, NULL);
+}
+
+/// Open a section named @p name; @p ns is ignored.
+void TableFormatter::open_array_section_in_ns(std::string_view name, const char *ns)
+{
+  open_section_in_ns(name, NULL, NULL);
+}
+
+/// Push @p name onto the section stack and count the section as open.
+/// @p ns and @p attrs are not used.
+void TableFormatter::open_section_in_ns(std::string_view name, const char *ns, const FormatterAttrs *attrs)
+{
+  m_section.push_back(std::string(name));
+  m_section_open++;
+}
+
+/// Close the innermost section: decrement the open count, reset the
+/// counter stored under that section's name and pop it from the stack.
+void TableFormatter::close_section()
+{
+  //
+  // note: decremented even when the section stack is already empty
+  m_section_open--;
+  if (m_section.size()) {
+    m_section_cnt[m_section.back()] = 0;
+    m_section.pop_back();
+  }
+}
+
+/// Return the index of the row that the next value named @p name belongs
+/// to, creating rows as needed.  A new row is started when @p name equals
+/// the first key of the current (last) row.
+size_t TableFormatter::m_vec_index(std::string_view name)
+{
+  const std::string key(name);
+
+  // make sure there is at least one row to push key/val pairs into
+  if (m_vec.empty()) {
+    m_vec.resize(1);
+  }
+
+  size_t row = m_vec.size() - 1;
+  const bool key_repeats = !m_vec[row].empty() && m_vec[row][0].first == key;
+  if (key_repeats) {
+    // a repeated first key marks the start of the next row
+    m_vec.resize(m_vec.size() + 1);
+    ++row;
+  }
+  return row;
+}
+
+/// Build the column label for @p name: every entry of the section stack is
+/// prepended as "section:" (so the innermost section ends up first) and,
+/// while a section is open, a running "[n]" index is appended.
+std::string TableFormatter::get_section_name(std::string_view name)
+{
+  std::string qualified{name};
+  for (const auto& section : m_section) {
+    qualified.insert(0, section + ":");
+  }
+
+  if (!m_section_open) {
+    return qualified;
+  }
+
+  std::stringstream indexed;
+  indexed << qualified;
+  indexed << "[";
+  indexed << m_section_cnt[qualified]++;
+  indexed << "]";
+  return indexed.str();
+}
+
+/// Format @p val with operator<< and store it as a cell in the current row,
+/// labelled with get_section_name(name).
+template <class T>
+void TableFormatter::add_value(std::string_view name, T val) {
+  finish_pending_string();
+  size_t i = m_vec_index(name);
+  // the precision of double is used regardless of T
+  m_ss.precision(std::numeric_limits<double>::max_digits10);
+  m_ss << val;
+
+  m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str()));
+  // m_ss is only scratch space here; empty it for the next value
+  m_ss.clear();
+  m_ss.str("");
+}
+
+/// Store the text "null" as the cell value.
+void TableFormatter::dump_null(std::string_view name)
+{
+  add_value(name, "null");
+}
+
+/// Store an unsigned integer cell.
+void TableFormatter::dump_unsigned(std::string_view name, uint64_t u)
+{
+  add_value(name, u);
+}
+
+/// Store a signed integer cell.
+void TableFormatter::dump_int(std::string_view name, int64_t s)
+{
+  add_value(name, s);
+}
+
+/// Store a floating point cell.
+void TableFormatter::dump_float(std::string_view name, double d)
+{
+  add_value(name, d);
+}
+
+/// Store @p s verbatim as a cell in the current row.
+void TableFormatter::dump_string(std::string_view name, std::string_view s)
+{
+  finish_pending_string();
+  size_t i = m_vec_index(name);
+  m_ss << s;
+
+  m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str()));
+  // m_ss is only scratch space here; empty it for the next value
+  m_ss.clear();
+  m_ss.str("");
+}
+
+/// Store a cell whose text is the rendered attributes (see get_attrs_str)
+/// immediately followed by @p s.
+void TableFormatter::dump_string_with_attrs(std::string_view name, std::string_view s, const FormatterAttrs& attrs)
+{
+  finish_pending_string();
+  size_t i = m_vec_index(name);
+
+  std::string attrs_str;
+  get_attrs_str(&attrs, attrs_str);
+  m_ss << attrs_str << s;
+
+  m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str()));
+  // m_ss is only scratch space here; empty it for the next value
+  m_ss.clear();
+  m_ss.str("");
+}
+
+/// printf-style cell: format @p fmt / @p ap and store the text as a cell in
+/// the current row, prefixed with "<ns>." when @p ns is given.  @p quoted is
+/// not used by the table formatter.
+///
+/// Results that fit in LARGE_SIZE bytes are formatted on the stack; longer
+/// ones are formatted a second time into a heap buffer instead of being
+/// truncated.  If formatting fails, the cell text is empty.
+void TableFormatter::dump_format_va(std::string_view name,
+				    const char *ns, bool quoted,
+				    const char *fmt, va_list ap)
+{
+  finish_pending_string();
+
+  // vsnprintf consumes `ap`, so keep a copy for a possible second pass
+  va_list ap_retry;
+  va_copy(ap_retry, ap);
+
+  char buf[LARGE_SIZE];
+  std::string big;
+  const char *text = buf;
+
+  const int len = vsnprintf(buf, LARGE_SIZE, fmt, ap);
+  if (len < 0) {
+    // encoding error: the contents of buf are unspecified
+    text = "";
+  } else if (len >= LARGE_SIZE) {
+    // the return value is the length the complete output needs
+    big.resize(static_cast<size_t>(len) + 1);
+    vsnprintf(big.data(), big.size(), fmt, ap_retry);
+    text = big.c_str();
+  }
+  va_end(ap_retry);
+
+  size_t i = m_vec_index(name);
+  if (ns) {
+    m_ss << ns << "." << text;
+  } else
+    m_ss << text;
+
+  m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str()));
+  // m_ss is only scratch space here; empty it for the next value
+  m_ss.clear();
+  m_ss.str("");
+}
+
+/// Remember @p name and hand out the scratch stream.  Whatever the caller
+/// writes to it is stored as a string cell by the next
+/// finish_pending_string().
+std::ostream& TableFormatter::dump_stream(std::string_view name)
+{
+  finish_pending_string();
+  // we don't support this
+  m_pending_name = name;
+  return m_ss;
+}
+
+/// Always 0 for the table formatter.
+int TableFormatter::get_len() const
+{
+  // we don't know the size until flush is called
+  return 0;
+}
+
+/// Ignored by the table formatter.
+void TableFormatter::write_raw_data(const char *data) {
+  // not supported
+}
+
+/// Render @p attrs as ` name="value"` pairs (each with a leading space) into
+/// @p attrs_str.  Names and values are written verbatim.
+void TableFormatter::get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str)
+{
+  std::stringstream attrs_ss;
+  for (const auto& [attr_name, attr_value] : attrs->attrs) {
+    attrs_ss << " " << attr_name << "=" << "\"" << attr_value << "\"";
+  }
+  attrs_str = attrs_ss.str();
+}
+
+/// If dump_stream() left a value pending, store the text written to the
+/// scratch stream as a string cell under the remembered name.
+void TableFormatter::finish_pending_string()
+{
+  if (m_pending_name.length()) {
+    std::string ss = m_ss.str();
+    m_ss.clear();
+    m_ss.str("");
+    // clear the pending name before calling dump_string(), which calls back
+    // into this function
+    std::string pending_name = m_pending_name;
+    m_pending_name = "";
+    dump_string(pending_name.c_str(), ss);
+  }
+}
+}
+
diff --git a/src/common/Formatter.h b/src/common/Formatter.h
new file mode 100644
index 000000000..1919b018a
--- /dev/null
+++ b/src/common/Formatter.h
@@ -0,0 +1,326 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_FORMATTER_H
+#define CEPH_FORMATTER_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+
+#include <deque>
+#include <list>
+#include <memory>
+#include <vector>
+#include <stdarg.h>
+#include <sstream>
+#include <map>
+
+namespace ceph {
+
+  // Ordered list of (name, value) attribute pairs that can be passed to the
+  // *_with_attrs() formatter calls.
+  struct FormatterAttrs {
+    std::list< std::pair<std::string, std::string> > attrs;
+
+    // Variadic constructor, defined out of line.
+    // NOTE(review): presumably takes alternating name/value C strings ending
+    // with a NULL sentinel -- confirm against the definition.
+    FormatterAttrs(const char *attr, ...);
+  };
+
+  // Abstract interface for emitting structured output (nested object/array
+  // sections containing named scalar values).  Concrete formatters implement
+  // the pure virtual calls below.
+  class Formatter {
+  public:
+    // Scope guard: opens an object section on construction and closes the
+    // current section on destruction.
+    class ObjectSection {
+      Formatter& formatter;
+
+    public:
+      ObjectSection(Formatter& f, std::string_view name) : formatter(f) {
+        formatter.open_object_section(name);
+      }
+      ObjectSection(Formatter& f, std::string_view name, const char *ns) : formatter(f) {
+        formatter.open_object_section_in_ns(name, ns);
+      }
+      ~ObjectSection() {
+        formatter.close_section();
+      }
+    };
+    // Scope guard: opens an array section on construction and closes the
+    // current section on destruction.
+    class ArraySection {
+      Formatter& formatter;
+
+    public:
+      ArraySection(Formatter& f, std::string_view name) : formatter(f) {
+        formatter.open_array_section(name);
+      }
+      ArraySection(Formatter& f, std::string_view name, const char *ns) : formatter(f) {
+        formatter.open_array_section_in_ns(name, ns);
+      }
+      ~ArraySection() {
+        formatter.close_section();
+      }
+    };
+
+    // Factory; the three-argument form is defined out of line.  Returns a
+    // raw pointer; see create_unique() for a unique_ptr-returning wrapper.
+    static Formatter *create(std::string_view type,
+			     std::string_view default_type,
+			     std::string_view fallback);
+    // Same as above with an empty fallback.
+    static Formatter *create(std::string_view type,
+			     std::string_view default_type) {
+      return create(type, default_type, "");
+    }
+    // Same as above with default_type "json-pretty" and an empty fallback.
+    static Formatter *create(std::string_view type) {
+      return create(type, "json-pretty", "");
+    }
+    // Forwards its arguments to create() and wraps the result in a
+    // std::unique_ptr.
+    template <typename... Params>
+    static std::unique_ptr<Formatter> create_unique(Params &&...params)
+    {
+      return std::unique_ptr<Formatter>(
+	  Formatter::create(std::forward<Params>(params)...));
+    }
+
+    Formatter();
+    virtual ~Formatter();
+
+    virtual void enable_line_break() = 0;
+    virtual void flush(std::ostream& os) = 0;
+    // Non-virtual overload taking a bufferlist; defined out of line.
+    void flush(bufferlist &bl);
+    virtual void reset() = 0;
+
+    virtual void set_status(int status, const char* status_name) = 0;
+    virtual void output_header() = 0;
+    virtual void output_footer() = 0;
+
+    // Section nesting: every open_*_section* call is matched by one
+    // close_section() (see the ObjectSection/ArraySection guards above).
+    virtual void open_array_section(std::string_view name) = 0;
+    virtual void open_array_section_in_ns(std::string_view name, const char *ns) = 0;
+    virtual void open_object_section(std::string_view name) = 0;
+    virtual void open_object_section_in_ns(std::string_view name, const char *ns) = 0;
+    virtual void close_section() = 0;
+    // Scalar value emitters.
+    virtual void dump_null(std::string_view name) = 0;
+    virtual void dump_unsigned(std::string_view name, uint64_t u) = 0;
+    virtual void dump_int(std::string_view name, int64_t s) = 0;
+    virtual void dump_float(std::string_view name, double d) = 0;
+    virtual void dump_string(std::string_view name, std::string_view s) = 0;
+    // Default: emits the unquoted text "true" or "false".
+    virtual void dump_bool(std::string_view name, bool b)
+    {
+      dump_format_unquoted(name, "%s", (b ? "true" : "false"));
+    }
+    // Wraps foo.dump(this) in an object section named `name`; T must
+    // provide a const dump(Formatter*) member.
+    template<typename T>
+    void dump_object(std::string_view name, const T& foo) {
+      open_object_section(name);
+      foo.dump(this);
+      close_section();
+    }
+    virtual std::ostream& dump_stream(std::string_view name) = 0;
+    // printf-style emitters; dump_format_va is the va_list primitive the
+    // concrete formatters implement.
+    virtual void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) = 0;
+    virtual void dump_format(std::string_view name, const char *fmt, ...);
+    virtual void dump_format_ns(std::string_view name, const char *ns, const char *fmt, ...);
+    virtual void dump_format_unquoted(std::string_view name, const char *fmt, ...);
+    virtual int get_len() const = 0;
+    virtual void write_raw_data(const char *data) = 0;
+    /* with attrs */
+    // The default implementations below ignore `attrs` and fall back to the
+    // attribute-less call.
+    virtual void open_array_section_with_attrs(std::string_view name, const FormatterAttrs& attrs)
+    {
+      open_array_section(name);
+    }
+    virtual void open_object_section_with_attrs(std::string_view name, const FormatterAttrs& attrs)
+    {
+      open_object_section(name);
+    }
+    virtual void dump_string_with_attrs(std::string_view name, std::string_view s, const FormatterAttrs& attrs)
+    {
+      dump_string(name, s);
+    }
+
+    // Default: no external feature handlers; returns nullptr for any name.
+    virtual void *get_external_feature_handler(const std::string& feature) {
+      return nullptr;
+    }
+    virtual void write_bin_data(const char* buff, int buf_len);
+  };
+
+  // A std::stringstream that can be copied.  Copy construction and copy
+  // assignment transfer only the buffered text (via str()).
+  class copyable_sstream : public std::stringstream {
+  public:
+    copyable_sstream() = default;
+    copyable_sstream(const copyable_sstream& other) : std::stringstream() {
+      this->str(other.str());
+    }
+    copyable_sstream& operator=(const copyable_sstream& other) {
+      this->str(other.str());
+      return *this;
+    }
+  };
+
+  // Formatter implementation producing JSON.
+  class JSONFormatter : public Formatter {
+  public:
+    // NOTE(review): `p` presumably selects pretty-printing (cf. m_pretty) --
+    // confirm against the constructor definition.
+    explicit JSONFormatter(bool p = false);
+
+    // Status, header and footer are no-ops for JSON output.
+    void set_status(int status, const char* status_name) override {};
+    void output_header() override {};
+    void output_footer() override {};
+    void enable_line_break() override { m_line_break_enabled = true; }
+    void flush(std::ostream& os) override;
+    using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
+    void reset() override;
+    void open_array_section(std::string_view name) override;
+    void open_array_section_in_ns(std::string_view name, const char *ns) override;
+    void open_object_section(std::string_view name) override;
+    void open_object_section_in_ns(std::string_view name, const char *ns) override;
+    void close_section() override;
+    void dump_null(std::string_view name) override;
+    void dump_unsigned(std::string_view name, uint64_t u) override;
+    void dump_int(std::string_view name, int64_t s) override;
+    void dump_float(std::string_view name, double d) override;
+    void dump_string(std::string_view name, std::string_view s) override;
+    std::ostream& dump_stream(std::string_view name) override;
+    void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+    int get_len() const override;
+    void write_raw_data(const char *data) override;
+
+  protected:
+    // Overridable hooks for subclasses.  Each default returns false,
+    // meaning "handling not done".
+    virtual bool handle_value(std::string_view name, std::string_view s, bool quoted) {
+      return false; /* is handling done? */
+    }
+
+    virtual bool handle_open_section(std::string_view name, const char *ns, bool is_array) {
+      return false; /* is handling done? */
+    }
+
+    virtual bool handle_close_section() {
+      return false; /* is handling done? */
+    }
+
+    // Number of entries currently on the section stack.
+    int stack_size() { return m_stack.size(); }
+
+  private:
+
+    // Per-section bookkeeping kept on m_stack.
+    struct json_formatter_stack_entry_d {
+      int size;
+      bool is_array;
+      json_formatter_stack_entry_d() : size(0), is_array(false) { }
+    };
+
+    bool m_pretty;
+    void open_section(std::string_view name, const char *ns, bool is_array);
+    void print_quoted_string(std::string_view s);
+    void print_name(std::string_view name);
+    void print_comma(json_formatter_stack_entry_d& entry);
+    void finish_pending_string();
+
+    template <class T>
+    void add_value(std::string_view name, T val);
+    void add_value(std::string_view name, std::string_view val, bool quoted);
+
+    // copyable_sstream members keep the formatter itself copyable.
+    copyable_sstream m_ss;
+    copyable_sstream m_pending_string;
+    std::string m_pending_name;
+    std::list<json_formatter_stack_entry_d> m_stack;
+    bool m_is_pending_string;
+    bool m_line_break_enabled = false;
+  };
+
+  // Namespace-scope declaration of a function template named add_value.
+  // NOTE(review): the formatter classes declare their own private add_value
+  // members; this free declaration may be unused -- confirm before relying
+  // on it.
+  template <class T>
+  void add_value(std::string_view name, T val);
+
+  // Formatter implementation producing XML.  Also serves as the base class
+  // of HTMLFormatter.
+  class XMLFormatter : public Formatter {
+  public:
+    static const char *XML_1_DTD;
+    // The three flags are stored in m_pretty, m_lowercased and
+    // m_underscored.
+    XMLFormatter(bool pretty = false, bool lowercased = false, bool underscored = true);
+
+    // Status is ignored for plain XML output.
+    void set_status(int status, const char* status_name) override {}
+    void output_header() override;
+    void output_footer() override;
+
+    void enable_line_break() override { m_line_break_enabled = true; }
+    void flush(std::ostream& os) override;
+    using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
+    void reset() override;
+    void open_array_section(std::string_view name) override;
+    void open_array_section_in_ns(std::string_view name, const char *ns) override;
+    void open_object_section(std::string_view name) override;
+    void open_object_section_in_ns(std::string_view name, const char *ns) override;
+    void close_section() override;
+    void dump_null(std::string_view name) override;
+    void dump_unsigned(std::string_view name, uint64_t u) override;
+    void dump_int(std::string_view name, int64_t s) override;
+    void dump_float(std::string_view name, double d) override;
+    void dump_string(std::string_view name, std::string_view s) override;
+    std::ostream& dump_stream(std::string_view name) override;
+    void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+    int get_len() const override;
+    void write_raw_data(const char *data) override;
+    void write_bin_data(const char* buff, int len) override;
+
+    /* with attrs */
+    void open_array_section_with_attrs(std::string_view name, const FormatterAttrs& attrs) override;
+    void open_object_section_with_attrs(std::string_view name, const FormatterAttrs& attrs) override;
+    void dump_string_with_attrs(std::string_view name, std::string_view s, const FormatterAttrs& attrs) override;
+
+  protected:
+    // Helpers and state are protected so derived formatters can use them.
+    void open_section_in_ns(std::string_view name, const char *ns, const FormatterAttrs *attrs);
+    void finish_pending_string();
+    void print_spaces();
+    void get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str);
+    char to_lower_underscore(char c) const;
+    std::string get_xml_name(std::string_view name) const;
+
+    // m_ss accumulates the output; m_pending_string is the stream handed
+    // out by dump_stream().
+    std::stringstream m_ss, m_pending_string;
+    std::deque<std::string> m_sections;
+    const bool m_pretty;
+    const bool m_lowercased;
+    const bool m_underscored;
+    std::string m_pending_string_name;
+    bool m_header_done;
+    bool m_line_break_enabled = false;
+  private:
+    template <class T>
+    void add_value(std::string_view name, T val);
+  };
+
+  // Formatter implementation that collects (name, value) string pairs and
+  // renders them when flush() is called.
+  class TableFormatter : public Formatter {
+  public:
+    // `keyval` is stored in m_keyval.
+    // NOTE(review): presumably selects key=value output instead of a table
+    // -- confirm against flush().
+    explicit TableFormatter(bool keyval = false);
+
+    // Status, header, footer and line-break control are no-ops here.
+    void set_status(int status, const char* status_name) override {};
+    void output_header() override {};
+    void output_footer() override {};
+    void enable_line_break() override {};
+    void flush(std::ostream& os) override;
+    using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
+    void reset() override;
+    void open_array_section(std::string_view name) override;
+    void open_array_section_in_ns(std::string_view name, const char *ns) override;
+    void open_object_section(std::string_view name) override;
+    void open_object_section_in_ns(std::string_view name, const char *ns) override;
+
+    void open_array_section_with_attrs(std::string_view name, const FormatterAttrs& attrs) override;
+    void open_object_section_with_attrs(std::string_view name, const FormatterAttrs& attrs) override;
+
+    void close_section() override;
+    void dump_null(std::string_view name) override;
+    void dump_unsigned(std::string_view name, uint64_t u) override;
+    void dump_int(std::string_view name, int64_t s) override;
+    void dump_float(std::string_view name, double d) override;
+    void dump_string(std::string_view name, std::string_view s) override;
+    void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+    void dump_string_with_attrs(std::string_view name, std::string_view s, const FormatterAttrs& attrs) override;
+    std::ostream& dump_stream(std::string_view name) override;
+
+    // get_len() always returns 0 and write_raw_data() is a no-op (see the
+    // definitions).
+    int get_len() const override;
+    void write_raw_data(const char *data) override;
+    // Renders `attrs` as a sequence of ` key="value"` into attrs_str.
+    void get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str);
+
+  private:
+    template <class T>
+    void add_value(std::string_view name, T val);
+    void open_section_in_ns(std::string_view name, const char *ns, const FormatterAttrs *attrs);
+    // Collected output: each inner vector holds (name, value) pairs.
+    std::vector< std::vector<std::pair<std::string, std::string> > > m_vec;
+    // Scratch stream used to build one value at a time.
+    std::stringstream m_ss;
+    size_t m_vec_index(std::string_view name);
+    std::string get_section_name(std::string_view name);
+    void finish_pending_string();
+    // Name given to the last dump_stream() call; empty when nothing is
+    // pending.
+    std::string m_pending_name;
+    bool m_keyval;
+
+    int m_section_open;
+    std::vector< std::string > m_section;
+    std::map<std::string, int> m_section_cnt;
+    std::vector<size_t> m_column_size;
+    std::vector< std::string > m_column_name;
+  };
+
+  // String conversion helpers for signed / unsigned integers, defined out of
+  // line.
+  // NOTE(review): presumably render `num` as a fixed-point decimal with
+  // `scale` fractional digits -- confirm against the definitions.
+  std::string fixed_to_string(int64_t num, int scale);
+  std::string fixed_u_to_string(uint64_t num, int scale);
+}
+#endif
+
diff --git a/src/common/Graylog.cc b/src/common/Graylog.cc
new file mode 100644
index 000000000..cbd63fab2
--- /dev/null
+++ b/src/common/Graylog.cc
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Graylog.h"
+#include "common/Formatter.h"
+#include "common/LogEntry.h"
+#include "log/Entry.h"
+#include "log/SubsystemMap.h"
+
+using std::cerr;
+
+namespace ceph::logging {
+
+// Build a Graylog sink.  `s` may be null (the single-argument constructor
+// passes nullptr), in which case log_entry() omits the subsystem name.
+// `logger` is stored and emitted as the "_logger" field of every message.
+Graylog::Graylog(const SubsystemMap * const s, const std::string &logger)
+    : m_subs(s),
+      // `logger` is a const reference, so std::move() could only ever copy;
+      // copy explicitly rather than suggest a move that cannot happen.
+      m_logger(logger),
+      m_ostream_compressed(std::stringstream::in |
+                           std::stringstream::out |
+                           std::stringstream::binary)
+{
+  // Two JSON formatters: one for the message itself, one for the nested
+  // section rendered by log_log_entry().
+  m_formatter = std::unique_ptr<Formatter>(Formatter::create("json"));
+  m_formatter_section = std::unique_ptr<Formatter>(Formatter::create("json"));
+}
+
+// Convenience constructor: a Graylog sink without a SubsystemMap.
+Graylog::Graylog(const std::string &logger) : Graylog(nullptr, logger)
+{
+}
+
+// Nothing to release explicitly; all members clean up after themselves.
+Graylog::~Graylog() = default;
+
+// Resolve host:port to a UDP endpoint.  On success the endpoint is stored
+// and logging is enabled; on a resolver error the problem is reported on
+// stderr and logging is disabled.
+void Graylog::set_destination(const std::string& host, int port)
+{
+  bool resolved = false;
+  try {
+    boost::asio::ip::udp::resolver name_resolver(m_io_service);
+    boost::asio::ip::udp::resolver::query lookup(host, std::to_string(port));
+    m_endpoint = *name_resolver.resolve(lookup);
+    resolved = true;
+  } catch (boost::system::system_error const& err) {
+    cerr << "Error resolving graylog destination: " << err.what() << std::endl;
+  }
+  m_log_dst_valid = resolved;
+}
+
+// Set the value emitted as the "host" field.  `host` must not be empty.
+void Graylog::set_hostname(const std::string& host)
+{
+  assert(!host.empty());
+  m_hostname.assign(host);
+}
+
+// Store the textual form of `fsid`, emitted as the "_fsid" field.
+void Graylog::set_fsid(const uuid_d& fsid)
+{
+  // Zero-initialized scratch buffer that fsid.print() writes into; the
+  // result is read back as a NUL-terminated C string.
+  std::vector<char> text(40);
+  fsid.print(text.data());
+  m_fsid = text.data();
+}
+
+// Serialize a log Entry as a GELF 1.1 JSON object, compress it with the
+// zlib filter and send it to the configured endpoint as one UDP datagram.
+// Does nothing until set_destination() has resolved a destination.  Send
+// errors are reported on stderr and otherwise ignored.
+void Graylog::log_entry(const Entry& e)
+{
+  if (m_log_dst_valid) {
+    auto s = e.strv();
+
+    m_formatter->open_object_section("");
+    m_formatter->dump_string("version", "1.1");
+    m_formatter->dump_string("host", m_hostname);
+    m_formatter->dump_string("short_message", s);
+    m_formatter->dump_string("_app", "ceph");
+    auto t = ceph::logging::log_clock::to_timeval(e.m_stamp);
+    m_formatter->dump_float("timestamp", t.tv_sec + (t.tv_usec / 1000000.0));
+    m_formatter->dump_unsigned("_thread", (uint64_t)e.m_thread);
+    m_formatter->dump_int("_level", e.m_prio);
+    // Only the subsystem *name* depends on having a SubsystemMap; the id is
+    // always emitted.  Braces make that scope explicit.
+    if (m_subs != NULL) {
+      m_formatter->dump_string("_subsys_name", m_subs->get_name(e.m_subsys));
+    }
+    m_formatter->dump_int("_subsys_id", e.m_subsys);
+    m_formatter->dump_string("_fsid", m_fsid);
+    m_formatter->dump_string("_logger", m_logger);
+    m_formatter->close_section();
+
+    // Start from an empty output buffer.
+    m_ostream_compressed.clear();
+    m_ostream_compressed.str("");
+
+    // Rebuild the filter chain: compressor first, then the buffer that
+    // receives the compressed bytes.
+    m_ostream.reset();
+
+    m_ostream.push(m_compressor);
+    m_ostream.push(m_ostream_compressed);
+
+    m_formatter->flush(m_ostream);
+    m_ostream << std::endl;
+
+    // Tear the chain down again so the compressed data is complete in
+    // m_ostream_compressed before it is sent.
+    m_ostream.reset();
+
+    try {
+      boost::asio::ip::udp::socket socket(m_io_service);
+      socket.open(m_endpoint.protocol());
+      socket.send_to(boost::asio::buffer(m_ostream_compressed.str()), m_endpoint);
+    } catch (boost::system::system_error const& err) {
+      // Named `err` so it does not shadow the `e` parameter.
+      cerr << "Error sending graylog message: " << err.what() << std::endl;
+    }
+  }
+}
+
+// Serialize a cluster LogEntry as a GELF 1.1 JSON object, compress it with
+// the zlib filter and send it to the configured endpoint as one UDP
+// datagram.  Does nothing until set_destination() has resolved a
+// destination.  Send errors are reported on stderr and otherwise ignored.
+void Graylog::log_log_entry(LogEntry const * const e)
+{
+  if (!m_log_dst_valid) {
+    return;
+  }
+
+  m_formatter->open_object_section("");
+  m_formatter->dump_string("version", "1.1");
+  m_formatter->dump_string("host", m_hostname);
+  m_formatter->dump_string("short_message", e->msg);
+  m_formatter->dump_float("timestamp", e->stamp.sec() + (e->stamp.usec() / 1000000.0));
+  m_formatter->dump_string("_app", "ceph");
+
+  m_formatter->dump_string("name", e->name.to_str());
+
+  // Render rank and addrs with the secondary formatter; the resulting JSON
+  // text is embedded below as the "_who" string.
+  Formatter *section = m_formatter_section.get();
+  section->open_object_section("rank");
+  e->rank.dump(section);
+  section->close_section();
+
+  section->open_object_section("addrs");
+  e->addrs.dump(section);
+  section->close_section();
+
+  m_ostream_section.clear();
+  m_ostream_section.str("");
+  section->flush(m_ostream_section);
+  m_formatter->dump_string("_who", m_ostream_section.str());
+
+  m_formatter->dump_int("_seq", e->seq);
+  m_formatter->dump_string("_prio", clog_type_to_string(e->prio));
+  m_formatter->dump_string("_channel", e->channel);
+  m_formatter->dump_string("_fsid", m_fsid);
+  m_formatter->dump_string("_logger", m_logger);
+  m_formatter->close_section();
+
+  // Start from an empty output buffer and rebuild the filter chain:
+  // compressor first, then the buffer receiving the compressed bytes.
+  m_ostream_compressed.clear();
+  m_ostream_compressed.str("");
+
+  m_ostream.reset();
+
+  m_ostream.push(m_compressor);
+  m_ostream.push(m_ostream_compressed);
+
+  m_formatter->flush(m_ostream);
+  m_ostream << std::endl;
+
+  // Tear the chain down so the compressed data is complete before sending.
+  m_ostream.reset();
+
+  try {
+    boost::asio::ip::udp::socket sock(m_io_service);
+    sock.open(m_endpoint.protocol());
+    sock.send_to(boost::asio::buffer(m_ostream_compressed.str()), m_endpoint);
+  } catch (boost::system::system_error const& err) {
+    cerr << "Error sending graylog message: " << err.what() << std::endl;
+  }
+}
+
+} // name ceph::logging
diff --git a/src/common/Graylog.h b/src/common/Graylog.h
new file mode 100644
index 000000000..c8c501319
--- /dev/null
+++ b/src/common/Graylog.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_LOG_GRAYLOG_H
+#define __CEPH_LOG_GRAYLOG_H
+
+#include <boost/asio.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
+
+#include "include/ceph_assert.h"  // boost clobbers this
+
+struct uuid_d;
+class LogEntry;
+
+namespace ceph {
+
+class Formatter;
+
+namespace logging {
+
+class Entry;
+class SubsystemMap;
+
+// Graylog logging backend: Convert log datastructures (LogEntry, Entry) to
+// GELF (http://www.graylog2.org/resources/gelf/specification) and send it
+// to a GELF UDP receiver
+
+class Graylog
+{
+ public:
+
+  /**
+   * Create Graylog with SubsystemMap. log_entry will resolve the subsystem
+   * id to string. Logging will not be ready until set_destination is called
+   * @param s SubsystemMap
+   * @param logger Value for key "_logger" in GELF
+   */
+  Graylog(const SubsystemMap * const s, const std::string &logger);
+
+  /**
+   * Create Graylog without SubsystemMap. Logging will not be ready
+   * until set_destination is called
+   * @param logger Value for key "_logger" in GELF
+   */
+  explicit Graylog(const std::string &logger);
+  virtual ~Graylog();
+
+  // Values emitted as the "host" and "_fsid" fields of every message.
+  void set_hostname(const std::string& host);
+  void set_fsid(const uuid_d& fsid);
+
+  // Resolve the UDP receiver; logging is enabled only if this succeeds.
+  void set_destination(const std::string& host, int port);
+
+  // Send one message; both are no-ops while no destination is set.
+  void log_entry(const Entry& e);
+  void log_log_entry(LogEntry const * const e);
+
+  typedef std::shared_ptr<Graylog> Ref;
+
+ private:
+  // May be null; then no subsystem name is emitted.
+  SubsystemMap const * const m_subs;
+
+  // True once set_destination() has resolved an endpoint.
+  bool m_log_dst_valid = false;
+
+  std::string m_hostname;
+  std::string m_fsid;
+  std::string m_logger;
+
+  boost::asio::ip::udp::endpoint m_endpoint;
+  boost::asio::io_service m_io_service;
+
+  // m_formatter builds the message; m_formatter_section renders the nested
+  // section that log_log_entry() embeds as a string.
+  std::unique_ptr<Formatter> m_formatter;
+  std::unique_ptr<Formatter> m_formatter_section;
+  std::stringstream m_ostream_section;
+  // Receives the compressed message bytes that are sent over UDP.
+  std::stringstream m_ostream_compressed;
+  boost::iostreams::filtering_ostream m_ostream;
+  boost::iostreams::zlib_compressor m_compressor;
+
+};
+
+}
+}
+
+#endif
diff --git a/src/common/HBHandle.h b/src/common/HBHandle.h
new file mode 100644
index 000000000..a972a93c6
--- /dev/null
+++ b/src/common/HBHandle.h
@@ -0,0 +1,11 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+// Abstract handle exposing two operations on a thread-pool timeout:
+// reset and suspend.  Implementations define what each one does.
+class HBHandle {
+public:
+  virtual void reset_tp_timeout() = 0;
+  virtual void suspend_tp_timeout() = 0;
+  virtual ~HBHandle() = default;
+};
diff --git a/src/common/HTMLFormatter.cc b/src/common/HTMLFormatter.cc
new file mode 100644
index 000000000..e7e985531
--- /dev/null
+++ b/src/common/HTMLFormatter.cc
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#define LARGE_SIZE 1024
+
+#include "HTMLFormatter.h"
+#include "Formatter.h"
+
+#include <sstream>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <string.h>     // for strdup
+
+#include "common/escape.h"
+
+// -----------------------
+namespace ceph {
+
+// Start with status 0 and no status name; `pretty` is forwarded to
+// XMLFormatter.
+HTMLFormatter::HTMLFormatter(bool pretty)
+  : XMLFormatter(pretty),
+    m_status(0),
+    m_status_name(nullptr)
+{
+}
+
+// Release the strdup()'d status name, if one was set.
+HTMLFormatter::~HTMLFormatter()
+{
+  if (m_status_name != nullptr) {
+    free(const_cast<char*>(m_status_name));
+    m_status_name = nullptr;
+  }
+}
+
+// Reset the XML state, allow the header to be written again, and forget
+// the stored status code and status name.
+void HTMLFormatter::reset()
+{
+  XMLFormatter::reset();
+  m_header_done = false;
+  m_status = 0;
+  if (m_status_name != nullptr) {
+    free(const_cast<char*>(m_status_name));
+    m_status_name = nullptr;
+  }
+}
+
+// Store the status code and, when `status_name` is non-null, a private copy
+// of the name.  A null `status_name` leaves any previously stored name
+// untouched.
+void HTMLFormatter::set_status(int status, const char* status_name)
+{
+  m_status = status;
+  if (status_name == nullptr) {
+    return;
+  }
+  if (m_status_name != nullptr) {
+    free(const_cast<char*>(m_status_name));
+  }
+  m_status_name = strdup(status_name);
+}
+
+// Write the page preamble once: <html>, a <head> whose title is the status
+// line, <body>, an <h1> with the same status line, and the opening <ul>.
+// Later calls do nothing until reset() clears m_header_done.
+void HTMLFormatter::output_header() {
+  if (m_header_done) {
+    return;
+  }
+  m_header_done = true;
+
+  // Status line: "<code>" or "<code> <name>" when a name has been set.
+  std::string status_line = std::to_string(m_status);
+  if (m_status_name != nullptr) {
+    status_line += " ";
+    status_line += m_status_name;
+  }
+
+  open_object_section("html");
+  print_spaces();
+  m_ss << "<head><title>" << status_line << "</title></head>";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+
+  open_object_section("body");
+  print_spaces();
+  m_ss << "<h1>" << status_line << "</h1>";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+
+  open_object_section("ul");
+}
+
+// Shared implementation of the scalar dump_* calls: writes
+// "<li>name: arg</li>" using operator<< for `arg`, followed by a newline in
+// pretty mode.
+template <typename T>
+void HTMLFormatter::dump_template(std::string_view name, T arg)
+{
+  print_spaces();
+  m_ss << "<li>" << name << ": ";
+  m_ss << arg;
+  m_ss << "</li>";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+// Emit an unsigned integer as a list item.
+void HTMLFormatter::dump_unsigned(std::string_view name, uint64_t u)
+{
+  dump_template<uint64_t>(name, u);
+}
+
+// Emit a signed integer as a list item.
+void HTMLFormatter::dump_int(std::string_view name, int64_t u)
+{
+  dump_template<int64_t>(name, u);
+}
+
+// Emit a floating-point value as a list item, using the stream's default
+// formatting for double.
+void HTMLFormatter::dump_float(std::string_view name, double d)
+{
+  dump_template<double>(name, d);
+}
+
+// Emit a string as a list item; the value is passed through
+// xml_stream_escaper before being written.
+void HTMLFormatter::dump_string(std::string_view name, std::string_view s)
+{
+  auto escaped = xml_stream_escaper(s);
+  dump_template(name, escaped);
+}
+
+// Emit a string as a list item, with the rendered attribute list appended
+// after the escaped value inside the <li> element.
+void HTMLFormatter::dump_string_with_attrs(std::string_view name, std::string_view s, const FormatterAttrs& attrs)
+{
+  std::string rendered_attrs;
+  get_attrs_str(&attrs, rendered_attrs);
+
+  print_spaces();
+  m_ss << "<li>" << name << ": ";
+  m_ss << xml_stream_escaper(s) << rendered_attrs;
+  m_ss << "</li>";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+// Begin a streamed list item: writes "<li>name: " immediately and returns
+// the pending-string stream for the caller to fill in.  The pending element
+// name is set to "li".
+std::ostream& HTMLFormatter::dump_stream(std::string_view name)
+{
+  print_spaces();
+  m_ss << "<li>" << name << ": ";
+  m_pending_string_name = "li";
+  return m_pending_string;
+}
+
+// Emit a printf-style formatted value as "<li>name: text</li>"; the
+// formatted text is XML-escaped.  When `ns` is non-null it is written as an
+// xmlns attribute on the element.  `quoted` is not used by this formatter.
+// Output longer than LARGE_SIZE - 1 characters is truncated.
+void HTMLFormatter::dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  char buf[LARGE_SIZE];
+  const int written = vsnprintf(buf, sizeof(buf), fmt, ap);
+
+  // vsnprintf() returns the length the complete output would have had (or a
+  // negative value on error), not the number of characters stored.  Clamp
+  // it to what actually fits in buf so the view below never extends past
+  // the buffer.
+  size_t len = 0;
+  if (written > 0) {
+    len = static_cast<size_t>(written);
+    if (len >= sizeof(buf)) {
+      len = sizeof(buf) - 1;
+    }
+  }
+
+  std::string e(name);
+  print_spaces();
+  if (ns) {
+    m_ss << "<li xmlns=\"" << ns << "\">" << e << ": "
+	 << xml_stream_escaper(std::string_view(buf, len)) << "</li>";
+  } else {
+    m_ss << "<li>" << e << ": "
+	 << xml_stream_escaper(std::string_view(buf, len)) << "</li>";
+  }
+
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+} // namespace ceph
diff --git a/src/common/HTMLFormatter.h b/src/common/HTMLFormatter.h
new file mode 100644
index 000000000..cc891824b
--- /dev/null
+++ b/src/common/HTMLFormatter.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_HTML_FORMATTER_H
+#define CEPH_HTML_FORMATTER_H
+
+#include "Formatter.h"
+
+namespace ceph {
+  // XMLFormatter variant that renders values as HTML list items and writes
+  // a header built from a status code and optional status name.
+  class HTMLFormatter : public XMLFormatter {
+  public:
+    explicit HTMLFormatter(bool pretty = false);
+    ~HTMLFormatter() override;
+    void reset() override;
+
+    // Stores the status used by output_header(); a copy of status_name is
+    // kept when it is non-null.
+    void set_status(int status, const char* status_name) override;
+    void output_header() override;
+
+    void dump_unsigned(std::string_view name, uint64_t u) override;
+    void dump_int(std::string_view name, int64_t u) override;
+    void dump_float(std::string_view name, double d) override;
+    void dump_string(std::string_view name, std::string_view s) override;
+    std::ostream& dump_stream(std::string_view name) override;
+    void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+
+    /* with attrs */
+    void dump_string_with_attrs(std::string_view name, std::string_view s, const FormatterAttrs& attrs) override;
+  private:
+    // Shared "<li>name: value</li>" writer behind the scalar dump_* calls.
+    template <typename T> void dump_template(std::string_view name, T arg);
+
+    int m_status;
+    // Owned copy made with strdup() and released with free(); null when no
+    // name has been set.
+    const char* m_status_name;
+  };
+
+}
+
+#endif
diff --git a/src/common/HeartbeatMap.cc b/src/common/HeartbeatMap.cc
new file mode 100644
index 000000000..544427092
--- /dev/null
+++ b/src/common/HeartbeatMap.cc
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <utime.h>
+#include <signal.h>
+
+#include "HeartbeatMap.h"
+#include "ceph_context.h"
+#include "common/errno.h"
+#include "common/valgrind.h"
+#include "debug.h"
+
+#define dout_subsys ceph_subsys_heartbeatmap
+#undef dout_prefix
+#define dout_prefix *_dout << "heartbeat_map "
+
+using std::chrono::duration_cast;
+using std::chrono::seconds;
+using std::string;
+
+namespace ceph {
+
+// Start with no registered workers and both worker counters at zero.
+HeartbeatMap::HeartbeatMap(CephContext *cct)
+  : m_cct(cct), m_unhealthy_workers(0), m_total_workers(0)
+{
+}
+
+// Every worker must have been removed with remove_worker() before the map
+// is destroyed.
+HeartbeatMap::~HeartbeatMap()
+{
+  ceph_assert(m_workers.empty());
+}
+
+// Register a worker and return its newly allocated handle.  The handle must
+// later be passed to remove_worker(), which deletes it.
+heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id)
+{
+  std::unique_lock guard{m_rwlock};
+  ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl;
+
+  auto *handle = new heartbeat_handle_d(name);
+  ANNOTATE_BENIGN_RACE_SIZED(&handle->timeout, sizeof(handle->timeout),
+                             "heartbeat_handle_d timeout");
+  ANNOTATE_BENIGN_RACE_SIZED(&handle->suicide_timeout, sizeof(handle->suicide_timeout),
+                             "heartbeat_handle_d suicide_timeout");
+  handle->thread_id = thread_id;
+
+  // Remember the handle's own list position so remove_worker() can erase it
+  // without searching.
+  m_workers.push_front(handle);
+  handle->list_item = m_workers.begin();
+  return handle;
+}
+
+// Unregister a worker and delete its handle; `h` must not be used
+// afterwards.
+void HeartbeatMap::remove_worker(const heartbeat_handle_d *h)
+{
+  std::unique_lock guard{m_rwlock};
+  ldout(m_cct, 10) << "remove_worker '" << h->name << "'" << dendl;
+  // Erase via the iterator stored in the handle by add_worker().
+  m_workers.erase(h->list_item);
+  delete h;
+}
+
+// Check one handle against `now`.  Returns false if its grace deadline is
+// set and has passed.  If its suicide deadline is set and has passed, sends
+// SIGABRT to the worker thread, sleeps one second and then aborts the
+// process.  `who` names the caller in the log messages.
+bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who,
+			  ceph::coarse_mono_time now)
+{
+  // A zero deadline means "not armed".
+  const auto grace_deadline = h->timeout.load(std::memory_order_relaxed);
+  const bool timed_out = !clock::is_zero(grace_deadline) && grace_deadline < now;
+  if (timed_out) {
+    ldout(m_cct, 1) << who << " '" << h->name << "'"
+		    << " had timed out after " << h->grace << dendl;
+  }
+
+  const auto suicide_deadline = h->suicide_timeout.load(std::memory_order_relaxed);
+  if (!clock::is_zero(suicide_deadline) && suicide_deadline < now) {
+    ldout(m_cct, 1) << who << " '" << h->name << "'"
+		    << " had suicide timed out after " << h->suicide_grace << dendl;
+    pthread_kill(h->thread_id, SIGABRT);
+    sleep(1);
+    ceph_abort_msg("hit suicide timeout");
+  }
+  return !timed_out;
+}
+
+// Check the handle against its current deadlines, then re-arm it: the grace
+// deadline becomes now + grace, and the suicide deadline becomes
+// now + suicide_grace, or is disarmed when suicide_grace is not positive.
+void HeartbeatMap::reset_timeout(heartbeat_handle_d *h,
+				 ceph::timespan grace,
+				 ceph::timespan suicide_grace)
+{
+  ldout(m_cct, 20) << "reset_timeout '" << h->name << "' grace " << grace
+		   << " suicide " << suicide_grace << dendl;
+  const auto now = clock::now();
+  // Report (or act on) an already expired deadline before overwriting it.
+  _check(h, "reset_timeout", now);
+
+  h->timeout.store(now + grace, std::memory_order_relaxed);
+  h->grace = grace;
+
+  if (suicide_grace <= ceph::timespan::zero()) {
+    h->suicide_timeout.store(clock::zero(), std::memory_order_relaxed);
+  } else {
+    h->suicide_timeout.store(now + suicide_grace, std::memory_order_relaxed);
+  }
+  h->suicide_grace = suicide_grace;
+}
+
+// Check the handle against its current deadlines, then disarm both of them
+// by storing the zero time.
+void HeartbeatMap::clear_timeout(heartbeat_handle_d *h)
+{
+  ldout(m_cct, 20) << "clear_timeout '" << h->name << "'" << dendl;
+  _check(h, "clear_timeout", clock::now());
+
+  const auto disarmed = clock::zero();
+  h->timeout.store(disarmed, std::memory_order_relaxed);
+  h->suicide_timeout.store(disarmed, std::memory_order_relaxed);
+}
+
+// Check every registered worker and return true only if none has timed out
+// and no injected failure window is active.  Also refreshes the counters
+// reported by get_unhealthy_workers() and get_total_workers().
+bool HeartbeatMap::is_healthy()
+{
+  int unhealthy = 0;
+  int total = 0;
+  bool healthy = true;
+
+  m_rwlock.lock_shared();
+  const auto now = ceph::coarse_mono_clock::now();
+
+  // heartbeat_inject_failure = N starts an N-second window during which
+  // this method reports unhealthy; the option is reset to 0 once consumed.
+  if (m_cct->_conf->heartbeat_inject_failure) {
+    ldout(m_cct, 0) << "is_healthy injecting failure for next " << m_cct->_conf->heartbeat_inject_failure << " seconds" << dendl;
+    m_inject_unhealthy_until = now + std::chrono::seconds(m_cct->_conf->heartbeat_inject_failure);
+    m_cct->_conf.set_val("heartbeat_inject_failure", "0");
+  }
+
+  if (now < m_inject_unhealthy_until) {
+    const auto sec = std::chrono::duration_cast<std::chrono::seconds>(m_inject_unhealthy_until - now).count();
+    ldout(m_cct, 0) << "is_healthy = false, injected failure for next "
+                    << sec << " seconds" << dendl;
+    healthy = false;
+  }
+
+  for (heartbeat_handle_d *h : m_workers) {
+    if (!_check(h, "is_healthy", now)) {
+      healthy = false;
+      ++unhealthy;
+    }
+    ++total;
+  }
+  m_rwlock.unlock_shared();
+
+  // The counters are published after the lock has been released.
+  m_unhealthy_workers = unhealthy;
+  m_total_workers = total;
+
+  ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY")
+    << ", total workers: " << total << ", number of unhealthy: " << unhealthy << dendl;
+  return healthy;
+}
+
+// Number of workers found timed out by the most recent is_healthy() call.
+int HeartbeatMap::get_unhealthy_workers() const
+{
+  const int unhealthy = m_unhealthy_workers;
+  return unhealthy;
+}
+
+// Number of workers examined by the most recent is_healthy() call.
+int HeartbeatMap::get_total_workers() const
+{
+  const int total = m_total_workers;
+  return total;
+}
+
+// If a heartbeat file is configured and the map is currently healthy,
+// create the file if needed and update its timestamps.  A failure to open
+// the file is logged; nothing is reported to the caller.
+void HeartbeatMap::check_touch_file()
+{
+  const string path = m_cct->_conf->heartbeat_file;
+  // is_healthy() is only evaluated when a path is configured.
+  if (path.empty() || !is_healthy()) {
+    return;
+  }
+
+  const int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_CLOEXEC, 0644);
+  if (fd < 0) {
+    ldout(m_cct, 0) << "unable to touch " << path << ": "
+                   << cpp_strerror(errno) << dendl;
+    return;
+  }
+
+  // A null times argument sets the timestamps to the current time.
+  ::utime(path.c_str(), NULL);
+  ::close(fd);
+}
+
+}
diff --git a/src/common/HeartbeatMap.h b/src/common/HeartbeatMap.h
new file mode 100644
index 000000000..6f486b21c
--- /dev/null
+++ b/src/common/HeartbeatMap.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_HEARTBEATMAP_H
+#define CEPH_HEARTBEATMAP_H
+
+#include <list>
+#include <atomic>
+#include <string>
+#include <pthread.h>
+
+#include "common/ceph_time.h"
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+
+namespace ceph {
+
+/*
+ * HeartbeatMap -
+ *
+ * Maintain a set of handles for internal subsystems to periodically
+ * check in with a health check and timeout.  Each user can register
+ * and get a handle they can use to set or reset a timeout.  
+ *
+ * A simple is_healthy() method checks for any users who are not within
+ * their grace period for a heartbeat.
+ */
+
+// Per-worker heartbeat record.  HeartbeatMap hands out pointers to these
+// from add_worker() and keeps them in a std::list.
+struct heartbeat_handle_d {
+  // Worker name; fixed at construction.
+  const std::string name;
+  pthread_t thread_id = 0;
+  using clock = ceph::coarse_mono_clock;
+  using time = ceph::coarse_mono_time;
+  // Both deadlines start at clock::zero().
+  // NOTE(review): zero presumably means "no deadline armed" -- confirm
+  // against HeartbeatMap.cc.
+  std::atomic<time> timeout = clock::zero();
+  std::atomic<time> suicide_timeout = clock::zero();
+  // Grace periods, zero until set.
+  ceph::timespan grace = ceph::timespan::zero();
+  ceph::timespan suicide_grace = ceph::timespan::zero();
+  // Iterator into a std::list<heartbeat_handle_d*>.
+  // NOTE(review): presumably this handle's position in
+  // HeartbeatMap::m_workers -- confirm in add_worker()/remove_worker().
+  std::list<heartbeat_handle_d*>::iterator list_item;
+
+  // Only the name is taken from the caller; everything else keeps its
+  // in-class default.
+  explicit heartbeat_handle_d(const std::string& n)
+    : name(n)
+  { }
+};
+
+// Registry of heartbeat handles plus the health check over them.
+class HeartbeatMap {
+ public:
+  // register/unregister
+  heartbeat_handle_d *add_worker(const std::string& name, pthread_t thread_id);
+  void remove_worker(const heartbeat_handle_d *h);
+
+  // reset the timeout so that it expects another touch within grace amount of time
+  void reset_timeout(heartbeat_handle_d *h,
+		     ceph::timespan grace,
+		     ceph::timespan suicide_grace);
+  // clear the timeout so that it's not checked on
+  void clear_timeout(heartbeat_handle_d *h);
+
+  // return false if any of the timeouts are currently expired.
+  // Also refreshes the counters returned by get_unhealthy_workers() and
+  // get_total_workers().
+  bool is_healthy();
+
+  // touch cct->_conf->heartbeat_file if is_healthy()
+  void check_touch_file();
+
+  // get the number of unhealthy workers
+  // (as recorded by the most recent is_healthy() store)
+  int get_unhealthy_workers() const;
+
+  // get the number of total workers
+  // (as recorded by the most recent is_healthy() store)
+  int get_total_workers() const;
+
+  explicit HeartbeatMap(CephContext *cct);
+  ~HeartbeatMap();
+
+ private:
+  using clock = ceph::coarse_mono_clock;
+  CephContext *m_cct;
+  // is_healthy() takes this in shared mode while it walks m_workers.
+  ceph::shared_mutex m_rwlock =
+    ceph::make_shared_mutex("HeartbeatMap::m_rwlock");
+  // While "now" is earlier than this point, is_healthy() reports unhealthy
+  // (set from the heartbeat_inject_failure config option).
+  clock::time_point m_inject_unhealthy_until;
+  std::list<heartbeat_handle_d*> m_workers;
+  // Counters written at the end of is_healthy().
+  std::atomic<unsigned> m_unhealthy_workers = { 0 };
+  std::atomic<unsigned> m_total_workers = { 0 };
+
+  // Per-worker check used by is_healthy(); a false return counts the worker
+  // as unhealthy.  "who" is a label supplied by the caller.
+  bool _check(const heartbeat_handle_d *h, const char *who,
+	      ceph::coarse_mono_time now);
+};
+
+}
+#endif
diff --git a/src/common/Initialize.h b/src/common/Initialize.h
new file mode 100644
index 000000000..78ad5ec69
--- /dev/null
+++ b/src/common/Initialize.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+/* Copyright (c) 2011 Stanford University
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef CEPH_INITIALIZE_H
+#define CEPH_INITIALIZE_H
+
+/**
+ * This class is used to manage once-only initialization that should occur
+ * before main() is invoked, such as the creation of static variables.  It
+ * also provides a mechanism for handling dependencies (where one class
+ * needs to perform its once-only initialization before another).
+ * 
+ * The simplest way to use an Initialize object is to define a static
+ * initialization method for a class, say Foo::init().  Then, declare
+ * a static Initialize object in the class:
+ * "static Initialize initializer(Foo::init);".
+ * The result is that Foo::init will be invoked when the object is
+ * constructed (before main() is invoked).  Foo::init can create static
+ * objects and perform any other once-only initialization needed by the
+ * class.  Furthermore, if some other class needs to ensure that Foo has
+ * been initialized (e.g. as part of its own initialization) it can invoke
+ * Foo::init directly (Foo::init should contain an internal guard so that
+ * it only performs its functions once, even if invoked several times).
+ *
+ * There is also a second form of constructor for Initialize that causes a
+ * new object to be dynamically allocated and assigned to a pointer, instead
+ * of invoking a function. This form allows for the creation of static objects
+ * that are never destructed (thereby avoiding issues with the order of
+ * destruction).
+ */
+class Initialize {
+ public:
+  /**
+   * Run \p func immediately, as part of constructing this object.
+   *
+   * Declaring the Initialize object static therefore runs \p func during
+   * static initialization, ahead of main().
+   *
+   * \param func
+   *      Callback taking no arguments.  It is called once per Initialize
+   *      object built from it, so it should normally carry its own guard
+   *      against repeating its work.
+   */
+  explicit Initialize(void (*func)()) {
+    func();
+  }
+
+  /**
+   * Make sure \p p refers to an object, allocating one if necessary.
+   *
+   * \param p
+   *      Pointer to inspect.  When it is null, a T built with its
+   *      no-argument constructor is allocated with new and stored in it;
+   *      a non-null pointer is left untouched.  This class never frees
+   *      the allocation.
+   */
+  template<typename T>
+  explicit Initialize(T*& p) {
+    if (!p) {
+      p = new T;
+    }
+  }
+};
+
+#endif  // CEPH_INITIALIZE_H
diff --git a/src/common/Journald.cc b/src/common/Journald.cc
new file mode 100644
index 000000000..a1321c7ee
--- /dev/null
+++ b/src/common/Journald.cc
@@ -0,0 +1,319 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Journald.h"
+
+#include <endian.h>
+#include <fcntl.h>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <sys/un.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "include/ceph_assert.h"
+#include "common/LogEntry.h"
+#include "log/Entry.h"
+#include "log/SubsystemMap.h"
+#include "msg/msg_fmt.h"
+
+
+namespace ceph::logging {
+
+namespace {
+// Destination address used for every message this file sends: an AF_UNIX
+// address with the path /run/systemd/journal/socket.
+const struct sockaddr_un sockaddr = {
+  AF_UNIX,
+  "/run/systemd/journal/socket",
+};
+
+/**
+ * Send one file descriptor to the journal socket as SCM_RIGHTS ancillary
+ * data.  The message carries no regular payload (msg_iov is empty).
+ *
+ * @param transport_fd  socket to send on; asserted to be >= 0
+ * @param fd            descriptor handed to the receiver
+ * @returns whatever sendmsg() returns (negative with errno set on failure)
+ */
+ssize_t sendmsg_fd(int transport_fd, int fd)
+{
+  ceph_assert(transport_fd >= 0);
+
+  // The buffer is accessed through struct cmsghdr, so it must be aligned for
+  // that type, and it is sized with CMSG_SPACE() (header + payload +
+  // trailing padding) rather than CMSG_LEN().
+  alignas(struct cmsghdr) char control[CMSG_SPACE(sizeof(int))] = {};
+  struct msghdr mh = {
+    (struct sockaddr*)&sockaddr, // msg_name
+    sizeof(sockaddr),            // msg_namelen
+    nullptr,                     // msg_iov
+    0,                           // msg_iovlen
+    control,                     // msg_control
+    sizeof(control),             // msg_controllen
+  };
+
+  struct cmsghdr *cmsg = CMSG_FIRSTHDR(&mh);
+  cmsg->cmsg_level = SOL_SOCKET;
+  cmsg->cmsg_type = SCM_RIGHTS;
+  cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+  *reinterpret_cast<int *>(CMSG_DATA(cmsg)) = fd;
+
+  return sendmsg(transport_fd, &mh, MSG_NOSIGNAL);
+}
+
+// Translate a Ceph log priority into a syslog severity:
+//   negative -> LOG_ERR, 0 -> LOG_WARNING, 1..4 -> LOG_NOTICE,
+//   5..9 -> LOG_INFO, 10 and above -> LOG_DEBUG.
+char map_prio(short ceph_prio)
+{
+  int severity = LOG_DEBUG;
+  if (ceph_prio < 0) {
+    severity = LOG_ERR;
+  } else if (ceph_prio == 0) {
+    severity = LOG_WARNING;
+  } else if (ceph_prio < 5) {
+    severity = LOG_NOTICE;
+  } else if (ceph_prio < 10) {
+    severity = LOG_INFO;
+  }
+  return static_cast<char>(severity);
+}
+}
+
+namespace detail {
+// Shared state for the two encoders below: a four-element iovec array that
+// describes one journald message.
+//   [0] constant "SYSLOG_IDENTIFIER=<program>\n" segment
+//   [1] per-message metadata (filled from meta_buf by the subclass)
+//   [2] the message payload (pointed at by the subclass)
+//   [3] a single "\n"
+class EntryEncoderBase {
+ public:
+  EntryEncoderBase():
+    m_msg_vec {
+     {}, {}, {}, { (char *)"\n", 1 },
+    }
+  {
+    // Newlines in the program name are replaced so the identifier stays on
+    // one line of the "KEY=value\n" segment.
+    std::string id = program_invocation_short_name;
+    for (auto& c : id) {
+      if (c == '\n')
+        c = '_';
+    }
+    static_segment = "SYSLOG_IDENTIFIER=" + id + "\n";
+    // iovec [0] points into static_segment, which is never modified after
+    // this constructor.
+    m_msg_vec[0].iov_base = static_segment.data();
+    m_msg_vec[0].iov_len = static_segment.size();
+  }
+
+  // Raw access to the iovec array and its element count (always 4), for
+  // wiring into a msghdr.
+  constexpr struct iovec *iovec() { return this->m_msg_vec; }
+  constexpr std::size_t iovec_len()
+  {
+    return sizeof(m_msg_vec) / sizeof(m_msg_vec[0]);
+  }
+
+ private:
+  struct iovec m_msg_vec[4];
+  std::string static_segment;
+
+ protected:
+  // Scratch buffer the subclasses format their metadata into.
+  fmt::memory_buffer meta_buf;
+
+  struct iovec &meta_vec() { return m_msg_vec[1]; }
+  struct iovec &msg_vec() { return m_msg_vec[2]; }
+};
+
+// Encodes a local log Entry into the iovec layout kept by EntryEncoderBase.
+class EntryEncoder : public EntryEncoderBase {
+ public:
+  /**
+   * Fill the metadata and message iovecs for @p e.
+   *
+   * The metadata segment holds the PRIORITY, CEPH_SUBSYS, TIMESTAMP,
+   * CEPH_PRIO and THREAD fields as "KEY=value" lines, then "MESSAGE\n"
+   * followed by the payload length as a 64-bit little-endian integer.
+   * The message iovec points straight at the entry's text, so @p e must
+   * stay alive until the iovecs have been consumed.
+   *
+   * @param e  entry to encode
+   * @param s  subsystem map used to resolve e.m_subsys to a name
+   */
+  void encode(const Entry& e, const SubsystemMap *s)
+  {
+    meta_buf.clear();
+    fmt::format_to(std::back_inserter(meta_buf),
+      R"(PRIORITY={:d}
+CEPH_SUBSYS={}
+TIMESTAMP={}
+CEPH_PRIO={}
+THREAD={:016x}
+MESSAGE
+)",
+      map_prio(e.m_prio),
+      s->get_name(e.m_subsys),
+      e.m_stamp.time_since_epoch().count().count,
+      e.m_prio,
+      e.m_thread);
+
+    // Append the length byte-wise.  The end of meta_buf has no particular
+    // alignment, so storing through a uint64_t* there would be a misaligned
+    // access.
+    const uint64_t msg_len = htole64(e.size());
+    const char *len_bytes = reinterpret_cast<const char *>(&msg_len);
+    meta_buf.append(len_bytes, len_bytes + sizeof(msg_len));
+
+    meta_vec().iov_base = meta_buf.data();
+    meta_vec().iov_len = meta_buf.size();
+
+    msg_vec().iov_base = (void *)e.strv().data();
+    msg_vec().iov_len = e.size();
+  }
+};
+
+// Encodes a cluster LogEntry into the iovec layout kept by EntryEncoderBase.
+class LogEntryEncoder : public EntryEncoderBase {
+ public:
+  /**
+   * Fill the metadata and message iovecs for @p le.
+   *
+   * The metadata segment holds the PRIORITY, TIMESTAMP, CEPH_NAME,
+   * CEPH_RANK, CEPH_SEQ and CEPH_CHANNEL fields as "KEY=value" lines, then
+   * "MESSAGE\n" followed by the payload length as a 64-bit little-endian
+   * integer.  The message iovec points straight at le.msg, so @p le must
+   * stay alive until the iovecs have been consumed.
+   *
+   * @param le  cluster log entry to encode
+   */
+  void encode(const LogEntry& le)
+  {
+    meta_buf.clear();
+    fmt::format_to(std::back_inserter(meta_buf),
+      R"(PRIORITY={:d}
+TIMESTAMP={}
+CEPH_NAME={}
+CEPH_RANK={}
+CEPH_SEQ={}
+CEPH_CHANNEL={}
+MESSAGE
+)",
+      clog_type_to_syslog_level(le.prio),
+      le.stamp.to_nsec(),
+      le.name.to_str(),
+      le.rank,
+      le.seq,
+      le.channel);
+
+    // Append the length byte-wise.  The end of meta_buf has no particular
+    // alignment, so storing through a uint64_t* there would be a misaligned
+    // access.
+    const uint64_t msg_len = htole64(le.msg.size());
+    const char *len_bytes = reinterpret_cast<const char *>(&msg_len);
+    meta_buf.append(len_bytes, len_bytes + sizeof(msg_len));
+
+    meta_vec().iov_base = meta_buf.data();
+    meta_vec().iov_len = meta_buf.size();
+
+    msg_vec().iov_base = (void *)le.msg.data();
+    msg_vec().iov_len = le.msg.size();
+  }
+};
+
+// How open_mem_file() obtains the temporary file used for oversized
+// messages; chosen once by detect_mem_file_mode().
+enum class JournaldClient::MemFileMode {
+  // memfd_create()
+  MEMFD_CREATE,
+  // open() of mem_file_dir with O_TMPFILE
+  OPEN_TMPFILE,
+  // mkostemp() followed by unlink()
+  OPEN_UNLINK,  
+};
+
+// Directory passed to open() for the O_TMPFILE variant of the temporary file.
+constexpr const char *mem_file_dir = "/dev/shm";
+
+/**
+ * Probe, in order of preference, which mechanism can provide an anonymous
+ * temporary file, and record the first one that works in mem_file_mode:
+ * memfd_create(), then open() with O_TMPFILE, then mkostemp()+unlink() as
+ * the unconditional fallback.  Probe descriptors are closed again.
+ */
+void JournaldClient::detect_mem_file_mode()
+{
+  int memfd = memfd_create("ceph-journald", MFD_ALLOW_SEALING | MFD_CLOEXEC);
+  if (memfd >= 0) {
+    mem_file_mode = MemFileMode::MEMFD_CREATE;
+    close(memfd);
+    return;
+  }
+  // O_TMPFILE is only accepted together with O_WRONLY or O_RDWR; without a
+  // writable access mode the open fails with EINVAL and this probe could
+  // never succeed.
+  memfd = open(mem_file_dir, O_TMPFILE | O_RDWR | O_EXCL | O_CLOEXEC,
+               S_IRUSR | S_IWUSR);
+  if (memfd >= 0) {
+    mem_file_mode = MemFileMode::OPEN_TMPFILE;
+    close(memfd);
+    return;
+  }
+  mem_file_mode = MemFileMode::OPEN_UNLINK;
+}
+
+/**
+ * Open a fresh temporary file using the mechanism recorded in
+ * mem_file_mode.
+ *
+ * @returns the new descriptor, or a negative value with errno set by the
+ *          failing call
+ */
+int JournaldClient::open_mem_file()
+{
+  switch (mem_file_mode) {
+  case MemFileMode::MEMFD_CREATE:
+    return memfd_create("ceph-journald", MFD_ALLOW_SEALING | MFD_CLOEXEC);
+  case MemFileMode::OPEN_TMPFILE:
+    // Same flags as the probe in detect_mem_file_mode(); O_RDWR is required
+    // for O_TMPFILE and the caller writes to the descriptor.
+    return open(mem_file_dir, O_TMPFILE | O_RDWR | O_EXCL | O_CLOEXEC,
+                S_IRUSR | S_IWUSR);
+  case MemFileMode::OPEN_UNLINK:
+    {
+      char mem_file_template[] = "/dev/shm/ceph-journald-XXXXXX";
+      int fd = mkostemp(mem_file_template, O_CLOEXEC);
+      // Only remove the name when a file was actually created; after a
+      // failed mkostemp() the template does not name our file, and an
+      // unlink() there could also overwrite the errno the caller reports.
+      if (fd >= 0) {
+        unlink(mem_file_template);
+      }
+      return fd;
+    }
+  }
+  ceph_abort("Unexpected mem_file_mode");
+}
+
+/**
+ * Create the datagram socket used to talk to journald and pick the
+ * temporary-file mechanism for oversized messages.
+ *
+ * m_msghdr is pre-filled with the journal socket address; its iovec fields
+ * are left for the owner to set.  Failure to create the socket aborts via
+ * ceph_assertf.
+ */
+JournaldClient::JournaldClient() :
+  m_msghdr({
+    (struct sockaddr*)&sockaddr, // msg_name
+    sizeof(sockaddr),            // msg_namelen
+  })
+{
+  fd = socket(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0);
+  // socket() signals failure with -1; descriptor 0 is a valid result.
+  ceph_assertf(fd >= 0, "socket creation failed: %s", strerror(errno));
+
+  // Best effort: ask for a 2 MiB send buffer; the result is not checked.
+  int sendbuf = 2 * 1024 * 1024;
+  setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sendbuf, sizeof(sendbuf));
+
+  detect_mem_file_mode();
+}
+
+// Close the socket opened by the constructor.
+JournaldClient::~JournaldClient()
+{
+  close(fd);
+}
+
+/**
+ * Send the message currently described by m_msghdr to journald.
+ *
+ * The message is first sent directly as a datagram.  If that fails with
+ * EMSGSIZE or ENOBUFS, the iovec contents are written to a temporary file
+ * and only its descriptor is sent via sendmsg_fd().
+ *
+ * @returns 0 on success, -1 on any failure.  ENOENT failures are silent;
+ *          every other failure is reported on std::cerr.
+ */
+int JournaldClient::send()
+{
+  int ret = sendmsg(fd, &m_msghdr, MSG_NOSIGNAL);
+  if (ret >= 0)
+    return 0;
+
+  /* Fail silently if the journal is not available */
+  if (errno == ENOENT)
+    return -1;
+
+  // Anything other than "too big for a datagram" is a hard failure.
+  if (errno != EMSGSIZE && errno != ENOBUFS) {
+    std::cerr << "Failed to send log to journald: " << strerror(errno) << std::endl;
+    return -1;
+  }
+  /* Message doesn't fit... Let's dump the data in a memfd and
+   * just pass a file descriptor of it to the other side.
+   */
+  int buffer_fd = open_mem_file();
+  if (buffer_fd < 0) {
+    std::cerr << "Failed to open buffer_fd while sending log to journald: " << strerror(errno) << std::endl;
+    return -1;
+  }
+
+  // NOTE(review): only a negative return is treated as failure; a short
+  // write would go unnoticed -- confirm whether that can matter here.
+  ret = writev(buffer_fd, m_msghdr.msg_iov, m_msghdr.msg_iovlen);
+  if (ret < 0) {
+    std::cerr << "Failed to write to buffer_fd while sending log to journald: " << strerror(errno) << std::endl;
+    goto err_close_buffer_fd;
+  }
+
+  // Seals are only applied to descriptors that came from memfd_create().
+  if (mem_file_mode == MemFileMode::MEMFD_CREATE) {
+    ret = fcntl(buffer_fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_SEAL);
+    if (ret) {
+      std::cerr << "Failed to seal buffer_fd while sending log to journald: " << strerror(errno) << std::endl;
+      goto err_close_buffer_fd;
+    }
+  }
+  
+  ret = sendmsg_fd(fd, buffer_fd);
+  if (ret < 0) {
+    /* Fail silently if the journal is not available */
+    if (errno == ENOENT)
+      goto err_close_buffer_fd;
+
+    std::cerr << "Failed to send fd while sending log to journald: " << strerror(errno) << std::endl;
+    goto err_close_buffer_fd;
+  }
+  // Our copy of the descriptor is no longer needed once it has been sent.
+  close(buffer_fd);
+  return 0;
+
+err_close_buffer_fd:
+  close(buffer_fd);
+  return -1;
+}
+
+} // namespace ceph::logging::detail
+
+// Build the encoder and point the client's msghdr at the encoder's iovec
+// array, so that each encode() call updates what send() transmits.
+// @param s  subsystem map later passed to the encoder by log_entry()
+JournaldLogger::JournaldLogger(const SubsystemMap *s) :
+  m_entry_encoder(std::make_unique<detail::EntryEncoder>()),
+  m_subs(s)
+{
+  client.m_msghdr.msg_iov = m_entry_encoder->iovec();
+  client.m_msghdr.msg_iovlen = m_entry_encoder->iovec_len();
+}
+
+// Defaulted here, in the .cc file, where detail::EntryEncoder is fully
+// defined.
+JournaldLogger::~JournaldLogger() = default;
+
+// Encode e into the shared iovec array, then send it.
+// @returns the result of JournaldClient::send(): 0 on success, -1 otherwise
+int JournaldLogger::log_entry(const Entry& e)
+{
+  m_entry_encoder->encode(e, m_subs);
+  return client.send();
+}
+
+// Build the encoder and point the client's msghdr at the encoder's iovec
+// array, so that each encode() call updates what send() transmits.
+JournaldClusterLogger::JournaldClusterLogger() :
+  m_log_entry_encoder(std::make_unique<detail::LogEntryEncoder>())
+{
+  client.m_msghdr.msg_iov = m_log_entry_encoder->iovec();
+  client.m_msghdr.msg_iovlen = m_log_entry_encoder->iovec_len();
+}
+
+// Defaulted here, in the .cc file, where detail::LogEntryEncoder is fully
+// defined.
+JournaldClusterLogger::~JournaldClusterLogger() = default;
+
+// Encode le into the shared iovec array, then send it.
+// @returns the result of JournaldClient::send(): 0 on success, -1 otherwise
+int JournaldClusterLogger::log_log_entry(const LogEntry &le)
+{
+  m_log_entry_encoder->encode(le);
+  return client.send();
+}
+
+}
diff --git a/src/common/Journald.h b/src/common/Journald.h
new file mode 100644
index 000000000..377b1ff9e
--- /dev/null
+++ b/src/common/Journald.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_JOURNALD_H
+#define CEPH_COMMON_JOURNALD_H
+
+#include "acconfig.h"
+#include <memory>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+struct LogEntry;
+
+namespace ceph::logging {
+
+class Entry;
+class SubsystemMap;
+
+#ifdef WITH_SYSTEMD
+
+namespace detail {
+
+class EntryEncoder;
+class LogEntryEncoder;
+
+// Owns the socket used to reach journald and sends whatever m_msghdr
+// currently describes.
+class JournaldClient {
+ public:
+  JournaldClient();
+  ~JournaldClient();
+  // Send the message described by m_msghdr; 0 on success, -1 on failure.
+  int send();
+  // Public so the owning logger can set msg_iov / msg_iovlen.
+  struct msghdr m_msghdr;
+ private:
+  // Socket descriptor created in the constructor.
+  int fd;
+
+  // Mechanism used to create the temporary file for oversized messages.
+  enum class MemFileMode;
+  MemFileMode mem_file_mode;
+
+  void detect_mem_file_mode();
+  int open_mem_file();
+};
+}
+
+/**
+ * Logger to send local logs to journald
+ * 
+ * Local logs are those emitted via @code dout(0) << ... @endcode and similar calls.
+ * 
+ * @see JournaldClusterLogger
+ */
+class JournaldLogger {
+ public:
+  // @param s  subsystem map used to resolve subsystem ids when encoding
+  JournaldLogger(const SubsystemMap *s);
+  ~JournaldLogger();
+
+  /**
+   * @returns 0 if log entry is successfully sent, -1 otherwise.
+   */
+  int log_entry(const Entry &e);
+
+ private:
+  detail::JournaldClient client;
+
+  // Held by pointer: EntryEncoder is only forward-declared in this header.
+  std::unique_ptr<detail::EntryEncoder> m_entry_encoder;
+
+  const SubsystemMap * m_subs;
+};
+
+/**
+ * Logger to send cluster logs received by the MON to journald
+ * 
+ * @see JournaldLogger
+ */
+class JournaldClusterLogger {
+ public:
+  JournaldClusterLogger();
+  ~JournaldClusterLogger();
+
+  /**
+   * @returns 0 if log entry is successfully sent, -1 otherwise.
+   */
+  int log_log_entry(const LogEntry &le);
+
+ private:
+  detail::JournaldClient client;
+
+  // Held by pointer: LogEntryEncoder is only forward-declared in this header.
+  std::unique_ptr<detail::LogEntryEncoder> m_log_entry_encoder;
+};
+
+#else  // WITH_SYSTEMD
+
+// Stand-in used when WITH_SYSTEMD is not defined: same interface, sends
+// nothing.
+class JournaldLogger {
+public:
+  JournaldLogger(const SubsystemMap *) {}
+  // Always returns 0 without doing anything.
+  int log_entry(const Entry &) {
+    return 0;
+  }
+};
+
+// Stand-in used when WITH_SYSTEMD is not defined: same interface, sends
+// nothing.
+class JournaldClusterLogger {
+public:
+  // Always returns 0 without doing anything.
+  int log_log_entry(const LogEntry &le) {
+    return 0;
+  }
+};
+
+#endif // WITH_SYSTEMD
+
+} // ceph::logging
+
+#endif
diff --git a/src/common/LRUSet.h b/src/common/LRUSet.h
new file mode 100644
index 000000000..b62956ba4
--- /dev/null
+++ b/src/common/LRUSet.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <functional>
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/unordered_set.hpp>
+#include "include/encoding.h"
+
+/// Combination of an LRU with fast hash-based membership lookup
+///
+/// Each value lives in one heap-allocated Node that is linked into both an
+/// intrusive list (insertion/recency order, oldest at the front) and an
+/// intrusive unordered_set with a fixed array of NUM_BUCKETS buckets.
+template<class T, int NUM_BUCKETS=128>
+class LRUSet {
+  /// internal node
+  struct Node
+    : boost::intrusive::unordered_set_base_hook<> {
+    // actual payload
+    T value;
+
+    // for the lru
+    boost::intrusive::list_member_hook<> lru_item;
+
+    Node(const T& v) : value(v) {}
+
+    // Hashing and comparison are all delegated to the payload.
+    friend std::size_t hash_value(const Node &node) {
+      return std::hash<T>{}(node.value);
+    }
+    friend bool operator<(const Node &a, const Node &b) {
+      return a.value < b.value;
+    }
+    friend bool operator>(const Node &a, const Node &b) {
+      return a.value > b.value;
+    }
+    friend bool operator==(const Node &a, const Node &b) {
+      return a.value == b.value;
+    }
+  };
+
+  // Frees a node once it has been unlinked from a container.
+  struct NodeDeleteDisposer {
+    void operator()(Node *n) { delete n; }
+  };
+
+  // lru
+  boost::intrusive::list<
+    Node,
+    boost::intrusive::member_hook<Node,
+				  boost::intrusive::list_member_hook<>,
+				  &Node::lru_item>
+    > lru;
+
+  // hash-based set
+  typename boost::intrusive::unordered_set<Node>::bucket_type base_buckets[NUM_BUCKETS];
+  boost::intrusive::unordered_set<Node> set;
+
+ public:
+  LRUSet()
+    : set(typename boost::intrusive::unordered_set<Node>::bucket_traits(base_buckets,
+									NUM_BUCKETS))
+    {}
+  ~LRUSet() {
+    clear();
+  }
+
+  /// Deep copy: every value of other is re-inserted in other's list order.
+  LRUSet(const LRUSet& other)
+    : set(typename boost::intrusive::unordered_set<Node>::bucket_traits(base_buckets,
+									NUM_BUCKETS)) {
+    for (auto & i : other.lru) {
+      insert(i.value);
+    }
+  }
+
+  /// Replace the contents with a deep copy of other.
+  const LRUSet& operator=(const LRUSet& other) {
+    // Self-assignment must be a no-op: clearing first would also empty
+    // the source we are about to copy from.
+    if (this == &other) {
+      return *this;
+    }
+    clear();
+    for (auto& i : other.lru) {
+      insert(i.value);
+    }
+    return *this;
+  }
+
+  /// Number of stored values.
+  size_t size() const {
+    return set.size();
+  }
+
+  bool empty() const {
+    return set.empty();
+  }
+
+  /// True if item is currently stored.
+  bool contains(const T& item) const {
+    return set.count(item) > 0;
+  }
+
+  /// Remove and free every value.
+  void clear() {
+    prune(0);
+  }
+
+  /// Store item as the newest entry.  An existing equal value is removed
+  /// first, so re-inserting moves it to the back of the list.
+  void insert(const T& item) {
+    erase(item);
+    Node *n = new Node(item);
+    lru.push_back(*n);
+    set.insert(*n);
+  }
+
+  /// Remove item if present.
+  /// @returns true if a value was removed, false if it was not stored
+  bool erase(const T& item) {
+    auto p = set.find(item);
+    if (p == set.end()) {
+      return false;
+    }
+    // Unlink from the list first; the set's disposer then frees the node.
+    lru.erase(lru.iterator_to(*p));
+    set.erase_and_dispose(p, NodeDeleteDisposer());
+    return true;
+  }
+
+  /// Drop entries from the front of the list (oldest first) until at most
+  /// max values remain.
+  void prune(size_t max) {
+    while (set.size() > max) {
+      auto p = lru.begin();
+      set.erase(*p);
+      lru.erase_and_dispose(p, NodeDeleteDisposer());
+    }
+  }
+
+  /// Encode the element count followed by each value.  Values are written
+  /// in the hash set's iteration order, not in list order.
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    ENCODE_START(1, 1, bl);
+    uint32_t n = set.size();
+    encode(n, bl);
+    auto p = set.begin();
+    while (n--) {
+      encode(p->value, bl);
+      ++p;
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  /// Decode a count followed by that many values and insert() each one.
+  /// Existing contents are not cleared first.
+  void decode(bufferlist::const_iterator& p) {
+    using ceph::decode;
+    DECODE_START(1, p);
+    uint32_t n;
+    decode(n, p);
+    while (n--) {
+      T v;
+      decode(v, p);
+      insert(v);
+    }
+    DECODE_FINISH(p);
+  }
+};
diff --git a/src/common/LogClient.cc b/src/common/LogClient.cc
new file mode 100644
index 000000000..1ba363da7
--- /dev/null
+++ b/src/common/LogClient.cc
@@ -0,0 +1,331 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "common/LogClient.h"
+#include "include/str_map.h"
+#include "messages/MLog.h"
+#include "messages/MLogAck.h"
+#include "msg/Messenger.h"
+#include "mon/MonMap.h"
+#include "common/Graylog.h"
+
+#define dout_subsys ceph_subsys_monc
+
+using std::map;
+using std::ostream;
+using std::ostringstream;
+using std::string;
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+// Log-line prefix for LogClient messages; the logc argument is not used.
+static ostream& _prefix(std::ostream *_dout, LogClient *logc) {
+  return *_dout << "log_client ";
+}
+
+// Log-line prefix for LogChannel messages; includes the channel name.
+static ostream& _prefix(std::ostream *_dout, LogChannel *lc) {
+  return *_dout << "log_channel(" << lc->get_log_channel() << ") ";
+}
+
+// Create a channel with syslog and monitor logging both switched off.
+// This overload does not name log_prio or syslog_facility in its
+// initializer list.
+LogChannel::LogChannel(CephContext *cct, LogClient *lc, const string &channel)
+  : cct(cct), parent(lc),
+    log_channel(channel), log_to_syslog(false), log_to_monitors(false)
+{
+}
+
+// Create a channel with an explicit syslog facility and priority; syslog
+// and monitor logging still start switched off.
+LogChannel::LogChannel(CephContext *cct, LogClient *lc,
+                       const string &channel, const string &facility,
+                       const string &prio)
+  : cct(cct), parent(lc),
+    log_channel(channel), log_prio(prio), syslog_facility(facility),
+    log_to_syslog(false), log_to_monitors(false)
+{
+}
+
+// Store the collaborators and start both sequence counters at 0.
+// is_mon is derived from the FLAG_MON bit of flags.
+LogClient::LogClient(CephContext *cct, Messenger *m, MonMap *mm,
+		     enum logclient_flag_t flags)
+  : cct(cct), messenger(m), monmap(mm), is_mon(flags & FLAG_MON),
+    last_log_sent(0), last_log(0)
+{
+}
+
+// Switch monitor logging on or off.  Nothing happens when the setting is
+// unchanged; on an actual change the parent LogClient is reset() before
+// the new value is stored.
+void LogChannel::set_log_to_monitors(bool v)
+{
+  if (log_to_monitors == v) {
+    return;
+  }
+  parent->reset();
+  log_to_monitors = v;
+}
+
+// Apply an already-parsed set of clog target options to this channel:
+// monitor/syslog switches, syslog facility and priority, and creation,
+// configuration or teardown of the graylog sink.
+void LogChannel::update_config(const clog_targets_conf_t& conf_strings)
+{
+  ldout(cct, 20) << __func__ << " log_to_monitors " << conf_strings.log_to_monitors
+		 << " log_to_syslog " << conf_strings.log_to_syslog
+		 << " log_channels " << conf_strings.log_channels
+		 << " log_prios " << conf_strings.log_prios
+		 << dendl;
+
+  // The boolean options arrive as strings; only the exact text "true"
+  // enables them.
+  bool to_monitors = (conf_strings.log_to_monitors == "true");
+  bool to_syslog = (conf_strings.log_to_syslog == "true");
+  bool to_graylog = (conf_strings.log_to_graylog == "true");
+  // A result of 0 is treated as "no port" further down.
+  auto graylog_port = atoi(conf_strings.log_to_graylog_port.c_str());
+
+  set_log_to_monitors(to_monitors);
+  set_log_to_syslog(to_syslog);
+  // log_channels carries the syslog facility, log_prios the syslog level.
+  set_syslog_facility(conf_strings.log_channels);
+  set_log_prio(conf_strings.log_prios);
+
+  if (to_graylog && !graylog) { /* should but isn't */
+    graylog = std::make_shared<ceph::logging::Graylog>("clog");
+  } else if (!to_graylog && graylog) { /* shouldn't but is */
+    graylog.reset();
+  }
+
+  if (to_graylog && graylog) {
+    graylog->set_fsid(conf_strings.fsid);
+    graylog->set_hostname(conf_strings.host);
+  }
+
+  // The destination is only (re)set when both a host and a non-zero port
+  // are available.
+  if (graylog && !conf_strings.log_to_graylog_host.empty() && (graylog_port != 0)) {
+    graylog->set_destination(conf_strings.log_to_graylog_host, graylog_port);
+  }
+
+  ldout(cct, 10) << __func__
+		 << " to_monitors: " << (to_monitors ? "true" : "false")
+		 << " to_syslog: " << (to_syslog ? "true" : "false")
+		 << " syslog_facility: " << conf_strings.log_channels
+		 << " prio: " << conf_strings.log_prios
+		 << " to_graylog: " << (to_graylog ? "true" : "false")
+		 << " graylog_host: " << conf_strings.log_to_graylog_host
+		 << " graylog_port: " << graylog_port
+		 << ")" << dendl;
+}
+
+// Read this channel's clog options from conf_cct, apply them through
+// update_config(), and return the parsed values.
+clog_targets_conf_t LogChannel::parse_client_options(CephContext* conf_cct)
+{
+  auto parsed_options = parse_log_client_options(conf_cct);
+  update_config(parsed_options);
+  return parsed_options;
+}
+
+// Collect the clog_* option values that apply to this channel.
+//
+// Each clog_to_* option is read as a string and passed to
+// get_value_via_strmap() with this channel's name as the key and
+// CLOG_CONFIG_DEFAULT_KEY as the default key.  fsid and host are copied
+// from the configuration unchanged.  Nothing is applied to the channel.
+clog_targets_conf_t LogChannel::parse_log_client_options(CephContext* cct)
+{
+  // Lookup shared by all per-channel options.
+  auto channel_value = [&](const char *option) {
+    return get_value_via_strmap(cct->_conf.get_val<string>(option),
+                                log_channel, CLOG_CONFIG_DEFAULT_KEY);
+  };
+
+  clog_targets_conf_t targets;
+  targets.log_to_monitors = channel_value("clog_to_monitors");
+  targets.log_to_syslog = channel_value("clog_to_syslog");
+  targets.log_channels = channel_value("clog_to_syslog_facility");
+  targets.log_prios = channel_value("clog_to_syslog_level");
+  targets.log_to_graylog = channel_value("clog_to_graylog");
+  targets.log_to_graylog_host = channel_value("clog_to_graylog_host");
+  targets.log_to_graylog_port = channel_value("clog_to_graylog_port");
+
+  targets.fsid = cct->_conf.get_val<uuid_d>("fsid");
+  targets.host = cct->_conf->host;
+  return targets;
+}
+
+// Log every non-empty line of ss as its own entry at priority prio.
+//
+// The loop is driven by the result of getline() rather than by eof(): a
+// stream that is already in a failed state without eofbit set would
+// otherwise never reach eof and spin forever.
+void LogChannel::do_log(clog_type prio, std::stringstream& ss)
+{
+  string line;
+  while (getline(ss, line)) {
+    if (!line.empty())
+      do_log(prio, line);
+  }
+}
+
+// Emit one cluster-log line: write it to the local debug log, build a
+// LogEntry for it, and hand that entry to each enabled sink (monitor
+// queue, syslog, graylog).  The whole call runs under channel_lock.
+void LogChannel::do_log(clog_type prio, const std::string& s)
+{
+  std::lock_guard l(channel_lock);
+  // Errors go to the local log at level -1, everything else at level 0.
+  if (CLOG_ERROR == prio) {
+    ldout(cct,-1) << "log " << prio << " : " << s << dendl;
+  } else {
+    ldout(cct,0) << "log " << prio << " : " << s << dendl;
+  }
+  LogEntry e;
+  e.stamp = ceph_clock_now();
+  // seq and who should be set for syslog/graylog/log_to_mon
+  e.addrs = parent->get_myaddrs();
+  e.name = parent->get_myname();
+  e.rank = parent->get_myrank();
+  e.prio = prio;
+  e.msg = s;
+  e.channel = get_log_channel();
+
+  // log to monitor?
+  // Either way the entry gets a sequence number from the parent; it is
+  // only queued when monitor logging is enabled.
+  if (log_to_monitors) {
+    e.seq = parent->queue(e);
+  } else {
+    e.seq = parent->get_next_seq();
+  }
+
+  // log to syslog?
+  if (do_log_to_syslog()) {
+    ldout(cct,0) << __func__ << " log to syslog"  << dendl;
+    e.log_to_syslog(get_log_prio(), get_syslog_facility());
+  }
+
+  // log to graylog?
+  if (do_log_to_graylog()) {
+    ldout(cct,0) << __func__ << " log to graylog"  << dendl;
+    graylog->log_log_entry(&e);
+  }
+}
+
+// Build the next log message for the monitor, under log_lock.
+//
+// With flush set, last_log_sent is first set to the sequence number of the
+// oldest queued entry (null is returned if the queue is empty).  Without
+// flush the call goes straight to _get_mon_log_message().
+ceph::ref_t<Message> LogClient::get_mon_log_message(bool flush)
+{
+  std::lock_guard l(log_lock);
+  if (!flush) {
+    return _get_mon_log_message();
+  }
+
+  if (log_queue.empty()) {
+    return nullptr;
+  }
+  // reset session
+  last_log_sent = log_queue.front().seq;
+  return _get_mon_log_message();
+}
+
+// True while last_log is ahead of last_log_sent.  Takes log_lock.
+bool LogClient::are_pending()
+{
+  std::lock_guard l(log_lock);
+  return last_log_sent < last_log;
+}
+
+// Build an MLog carrying the queued entries that have not been sent yet,
+// and advance last_log_sent past them.  Caller must hold log_lock.
+// Returns an empty ref when the queue is empty or nothing is unsent.
+ceph::ref_t<Message> LogClient::_get_mon_log_message()
+{
+  ceph_assert(ceph_mutex_is_locked(log_lock));
+  if (log_queue.empty())
+    return {};
+
+  // only send entries that haven't been sent yet during this mon
+  // session!  monclient needs to call reset_session() on mon session
+  // reset for this to work right.
+
+  if (last_log_sent == last_log)
+    return {};
+
+  // limit entries per message
+  // (a configured limit of 0 or less means "no limit")
+  unsigned num_unsent = last_log - last_log_sent;
+  unsigned num_send;
+  if (cct->_conf->mon_client_max_log_entries_per_message > 0)
+    num_send = std::min(num_unsent, (unsigned)cct->_conf->mon_client_max_log_entries_per_message);
+  else
+    num_send = num_unsent;
+
+  ldout(cct,10) << " log_queue is " << log_queue.size() << " last_log " << last_log << " sent " << last_log_sent
+		<< " num " << log_queue.size()
+		<< " unsent " << num_unsent
+		<< " sending " << num_send << dendl;
+  ceph_assert(num_unsent <= log_queue.size());
+  std::deque<LogEntry>::iterator p = log_queue.begin();
+  std::deque<LogEntry> o;
+  // Skip over entries that were already sent.
+  while (p->seq <= last_log_sent) {
+    ++p;
+    ceph_assert(p != log_queue.end());
+  }
+  // Copy the next num_send entries; they stay in log_queue (entries are
+  // removed by handle_log_ack() and reset()).
+  while (num_send--) {
+    ceph_assert(p != log_queue.end());
+    o.push_back(*p);
+    last_log_sent = p->seq;
+    ldout(cct,10) << " will send " << *p << dendl;
+    ++p;
+  }
+  
+  return ceph::make_message<MLog>(monmap->get_fsid(),
+				  std::move(o));
+}
+
+// Monitor-only path: deliver the pending log message to this process over
+// the messenger's loopback connection.  Caller must hold log_lock.
+// NOTE(review): the message from _get_mon_log_message() is forwarded
+// without a null check -- confirm it cannot be empty on this path.
+void LogClient::_send_to_mon()
+{
+  ceph_assert(ceph_mutex_is_locked(log_lock));
+  ceph_assert(is_mon);
+  ceph_assert(messenger->get_myname().is_mon());
+  ldout(cct,10) << __func__ << " log to self" << dendl;
+  auto log = _get_mon_log_message();
+  messenger->get_loopback_connection()->send_message2(std::move(log));
+}
+
+// Assign the next sequence number to entry (written back into the
+// caller's object), append a copy to log_queue, and, on a monitor, send
+// it to ourselves right away.  Takes log_lock.
+// @returns the sequence number assigned to entry
+version_t LogClient::queue(LogEntry &entry)
+{
+  std::lock_guard l(log_lock);
+  entry.seq = ++last_log;
+  log_queue.push_back(entry);
+
+  if (is_mon) {
+    _send_to_mon();
+  }
+
+  return entry.seq;
+}
+
+// Discard every queued entry and set last_log_sent to last_log, so nothing
+// counts as unsent afterwards.  Takes log_lock.
+void LogClient::reset()
+{
+  std::lock_guard l(log_lock);
+  if (!log_queue.empty()) {
+    log_queue.clear();
+  }
+  last_log_sent = last_log;
+}
+
+// Increment last_log and return the new value without queueing anything.
+// Takes log_lock.
+uint64_t LogClient::get_next_seq()
+{
+  std::lock_guard l(log_lock);
+  return ++last_log;
+}
+
+// Addresses of this entity, as reported by the messenger.
+entity_addrvec_t LogClient::get_myaddrs()
+{
+  return messenger->get_myaddrs();
+}
+
+// Despite the name, this returns the messenger's entity_name_t
+// (messenger->get_myname()); do_log() stores it in LogEntry::rank.
+entity_name_t LogClient::get_myrank()
+{
+  return messenger->get_myname();
+}
+
+// The EntityName from the configuration (cct->_conf->name), returned by
+// reference.
+const EntityName& LogClient::get_myname()
+{
+  return cct->_conf->name;
+}
+
+// Handle an acknowledgement: drop entries from the front of log_queue
+// while their sequence number is at or below m->last.  Takes log_lock.
+// @returns true, always
+bool LogClient::handle_log_ack(MLogAck *m)
+{
+  std::lock_guard l(log_lock);
+  ldout(cct,10) << "handle_log_ack " << *m << dendl;
+
+  const version_t acked = m->last;
+
+  while (!log_queue.empty()) {
+    const LogEntry &oldest = log_queue.front();
+    if (oldest.seq > acked)
+      break;
+    ldout(cct,10) << " logged " << oldest << dendl;
+    log_queue.pop_front();
+  }
+  return true;
+}
diff --git a/src/common/LogClient.h b/src/common/LogClient.h
new file mode 100644
index 000000000..507fd9a9e
--- /dev/null
+++ b/src/common/LogClient.h
@@ -0,0 +1,255 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LOGCLIENT_H
+#define CEPH_LOGCLIENT_H
+
+#include <atomic>
+#include "common/LogEntry.h"
+#include "common/ceph_mutex.h"
+#include "common/ostream_temp.h"
+#include "common/ref.h"
+#include "include/health.h"
+
+class LogClient;
+class MLog;
+class MLogAck;
+class Messenger;
+class MonMap;
+class Message;
+struct uuid_d;
+struct Connection;
+
+class LogChannel;
+
+namespace ceph {
+namespace logging {
+  class Graylog;
+}
+}
+
+// Bundle of configuration values describing where a log channel sends its
+// output.  This is the type returned by LogChannel::parse_client_options()
+// and consumed by LogChannel::update_config().  All settings are kept as
+// strings except for the fsid.
+struct clog_targets_conf_t {
+  std::string log_to_monitors;
+  std::string log_to_syslog;
+  std::string log_channels;
+  std::string log_prios;
+  std::string log_to_graylog;
+  std::string log_to_graylog_host;
+  std::string log_to_graylog_port;
+  uuid_d fsid; // only 16B. Simpler as a copy.
+  std::string host;
+};
+
+/** Manage where we output to and at which priority
+ *
+ * Not to be confused with the LogClient, which is the almighty coordinator
+ * of channels.  We just deal with the boring part of the logging: send to
+ * syslog, send to file, generate LogEntry and queue it for the LogClient.
+ *
+ * Past queueing the LogEntry, the LogChannel is done with the whole thing.
+ * LogClient will deal with sending and handling of LogEntries.
+ */
+class LogChannel : public LoggerSinkSet
+{
+public:
+
+  // Both constructors take the owning context, the parent LogClient and the
+  // channel name; the second additionally takes a syslog facility and
+  // priority.  Their definitions are not in this header.
+  LogChannel(CephContext *cct, LogClient *lc, const std::string &channel);
+  LogChannel(CephContext *cct, LogClient *lc,
+             const std::string &channel,
+             const std::string &facility,
+             const std::string &prio);
+
+  // For each severity there are two entry points: the no-argument form
+  // returns an OstreamTemp constructed with that severity and this channel,
+  // and the stringstream form forwards directly to do_log().
+  OstreamTemp debug() final {
+    return OstreamTemp(CLOG_DEBUG, this);
+  }
+  void debug(std::stringstream &s) final {
+    do_log(CLOG_DEBUG, s);
+  }
+  /**
+   * Convenience function mapping health status to
+   * the appropriate cluster log severity.
+   *
+   * HEALTH_OK -> info(), HEALTH_WARN -> warn(), HEALTH_ERR -> error();
+   * any other value aborts.
+   */
+  OstreamTemp health(health_status_t health) {
+    switch(health) {
+      case HEALTH_OK:
+        return info();
+      case HEALTH_WARN:
+        return warn();
+      case HEALTH_ERR:
+        return error();
+      default:
+        // Invalid health_status_t value
+        ceph_abort();
+    }
+  }
+  OstreamTemp info() final {
+    return OstreamTemp(CLOG_INFO, this);
+  }
+  void info(std::stringstream &s) final {
+    do_log(CLOG_INFO, s);
+  }
+  OstreamTemp warn() final {
+    return OstreamTemp(CLOG_WARN, this);
+  }
+  void warn(std::stringstream &s) final {
+    do_log(CLOG_WARN, s);
+  }
+  OstreamTemp error() final {
+    return OstreamTemp(CLOG_ERROR, this);
+  }
+  void error(std::stringstream &s) final {
+    do_log(CLOG_ERROR, s);
+  }
+  OstreamTemp sec() final {
+    return OstreamTemp(CLOG_SEC, this);
+  }
+  void sec(std::stringstream &s) final {
+    do_log(CLOG_SEC, s);
+  }
+
+  // Plain setters/getters for the per-channel settings.  The inline ones
+  // below do not take channel_lock.
+  void set_log_to_monitors(bool v);
+  void set_log_to_syslog(bool v) {
+    log_to_syslog = v;
+  }
+  void set_log_channel(const std::string& v) {
+    log_channel = v;
+  }
+  void set_log_prio(const std::string& v) {
+    log_prio = v;
+  }
+  void set_syslog_facility(const std::string& v) {
+    syslog_facility = v;
+  }
+  std::string get_log_prio() { return log_prio; }
+  std::string get_log_channel() { return log_channel; }
+  std::string get_syslog_facility() { return syslog_facility; }
+  // Raw value of the log_to_syslog flag; see do_log_to_syslog() for the
+  // effective decision.
+  bool must_log_to_syslog() { return log_to_syslog; }
+  /**
+   * Do we want to log to syslog?
+   *
+   * @return true if log_to_syslog is true and both channel and prio
+   *         are not empty; false otherwise.
+   */
+  bool do_log_to_syslog() {
+    return must_log_to_syslog() &&
+          !log_prio.empty() && !log_channel.empty();
+  }
+  // Raw value of the log_to_monitors flag.
+  bool must_log_to_monitors() { return log_to_monitors; }
+
+  // True when a Graylog sink object is currently attached.
+  bool do_log_to_graylog() {
+    return (graylog != nullptr);
+  }
+
+  typedef std::shared_ptr<LogChannel> Ref;
+
+  /**
+   * Query the configuration database in conf_cct for configuration
+   * parameters. Pick out the relevant values based on our channel name.
+   * Update the logger configuration based on these values.
+   *
+   * Return a collection of configuration strings.
+   */
+  clog_targets_conf_t parse_client_options(CephContext* conf_cct);
+
+  // Emit one message at the given priority; both overloads are defined
+  // out of line.
+  void do_log(clog_type prio, std::stringstream& ss) final;
+  void do_log(clog_type prio, const std::string& s) final;
+
+private:
+  CephContext *cct;
+  LogClient *parent;
+  ceph::mutex channel_lock = ceph::make_mutex("LogChannel::channel_lock");
+  std::string log_channel;
+  std::string log_prio;
+  std::string syslog_facility;
+  // NOTE(review): the two flags have no in-class initializer; presumably
+  // the constructors set them - confirm in LogClient.cc.
+  bool log_to_syslog;
+  bool log_to_monitors;
+  std::shared_ptr<ceph::logging::Graylog> graylog;
+
+  /**
+   * update config values from parsed k/v std::map for each config option
+   */
+  void update_config(const clog_targets_conf_t& conf_strings);
+
+  clog_targets_conf_t parse_log_client_options(CephContext* conf_cct);
+};
+
+typedef LogChannel::Ref LogChannelRef;
+
+/**
+ * Owner of the named LogChannels and of the queue of LogEntries that have
+ * been generated but not yet acknowledged.
+ *
+ * The queue and the sequence counters are guarded by log_lock in the
+ * out-of-line methods; the inline channel-management helpers below do not
+ * take any lock.
+ */
+class LogClient
+{
+public:
+  enum logclient_flag_t {
+    NO_FLAGS = 0,
+    FLAG_MON = 0x1,
+  };
+
+  LogClient(CephContext *cct, Messenger *m, MonMap *mm,
+	    logclient_flag_t flags);
+
+  virtual ~LogClient() {
+    channels.clear();
+  }
+
+  bool handle_log_ack(MLogAck *m);
+  ceph::ref_t<Message> get_mon_log_message(bool flush);
+  bool are_pending();
+
+  /// Shorthand for create_channel(CLOG_CHANNEL_DEFAULT).
+  LogChannelRef create_channel() {
+    return create_channel(CLOG_CHANNEL_DEFAULT);
+  }
+
+  /// Return the channel registered under `name`, creating and registering
+  /// it first when no such channel exists yet.
+  LogChannelRef create_channel(const std::string& name) {
+    if (auto found = channels.find(name); found != channels.end()) {
+      return found->second;
+    }
+    LogChannelRef fresh = std::make_shared<LogChannel>(cct, this, name);
+    channels[name] = fresh;
+    return fresh;
+  }
+
+  /// Forget the channel registered under `name`; a no-op when absent.
+  void destroy_channel(const std::string& name) {
+    channels.erase(name);
+  }
+
+  /// Drop our references to all channels.
+  void shutdown() {
+    channels.clear();
+  }
+
+  uint64_t get_next_seq();
+  entity_addrvec_t get_myaddrs();
+  const EntityName& get_myname();
+  entity_name_t get_myrank();
+  version_t queue(LogEntry &entry);
+  void reset();
+
+private:
+  ceph::ref_t<Message> _get_mon_log_message();
+  void _send_to_mon();
+
+  CephContext *cct;
+  Messenger *messenger;
+  MonMap *monmap;
+  bool is_mon;
+  ceph::mutex log_lock = ceph::make_mutex("LogClient::log_lock");
+  version_t last_log_sent;   // seq of the newest entry handed out for sending
+  version_t last_log;        // seq of the newest entry generated
+  std::deque<LogEntry> log_queue;
+
+  std::map<std::string, LogChannelRef> channels;
+
+};
+#endif
diff --git a/src/common/LogEntry.cc b/src/common/LogEntry.cc
new file mode 100644
index 000000000..d7b44a211
--- /dev/null
+++ b/src/common/LogEntry.cc
@@ -0,0 +1,360 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+//
+#include <syslog.h>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "LogEntry.h"
+#include "Formatter.h"
+#include "include/stringify.h"
+
+using std::list;
+using std::map;
+using std::make_pair;
+using std::pair;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+// ----
+// LogEntryKey
+
+// Write the three key fields ("rank", "stamp", "seq") to the formatter.
+// The cached hash is not dumped.
+void LogEntryKey::dump(Formatter *f) const
+{
+  f->dump_stream("rank") << rank;
+  f->dump_stream("stamp") << stamp;
+  f->dump_unsigned("seq", seq);
+}
+
+// Append sample instances to `o`: one default-constructed key and one
+// fully populated key.  The objects are allocated with new; ownership
+// passes to the caller's list.
+void LogEntryKey::generate_test_instances(list<LogEntryKey*>& o)
+{
+  o.push_back(new LogEntryKey);
+  o.push_back(new LogEntryKey(entity_name_t::CLIENT(1234), utime_t(1,2), 34));
+}
+
+// Map a level name to a clog_type, ignoring case.
+//
+// Accepted names: "debug", "info", "sec", "warn"/"warning" and
+// "error"/"err".  Anything else yields CLOG_UNKNOWN.
+clog_type LogEntry::str_to_level(std::string const &str)
+{
+  std::string level_str = str;
+  // std::tolower() is only defined for values representable as unsigned
+  // char (or EOF); taking the character as unsigned char avoids undefined
+  // behaviour for bytes >= 0x80 on platforms where char is signed.
+  std::transform(level_str.begin(), level_str.end(), level_str.begin(),
+      [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+
+  if (level_str == "debug") {
+    return CLOG_DEBUG;
+  } else if (level_str == "info") {
+    return CLOG_INFO;
+  } else if (level_str == "sec") {
+    return CLOG_SEC;
+  } else if (level_str == "warn" || level_str == "warning") {
+    return CLOG_WARN;
+  } else if (level_str == "error" || level_str == "err") {
+    return CLOG_ERROR;
+  } else {
+    return CLOG_UNKNOWN;
+  }
+}
+
+// ----
+
+// Translate a cluster-log severity into the matching syslog priority.
+// An unrecognized severity aborts the process.
+int clog_type_to_syslog_level(clog_type t)
+{
+  int level = 0;
+  switch (t) {
+  case CLOG_DEBUG:
+    level = LOG_DEBUG;
+    break;
+  case CLOG_INFO:
+    level = LOG_INFO;
+    break;
+  case CLOG_WARN:
+    level = LOG_WARNING;
+    break;
+  case CLOG_ERROR:
+    level = LOG_ERR;
+    break;
+  case CLOG_SEC:
+    level = LOG_CRIT;
+    break;
+  default:
+    ceph_abort();
+    break;
+  }
+  return level;
+}
+
+// Case-insensitively map a severity name (long or abbreviated form) to a
+// clog_type.  Unrecognized names give CLOG_UNKNOWN.
+clog_type string_to_clog_type(const string& s)
+{
+  // case-insensitive comparison of s against one candidate spelling
+  const auto is = [&s](const auto& candidate) {
+    return boost::iequals(s, candidate);
+  };
+
+  if (is("debug") || is("dbg")) {
+    return CLOG_DEBUG;
+  }
+  if (is("info") || is("inf")) {
+    return CLOG_INFO;
+  }
+  if (is("warning") || is("warn") || is("wrn")) {
+    return CLOG_WARN;
+  }
+  if (is("error") || is("err")) {
+    return CLOG_ERROR;
+  }
+  if (is("security") || is("sec")) {
+    return CLOG_SEC;
+  }
+  return CLOG_UNKNOWN;
+}
+
+// Case-insensitively map a level name to a syslog priority.  Several names
+// share a priority ("notice" -> LOG_INFO; "critical"/"emerg" -> LOG_CRIT).
+// Unrecognized names fall back to LOG_DEBUG.
+int string_to_syslog_level(string s)
+{
+  // case-insensitive comparison of s against one candidate spelling
+  const auto is = [&s](const auto& candidate) {
+    return boost::iequals(s, candidate);
+  };
+
+  if (is("debug")) {
+    return LOG_DEBUG;
+  }
+  if (is("info") || is("notice")) {
+    return LOG_INFO;
+  }
+  if (is("warning") || is("warn")) {
+    return LOG_WARNING;
+  }
+  if (is("error") || is("err")) {
+    return LOG_ERR;
+  }
+  if (is("crit") || is("critical") || is("emerg")) {
+    return LOG_CRIT;
+  }
+
+  // err on the side of noise!
+  return LOG_DEBUG;
+}
+
+// Case-insensitively map a facility name to the corresponding syslog
+// facility constant.  Unrecognized names fall back to LOG_USER.
+int string_to_syslog_facility(string s)
+{
+  // case-insensitive comparison of s against one candidate spelling
+  const auto is = [&s](const auto& candidate) {
+    return boost::iequals(s, candidate);
+  };
+
+  if (is("auth"))     return LOG_AUTH;
+  if (is("authpriv")) return LOG_AUTHPRIV;
+  if (is("cron"))     return LOG_CRON;
+  if (is("daemon"))   return LOG_DAEMON;
+  if (is("ftp"))      return LOG_FTP;
+  if (is("kern"))     return LOG_KERN;
+  if (is("local0"))   return LOG_LOCAL0;
+  if (is("local1"))   return LOG_LOCAL1;
+  if (is("local2"))   return LOG_LOCAL2;
+  if (is("local3"))   return LOG_LOCAL3;
+  if (is("local4"))   return LOG_LOCAL4;
+  if (is("local5"))   return LOG_LOCAL5;
+  if (is("local6"))   return LOG_LOCAL6;
+  if (is("local7"))   return LOG_LOCAL7;
+  if (is("lpr"))      return LOG_LPR;
+  if (is("mail"))     return LOG_MAIL;
+  if (is("news"))     return LOG_NEWS;
+  if (is("syslog"))   return LOG_SYSLOG;
+  if (is("user"))     return LOG_USER;
+  if (is("uucp"))     return LOG_UUCP;
+
+  // default to USER
+  return LOG_USER;
+}
+
+// Return the short textual name of a cluster-log severity ("debug",
+// "info", "warn", "err" or "crit").  An unrecognized severity aborts the
+// process.
+string clog_type_to_string(clog_type t)
+{
+  switch (t) {
+    case CLOG_DEBUG:
+      return "debug";
+    case CLOG_INFO:
+      return "info";
+    case CLOG_WARN:
+      return "warn";
+    case CLOG_ERROR:
+      return "err";
+    case CLOG_SEC:
+      return "crit";
+    default:
+      ceph_abort();
+      // Not reached.  Return an empty string rather than `0`, which would
+      // construct a std::string from a null pointer (undefined behaviour).
+      return string();
+  }
+}
+
+// Forward this entry to syslog(3) when its priority is at least as severe
+// as `level` (syslog priorities are numerically smaller when more
+// severe).  `facility` selects the syslog facility.
+void LogEntry::log_to_syslog(string level, string facility) const
+{
+  const int threshold = string_to_syslog_level(level);
+  const int severity = clog_type_to_syslog_level(prio);
+  if (severity > threshold) {
+    return;
+  }
+
+  const int fac = string_to_syslog_facility(facility);
+  syslog(severity | fac, "%s %s %llu : %s",
+	 name.to_cstr(),
+	 stringify(rank).c_str(),
+	 (long long unsigned)seq,
+	 msg.c_str());
+}
+
+// Serialize this entry into `bl`.
+//
+// The caller's feature set must include SERVER_NAUTILUS (asserted).  The
+// field order written here is the order the struct_v >= 5 branch of
+// decode() reads back; prio is narrowed to a 16-bit integer on the wire.
+void LogEntry::encode(bufferlist& bl, uint64_t features) const
+{
+  assert(HAVE_FEATURE(features, SERVER_NAUTILUS));
+  ENCODE_START(5, 5, bl);
+  __u16 t = prio;
+  encode(name, bl);
+  encode(rank, bl);
+  encode(addrs, bl, features);
+  encode(stamp, bl);
+  encode(seq, bl);
+  encode(t, bl);
+  encode(msg, bl);
+  encode(channel, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize an entry from `bl`, accepting both the current layout
+// (struct_v >= 5) and the older layouts.
+void LogEntry::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
+  if (struct_v < 5) {
+    // Older layout: the sender is a single entity_inst_t, from which rank
+    // and a one-element address vector are derived.
+    __u16 t;
+    entity_inst_t who;
+    decode(who, bl);
+    rank = who.name;
+    addrs.v.clear();
+    addrs.v.push_back(who.addr);
+    decode(stamp, bl);
+    decode(seq, bl);
+    decode(t, bl);
+    prio = (clog_type)t;
+    decode(msg, bl);
+    if (struct_v >= 3) {
+      decode(channel, bl);
+    } else {
+      // prior to having logging channels we only had a cluster log.
+      // Ensure we keep that appearance when the other party has no
+      // clue of what a 'channel' is.
+      channel = CLOG_CHANNEL_CLUSTER;
+    }
+    // the EntityName is only present from struct_v 4 onwards
+    if (struct_v >= 4) {
+      decode(name, bl);
+    }
+  } else {
+    // Current layout: same field order as encode().
+    __u16 t;
+    decode(name, bl);
+    decode(rank, bl);
+    decode(addrs, bl);
+    decode(stamp, bl);
+    decode(seq, bl);
+    decode(t, bl);
+    prio = (clog_type)t;
+    decode(msg, bl);
+    decode(channel, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Write every field of the entry to the formatter.  The priority is
+// streamed, so it appears in the bracketed form produced by
+// operator<<(clog_type).
+void LogEntry::dump(Formatter *f) const
+{
+  f->dump_stream("name") << name;
+  f->dump_stream("rank") << rank;
+  f->dump_object("addrs", addrs);
+  f->dump_stream("stamp") << stamp;
+  f->dump_unsigned("seq", seq);
+  f->dump_string("channel", channel);
+  f->dump_stream("priority") << prio;
+  f->dump_string("message", msg);
+}
+
+// Append a single default-constructed entry (allocated with new) to `o`.
+void LogEntry::generate_test_instances(list<LogEntry*>& o)
+{
+  o.push_back(new LogEntry);
+}
+
+
+// -----
+
+// Merge the per-channel tails into one list ordered by ascending summary
+// sequence number.  `tail` is cleared first.
+//
+// A sequence number of 0 is used as the "nothing selected" marker, so the
+// merge stops as soon as no entry with a non-zero sequence is selected.
+void LogSummary::build_ordered_tail_legacy(list<LogEntry> *tail) const
+{
+  using entry_iter = list<pair<uint64_t,LogEntry>>::const_iterator;
+
+  tail->clear();
+
+  // channel -> (next unread entry, end of that channel's list)
+  map<string, pair<entry_iter, entry_iter>> cursors;
+  for (auto& chan : tail_by_channel) {
+    cursors.emplace(chan.first,
+		    make_pair(chan.second.begin(), chan.second.end()));
+  }
+
+  for (;;) {
+    uint64_t lowest = 0;
+    entry_iter *pick = nullptr;
+    for (auto& c : cursors) {
+      entry_iter& cur = c.second.first;
+      if (cur == c.second.second) {
+	continue;  // this channel is exhausted
+      }
+      if (lowest == 0 || cur->first < lowest) {
+	lowest = cur->first;
+	pick = &cur;
+      }
+    }
+    if (lowest == 0) {
+      break; // done
+    }
+    tail->push_back((*pick)->second);
+    ++(*pick);
+  }
+}
+
+// Serialize the summary into `bl`.  The caller's feature set must include
+// SERVER_MIMIC (asserted).  The `keys` set is not written; decode()
+// rebuilds it from tail_by_channel.
+void LogSummary::encode(bufferlist& bl, uint64_t features) const
+{
+  assert(HAVE_FEATURE(features, SERVER_MIMIC));
+  ENCODE_START(4, 3, bl);
+  encode(version, bl);
+  encode(seq, bl);
+  encode(tail_by_channel, bl, features);
+  encode(channel_info, bl);
+  recent_keys.encode(bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize a summary from `bl`.  channel_info and recent_keys are only
+// present from struct_v 4 onwards.  After decoding, `keys` is rebuilt
+// from the entries found in tail_by_channel.
+void LogSummary::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
+  decode(version, bl);
+  decode(seq, bl);
+  decode(tail_by_channel, bl);
+  if (struct_v >= 4) {
+    decode(channel_info, bl);
+    recent_keys.decode(bl);
+  }
+  DECODE_FINISH(bl);
+  // keys is derived state: regenerate it from the decoded tails
+  keys.clear();
+  for (auto& i : tail_by_channel) {
+    for (auto& e : i.second) {
+      keys.insert(e.second.key());
+    }
+  }
+}
+
+// Dump the version and, under "tail_by_channel", one section per channel
+// containing that channel's entries keyed by their sequence number.
+void LogSummary::dump(Formatter *f) const
+{
+  f->dump_unsigned("version", version);
+
+  f->open_object_section("tail_by_channel");
+  for (const auto& [channel, entries] : tail_by_channel) {
+    f->open_object_section(channel.c_str());
+    for (const auto& [entry_seq, entry] : entries) {
+      const string key = stringify(entry_seq);
+      f->dump_object(key.c_str(), entry);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Append a single default-constructed summary (allocated with new) to `o`.
+void LogSummary::generate_test_instances(list<LogSummary*>& o)
+{
+  o.push_back(new LogSummary);
+  // more!
+}
diff --git a/src/common/LogEntry.h b/src/common/LogEntry.h
new file mode 100644
index 000000000..3ddebbd30
--- /dev/null
+++ b/src/common/LogEntry.h
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_LOGENTRY_H
+#define CEPH_LOGENTRY_H
+
+#include <fmt/format.h>
+
+#include "include/utime.h"
+#include "msg/msg_fmt.h"
+#include "msg/msg_types.h"
+#include "common/entity_name.h"
+#include "ostream_temp.h"
+#include "LRUSet.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+// Well-known cluster log channel names.  Note that the default channel and
+// the "cluster" channel are the same string.
+static const std::string CLOG_CHANNEL_NONE    = "none";
+static const std::string CLOG_CHANNEL_DEFAULT = "cluster";
+static const std::string CLOG_CHANNEL_CLUSTER = "cluster";
+static const std::string CLOG_CHANNEL_AUDIT   = "audit";
+
+// this is the key name used in the config options for the default, e.g.
+//   default=true foo=false bar=false
+static const std::string CLOG_CONFIG_DEFAULT_KEY = "default";
+
+/*
+ * Given a clog log_type, return the equivalent syslog priority
+ */
+int clog_type_to_syslog_level(clog_type t);
+
+// Case-insensitive name -> clog_type; CLOG_UNKNOWN when not recognized.
+clog_type string_to_clog_type(const std::string& s);
+// Case-insensitive name -> syslog priority; LOG_DEBUG when not recognized.
+int string_to_syslog_level(std::string s);
+// Case-insensitive name -> syslog facility; LOG_USER when not recognized.
+int string_to_syslog_facility(std::string s);
+
+// clog_type -> short name ("debug", "info", "warn", "err", "crit");
+// aborts on any other value.
+std::string clog_type_to_string(clog_type t);
+
+
+/**
+ * Identity of a log entry: (rank, stamp, seq), plus a cached hash value.
+ *
+ * Equality compares all three fields; the hash is derived from seq and
+ * rank only.  The cached hash is recomputed whenever the fields are set
+ * (construction and decode) so that keys which compare equal always
+ * report the same hash.
+ */
+struct LogEntryKey {
+private:
+  uint64_t _hash = 0;
+
+  // Refresh the cached hash from the current rank and seq.
+  void _calc_hash() {
+    std::hash<entity_name_t> h;
+    _hash = seq + h(rank);
+  }
+
+  entity_name_t rank;
+  utime_t stamp;
+  uint64_t seq = 0;
+
+public:
+  // Compute the hash for the default key too, so that it agrees with an
+  // equal key built through the three-argument constructor.
+  LogEntryKey() {
+    _calc_hash();
+  }
+  LogEntryKey(const entity_name_t& w, utime_t t, uint64_t s)
+    : rank(w), stamp(t), seq(s) {
+    _calc_hash();
+  }
+
+  /// Cached hash value; this is what std::hash<LogEntryKey> returns.
+  uint64_t get_hash() const {
+    return _hash;
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<LogEntryKey*>& o);
+
+  friend bool operator==(const LogEntryKey& l, const LogEntryKey& r) {
+    return l.rank == r.rank && l.stamp == r.stamp && l.seq == r.seq;
+  }
+
+  /// Encode rank, stamp and seq (the cached hash is not encoded).
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(rank, bl);
+    encode(stamp, bl);
+    encode(seq, bl);
+  }
+  /// Decode rank, stamp and seq, then refresh the cached hash.
+  void decode(bufferlist::const_iterator &p) {
+    using ceph::decode;
+    decode(rank, p);
+    decode(stamp, p);
+    decode(seq, p);
+    // The hash is not part of the encoding.  Without recomputing it a
+    // decoded key would keep a stale hash and would not be found in a
+    // hashed container when looked up with an equal, constructed key.
+    _calc_hash();
+  }
+};
+WRITE_CLASS_ENCODER(LogEntryKey)
+
+
+namespace std {
+// Lets LogEntryKey be used in hashed containers: returns the value cached
+// inside the key (LogEntryKey::get_hash()).
+template<> struct hash<LogEntryKey> {
+  size_t operator()(const LogEntryKey& r) const {
+    return r.get_hash();
+  }
+};
+} // namespace std
+
+// A single cluster log message together with the identity of its sender.
+struct LogEntry {
+  EntityName name;
+  entity_name_t rank;
+  entity_addrvec_t addrs;
+  utime_t stamp;
+  uint64_t seq;
+  clog_type prio;
+  std::string msg;
+  std::string channel;
+
+  // Defaults: seq 0, priority CLOG_DEBUG; other fields default-constructed.
+  LogEntry() : seq(0), prio(CLOG_DEBUG) {}
+
+  // Key identifying this entry, built from (rank, stamp, seq).
+  LogEntryKey key() const { return LogEntryKey(rank, stamp, seq); }
+
+  // Send to syslog if this entry's priority passes the `level` threshold.
+  void log_to_syslog(std::string level, std::string facility) const;
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<LogEntry*>& o);
+  // Case-insensitive level name -> clog_type (CLOG_UNKNOWN if unrecognized).
+  static clog_type str_to_level(std::string const &str);
+};
+WRITE_CLASS_ENCODER_FEATURES(LogEntry)
+
+/**
+ * Summary of recently seen log entries.  It carries two generations of
+ * state: per-channel tails plus a key set (pre-quincy), and an LRU set of
+ * keys plus per-channel ranges (quincy+).
+ */
+struct LogSummary {
+  version_t version;
+
+  // ---- pre-quincy ----
+  // channel -> [(seq#, entry), ...]
+  std::map<std::string,std::list<std::pair<uint64_t,LogEntry>>> tail_by_channel;
+  uint64_t seq = 0;
+  ceph::unordered_set<LogEntryKey> keys;
+
+  // ---- quincy+ ----
+  LRUSet<LogEntryKey> recent_keys;
+  std::map<std::string, std::pair<uint64_t,uint64_t>> channel_info; // channel -> [begin, end)
+
+  LogSummary() : version(0) {}
+
+  void build_ordered_tail_legacy(std::list<LogEntry> *tail) const;
+
+  /// Record `e` in the legacy structures: remember its key and append it,
+  /// tagged with the next summary sequence number, to its channel's tail.
+  void add_legacy(const LogEntry& e) {
+    keys.insert(e.key());
+    tail_by_channel[e.channel].emplace_back(++seq, e);
+  }
+
+  /// Trim every channel tail to at most `max` entries (oldest first,
+  /// forgetting their keys) and prune recent_keys with the same limit.
+  void prune(size_t max) {
+    for (auto& chan : tail_by_channel) {
+      auto& entries = chan.second;
+      while (entries.size() > max) {
+	keys.erase(entries.front().second.key());
+	entries.pop_front();
+      }
+    }
+    recent_keys.prune(max);
+  }
+
+  /// True when `k` is known to either the legacy key set or recent_keys.
+  bool contains(const LogEntryKey& k) const {
+    return keys.count(k) > 0 || recent_keys.contains(k);
+  }
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<LogSummary*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(LogSummary)
+
+// Stream a severity as a bracketed three-letter tag; values outside the
+// known set print as "[???]".
+inline std::ostream& operator<<(std::ostream& out, const clog_type t)
+{
+  const char *tag = "[???]";
+  switch (t) {
+  case CLOG_DEBUG:
+    tag = "[DBG]";
+    break;
+  case CLOG_INFO:
+    tag = "[INF]";
+    break;
+  case CLOG_SEC:
+    tag = "[SEC]";
+    break;
+  case CLOG_WARN:
+    tag = "[WRN]";
+    break;
+  case CLOG_ERROR:
+    tag = "[ERR]";
+    break;
+  default:
+    break;
+  }
+  return out << tag;
+}
+
+// Stream an entry as: "<stamp> <name> (<rank>) <seq> : <channel> <prio> <msg>".
+// The addrs field is not printed.
+inline std::ostream& operator<<(std::ostream& out, const LogEntry& e)
+{
+  return out << e.stamp << " " << e.name << " (" << e.rank << ") "
+	     << e.seq << " : "
+             << e.channel << " " << e.prio << " " << e.msg;
+}
+
+// fmt support for EntityName: formats the result of to_str() through the
+// string_view formatter this specialization inherits from.
+template <> struct fmt::formatter<EntityName> : fmt::formatter<std::string_view> {
+  template <typename FormatContext>
+  auto format(const EntityName& e, FormatContext& ctx) {
+    return formatter<std::string_view>::format(e.to_str(), ctx);
+  }
+};
+
+// fmt support for LogEntry.  The fields appear in the same order and with
+// the same separators as in operator<<(std::ostream&, const LogEntry&).
+template <> struct fmt::formatter<LogEntry> : fmt::formatter<std::string_view> {
+  template <typename FormatContext>
+  auto format(const LogEntry& e, FormatContext& ctx) {
+    return fmt::format_to(ctx.out(), "{} {} ({}) {} : {} {} {}",
+			  e.stamp, e.name, e.rank, e.seq, e.channel, e.prio, e.msg);
+  }
+};
+
+#endif
diff --git a/src/common/MemoryModel.cc b/src/common/MemoryModel.cc
new file mode 100644
index 000000000..0f6ab986f
--- /dev/null
+++ b/src/common/MemoryModel.cc
@@ -0,0 +1,96 @@
+#include "MemoryModel.h"
+#include "include/compat.h"
+#include "debug.h"
+#if defined(__linux__)
+#include <malloc.h>
+#endif
+
+#include <fstream>
+
+#define dout_subsys ceph_subsys_
+
+using namespace std;
+
+// Store the context pointer; it is used by _sample() for logging.
+MemoryModel::MemoryModel(CephContext *cct_)
+  : cct(cct_)
+{
+}
+
+// Refresh *psnap from procfs.
+//
+// The size/rss/hwm/lib/peak/data fields are taken from the matching Vm*
+// lines of /proc/self/status.  `heap` is the total size of all mappings in
+// /proc/self/maps that are readable+writable and have no pathname, shifted
+// right by 10 (bytes -> KiB).
+//
+// If a file cannot be opened a message is logged and the function returns;
+// fields not yet updated keep their previous values.
+void MemoryModel::_sample(snap *psnap)
+{
+  ifstream f;
+  string line;
+
+  f.open(PROCPREFIX "/proc/self/status");
+  if (!f.is_open()) {
+    ldout(cct, 0) << "check_memory_usage unable to open " PROCPREFIX "/proc/self/status" << dendl;
+    return;
+  }
+  // Loop on getline() itself rather than on eof(): a read error sets
+  // failbit/badbit without eofbit, and an eof()-only test would then spin
+  // forever.
+  while (getline(f, line)) {
+    // Each offset equals the length of its tag, so atol() never starts
+    // beyond the terminating NUL; atol() skips the whitespace that follows
+    // the colon by itself.
+    if (strncmp(line.c_str(), "VmSize:", 7) == 0)
+      psnap->size = atol(line.c_str() + 7);
+    else if (strncmp(line.c_str(), "VmRSS:", 6) == 0)
+      psnap->rss = atol(line.c_str() + 6);
+    else if (strncmp(line.c_str(), "VmHWM:", 6) == 0)
+      psnap->hwm = atol(line.c_str() + 6);
+    else if (strncmp(line.c_str(), "VmLib:", 6) == 0)
+      psnap->lib = atol(line.c_str() + 6);
+    else if (strncmp(line.c_str(), "VmPeak:", 7) == 0)
+      psnap->peak = atol(line.c_str() + 7);
+    else if (strncmp(line.c_str(), "VmData:", 7) == 0)
+      psnap->data = atol(line.c_str() + 7);
+  }
+  f.close();
+  f.clear();  // drop eof/fail state before reusing the stream
+
+  f.open(PROCPREFIX "/proc/self/maps");
+  if (!f.is_open()) {
+    ldout(cct, 0) << "check_memory_usage unable to open " PROCPREFIX "/proc/self/maps" << dendl;
+    return;
+  }
+
+  long heap = 0;
+  while (getline(f, line)) {
+    // Expected line shape: "start-end perms offset dev inode [pathname]"
+    const char *start = line.c_str();
+    const char *dash = start;
+    while (*dash && *dash != '-') dash++;
+    if (!*dash)
+      continue;
+    const char *end = dash + 1;
+    while (*end && *end != ' ') end++;
+    if (!*end)
+      continue;
+    // Unsigned parse: addresses in the upper half of the address space do
+    // not fit in a signed long long and would be clamped by strtoll().
+    unsigned long long as = strtoull(start, 0, 16);
+    unsigned long long ae = strtoull(dash+1, 0, 16);
+
+    end++;
+    const char *mode = end;
+
+    // Step over the four space-separated fields that follow the address
+    // range, never moving past the terminating NUL.
+    int skip = 4;
+    while (skip > 0 && *end) {
+      end++;
+      while (*end && *end != ' ') end++;
+      skip--;
+    }
+    if (skip > 0)
+      continue;  // truncated line: fewer fields than expected
+    if (*end)
+      end++;
+
+    long size = ae - as;
+
+    /*
+     * anything 'rw' and anon is assumed to be heap.
+     */
+    if (mode[0] == 'r' && mode[1] == 'w' && !*end)
+      heap += size;
+  }
+
+  psnap->heap = heap >> 10;
+
+}
diff --git a/src/common/MemoryModel.h b/src/common/MemoryModel.h
new file mode 100644
index 000000000..ee87c6f3b
--- /dev/null
+++ b/src/common/MemoryModel.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MEMORYMODEL_H
+#define CEPH_MEMORYMODEL_H
+
+#include "include/common_fwd.h"
+
+/**
+ * Samples the memory usage of the current process.  The most recent
+ * sample is kept in `last`.
+ */
+class MemoryModel {
+public:
+  /// One memory sample; every field starts at 0.
+  struct snap {
+    long peak;
+    long size;
+    long hwm;
+    long rss;
+    long data;
+    long lib;
+
+    long heap;
+
+    snap()
+      : peak(0),
+	size(0),
+	hwm(0),
+	rss(0),
+	data(0),
+	lib(0),
+	heap(0)
+    {}
+
+    long get_total() { return size; }
+    long get_rss() { return rss; }
+    long get_heap() { return heap; }
+  } last;
+
+private:
+  CephContext *cct;
+  void _sample(snap *p);
+
+public:
+  explicit MemoryModel(CephContext *cct);
+
+  /// Take a new sample into `last`; when `p` is non-null the sample is
+  /// also copied into *p.
+  void sample(snap *p = nullptr) {
+    _sample(&last);
+    if (p != nullptr) {
+      *p = last;
+    }
+  }
+};
+
+#endif
diff --git a/src/common/OpQueue.h b/src/common/OpQueue.h
new file mode 100644
index 000000000..0204f4b44
--- /dev/null
+++ b/src/common/OpQueue.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef OP_QUEUE_H
+#define OP_QUEUE_H
+
+#include "include/msgr.h"
+
+#include <list>
+#include <functional>
+
+namespace ceph {
+  class Formatter;
+}
+
+/**
+ * Abstract class for all Op Queues
+ *
+ * In order to provide optimized code, be sure to declare all
+ * virtual functions as final in the derived class.
+ */
+
+// T is the queued item type; K is the type of the class key passed as `cl`
+// to the enqueue functions and as `k` to remove_by_class().
+template <typename T, typename K>
+class OpQueue {
+public:
+  // Ops of this class should be deleted immediately. If out isn't
+  // nullptr then items should be added to the front in
+  // front-to-back order. The typical strategy is to visit items in
+  // the queue in *reverse* order and to use *push_front* to insert
+  // them into out.
+  virtual void remove_by_class(K k, std::list<T> *out) = 0;
+
+  // Enqueue op in the back of the strict queue
+  virtual void enqueue_strict(K cl, unsigned priority, T &&item) = 0;
+
+  // Enqueue op in the front of the strict queue
+  virtual void enqueue_strict_front(K cl, unsigned priority, T &&item) = 0;
+
+  // Enqueue op in the back of the regular queue
+  virtual void enqueue(K cl, unsigned priority, unsigned cost, T &&item) = 0;
+
+  // Enqueue the op in the front of the regular queue
+  virtual void enqueue_front(
+    K cl, unsigned priority, unsigned cost, T &&item) = 0;
+
+  // Returns true if the queue is empty
+  virtual bool empty() const = 0;
+
+  // Return the next op to be dispatched
+  virtual T dequeue() = 0;
+
+  // Formatted output of the queue
+  virtual void dump(ceph::Formatter *f) const = 0;
+
+  // Human readable brief description of queue and relevant parameters
+  virtual void print(std::ostream &f) const = 0;
+
+  // Don't leak resources on destruction
+  virtual ~OpQueue() {};
+};
+
+#endif
diff --git a/src/common/OutputDataSocket.cc b/src/common/OutputDataSocket.cc
new file mode 100644
index 000000000..5828daebc
--- /dev/null
+++ b/src/common/OutputDataSocket.cc
@@ -0,0 +1,407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <poll.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include "common/OutputDataSocket.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/safe_io.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_asok
+#undef dout_prefix
+#define dout_prefix *_dout << "asok(" << (void*)m_cct << ") "
+
+using std::ostringstream;
+
+/*
+ * UNIX domain sockets created by an application persist even after that
+ * application closes, unless they're explicitly unlinked. This is because the
+ * directory containing the socket keeps a reference to the socket.
+ *
+ * This code makes things a little nicer by unlinking those dead sockets when
+ * the application exits normally.
+ */
+static pthread_mutex_t cleanup_lock = PTHREAD_MUTEX_INITIALIZER;
+static std::vector <const char*> cleanup_files;
+static bool cleanup_atexit = false;
+
+static void remove_cleanup_file(const char *file)
+{
+  // Unlink `file` now and drop its first matching entry from cleanup_files.
+  pthread_mutex_lock(&cleanup_lock);
+  VOID_TEMP_FAILURE_RETRY(unlink(file));
+  auto pos = cleanup_files.begin();
+  while (pos != cleanup_files.end() && strcmp(file, *pos) != 0)
+    ++pos;
+  if (pos != cleanup_files.end()) {
+    free((void*)*pos);  // entries are the strdup() copies add_cleanup_file made
+    cleanup_files.erase(pos);
+  }
+  pthread_mutex_unlock(&cleanup_lock);
+}
+
+static void remove_all_cleanup_files()
+{
+  // Registered with atexit(): unlink and free every path still tracked.
+  pthread_mutex_lock(&cleanup_lock);
+  for (const char *path : cleanup_files) {
+    VOID_TEMP_FAILURE_RETRY(unlink(path));
+    free((void*)path);
+  }
+  cleanup_files.clear();
+  pthread_mutex_unlock(&cleanup_lock);
+}
+
+static void add_cleanup_file(const char *file)
+{
+  char *fname = strdup(file);
+  if (!fname)
+    return;
+  pthread_mutex_lock(&cleanup_lock);
+  cleanup_files.push_back(fname);
+  if (!cleanup_atexit) {
+    atexit(remove_all_cleanup_files);
+    cleanup_atexit = true;
+  }
+  pthread_mutex_unlock(&cleanup_lock);
+}
+
+
+OutputDataSocket::OutputDataSocket(CephContext *cct, uint64_t _backlog)
+  : m_cct(cct),
+    data_max_backlog(_backlog),
+    m_sock_fd(-1),
+    m_shutdown_rd_fd(-1),
+    m_shutdown_wr_fd(-1),
+    going_down(false),
+    data_size(0),
+    skipped(0)
+{
+}
+
+OutputDataSocket::~OutputDataSocket()
+{
+  shutdown();
+}
+
+/*
+ * This thread listens on the UNIX domain socket for incoming connections.
+ * It only handles one connection at a time at the moment. All I/O is nonblocking,
+ * so that we can implement sensible timeouts. [TODO: make all I/O nonblocking]
+ *
+ * This thread also listens to m_shutdown_rd_fd. If there is any data sent to this
+ * pipe, the thread terminates itself gracefully, allowing the
+ * OutputDataSocketConfigObs class to join() it.
+ */
+
+#define PFL_SUCCESS ((void*)(intptr_t)0)
+#define PFL_FAIL ((void*)(intptr_t)1)
+
+std::string OutputDataSocket::create_shutdown_pipe(int *pipe_rd, int *pipe_wr)
+{
+  int pipefd[2];
+  if (pipe_cloexec(pipefd, 0) < 0) {
+    int e = errno;
+    ostringstream oss;
+    oss << "OutputDataSocket::create_shutdown_pipe error: " << cpp_strerror(e);
+    return oss.str();
+  }
+  
+  *pipe_rd = pipefd[0];
+  *pipe_wr = pipefd[1];
+  return "";
+}
+
+std::string OutputDataSocket::bind_and_listen(const std::string &sock_path, int *fd)
+{
+  ldout(m_cct, 5) << "bind_and_listen " << sock_path << dendl;
+
+  struct sockaddr_un address;
+  if (sock_path.size() > sizeof(address.sun_path) - 1) {
+    ostringstream oss;
+    oss << "OutputDataSocket::bind_and_listen: "
+	<< "The UNIX domain socket path " << sock_path << " is too long! The "
+	<< "maximum length on this system is "
+	<< (sizeof(address.sun_path) - 1);
+    return oss.str();
+  }
+  int sock_fd = socket_cloexec(PF_UNIX, SOCK_STREAM, 0);
+  if (sock_fd < 0) {
+    int err = errno;
+    ostringstream oss;
+    oss << "OutputDataSocket::bind_and_listen: "
+	<< "failed to create socket: " << cpp_strerror(err);
+    return oss.str();
+  }
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&address, 0, sizeof(struct sockaddr_un));
+  address.sun_family = AF_UNIX;
+  snprintf(address.sun_path, sizeof(address.sun_path),
+	   "%s", sock_path.c_str());
+  if (::bind(sock_fd, (struct sockaddr*)&address,
+	   sizeof(struct sockaddr_un)) != 0) {
+    int err = errno;
+    if (err == EADDRINUSE) {
+      // The old UNIX domain socket must still be there.
+      // Let's unlink it and try again.
+      VOID_TEMP_FAILURE_RETRY(unlink(sock_path.c_str()));
+      if (::bind(sock_fd, (struct sockaddr*)&address,
+	       sizeof(struct sockaddr_un)) == 0) {
+	err = 0;
+      }
+      else {
+	err = errno;
+      }
+    }
+    if (err != 0) {
+      ostringstream oss;
+      oss << "OutputDataSocket::bind_and_listen: "
+	  << "failed to bind the UNIX domain socket to '" << sock_path
+	  << "': " << cpp_strerror(err);
+      close(sock_fd);
+      return oss.str();
+    }
+  }
+  if (listen(sock_fd, 5) != 0) {
+    int err = errno;
+    ostringstream oss;
+    oss << "OutputDataSocket::bind_and_listen: "
+	  << "failed to listen to socket: " << cpp_strerror(err);
+    close(sock_fd);
+    VOID_TEMP_FAILURE_RETRY(unlink(sock_path.c_str()));
+    return oss.str();
+  }
+  *fd = sock_fd;
+  return "";
+}
+
+void* OutputDataSocket::entry()
+{
+  // Thread body: accept clients until the shutdown pipe becomes readable.
+  ldout(m_cct, 5) << "entry start" << dendl;
+  while (true) {
+    struct pollfd fds[2];
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(fds, 0, sizeof(fds));
+    fds[0].fd = m_sock_fd;
+    fds[0].events = POLLIN | POLLRDBAND;
+    fds[1].fd = m_shutdown_rd_fd;
+    fds[1].events = POLLIN | POLLRDBAND;
+
+    int ret = poll(fds, 2, -1);
+    if (ret < 0) {
+      int err = errno;
+      if (err == EINTR) {
+	continue;
+      }
+      lderr(m_cct) << "OutputDataSocket: poll(2) error: '"
+		   << cpp_strerror(err) << "'" << dendl;
+      return PFL_FAIL;
+    }
+
+    if (fds[0].revents & POLLIN) {
+      // Send out some data
+      do_accept();
+    }
+    if (fds[1].revents & POLLIN) {
+      // Parent wants us to shut down; leave the loop so the exit is logged
+      break;
+    }
+  }
+  ldout(m_cct, 5) << "entry exit" << dendl;
+  return PFL_SUCCESS;
+}
+
+
+bool OutputDataSocket::do_accept()
+{
+  // Accept one client and serve it until handle_connection() returns.
+  struct sockaddr_un address;
+  socklen_t address_length = sizeof(address);
+  ldout(m_cct, 30) << "OutputDataSocket: calling accept" << dendl;
+  int connection_fd = accept_cloexec(m_sock_fd, (struct sockaddr*) &address,
+			     &address_length);
+  if (connection_fd < 0) {
+    int err = errno;
+    lderr(m_cct) << "OutputDataSocket: do_accept error: '"
+			   << cpp_strerror(err) << "'" << dendl;
+    return false;
+  }
+  ldout(m_cct, 30) << "OutputDataSocket: finished accept" << dendl;
+  handle_connection(connection_fd);
+  close_connection(connection_fd);
+  // Declared bool: report success as true (the old literal 0 meant false).
+  return true;
+}
+
+void OutputDataSocket::handle_connection(int fd)
+{
+  // Serve one client: send the init_connection() preamble, then keep
+  // flushing queued buffers until a write fails or going_down is set.
+  ceph::buffer::list bl;
+  m_lock.lock();
+  init_connection(bl);
+  m_lock.unlock();
+
+  if (bl.length()) {
+    /* need to special case the connection init buffer output, as it needs
+     * to be dumped before any data, including older data that was sent
+     * before the connection was established, or before we identified
+     * older connection was broken
+     */
+    int ret = safe_write(fd, bl.c_str(), bl.length());
+    if (ret < 0)
+      return;
+  }
+
+  int ret = dump_data(fd);
+  if (ret < 0)
+    return;
+
+  do {
+    {
+      std::unique_lock l(m_lock);
+      // Sleep only if nothing is queued: append_output() may have queued
+      // data (and notified) while dump_data() was writing without m_lock.
+      if (!going_down && data.empty())
+	cond.wait(l);
+      if (going_down)
+	break;
+    }
+    ret = dump_data(fd);
+  } while (ret >= 0);
+}
+
+int OutputDataSocket::dump_data(int fd)
+{
+  // Write every queued buffer, each followed by delim, to fd.  Returns 0 on
+  // success or the negative safe_write() result of the first failed write.
+  m_lock.lock();
+  auto l = std::move(data);
+  data.clear();
+  data_size = 0;
+  m_lock.unlock();
+  for (auto iter = l.begin(); iter != l.end(); ++iter) {
+    ceph::buffer::list& bl = *iter;
+    int ret = safe_write(fd, bl.c_str(), bl.length());
+    if (ret >= 0) {
+      ret = safe_write(fd, delim.c_str(), delim.length());
+    }
+    if (ret < 0) {
+      // Re-queue the unsent tail *ahead of* whatever append_output() added in
+      // the meantime, so records keep the order in which they were appended.
+      std::scoped_lock lock(m_lock);
+      for (auto rest = iter; rest != l.end(); ++rest)
+	data_size += rest->length();
+      data.insert(data.begin(), iter, l.end());
+      return ret;
+    }
+  }
+  return 0;
+}
+
+void OutputDataSocket::close_connection(int fd)
+{
+  VOID_TEMP_FAILURE_RETRY(close(fd));
+}
+
+bool OutputDataSocket::init(const std::string &path)
+{
+  ldout(m_cct, 5) << "init " << path << dendl;
+
+  /* Set up things for the new thread */
+  std::string err;
+  int pipe_rd = -1, pipe_wr = -1;
+  err = create_shutdown_pipe(&pipe_rd, &pipe_wr);
+  if (!err.empty()) {
+    lderr(m_cct) << "OutputDataSocketConfigObs::init: error: " << err << dendl;
+    return false;
+  }
+  int sock_fd;
+  err = bind_and_listen(path, &sock_fd);
+  if (!err.empty()) {
+    lderr(m_cct) << "OutputDataSocketConfigObs::init: failed: " << err << dendl;
+    close(pipe_rd);
+    close(pipe_wr);
+    return false;
+  }
+
+  /* Create new thread */
+  m_sock_fd = sock_fd;
+  m_shutdown_rd_fd = pipe_rd;
+  m_shutdown_wr_fd = pipe_wr;
+  m_path = path;
+  create("out_data_socket");
+  add_cleanup_file(m_path.c_str());
+  return true;
+}
+
+void OutputDataSocket::shutdown()
+{
+  // Stop the listener thread (waking handle_connection() first) and clean up.
+  m_lock.lock();
+  going_down = true;
+  cond.notify_all();
+  m_lock.unlock();
+  if (m_shutdown_wr_fd < 0)
+    return;  // init() never succeeded, or shutdown() already ran
+  ldout(m_cct, 5) << "shutdown" << dendl;
+  // Send a byte to the shutdown pipe that the thread is listening to
+  char buf[1] = { 0x0 };
+  int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf));
+  VOID_TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd));
+  m_shutdown_wr_fd = -1;
+  if (ret == 0) {
+    join();
+    // The thread is gone, so the descriptors it polled can be closed now.
+    VOID_TEMP_FAILURE_RETRY(close(m_shutdown_rd_fd));
+    VOID_TEMP_FAILURE_RETRY(close(m_sock_fd));
+    m_shutdown_rd_fd = m_sock_fd = -1;
+  } else {
+    lderr(m_cct) << "OutputDataSocket::shutdown: failed to write "
+      "to thread shutdown pipe: error " << ret << dendl;
+  }
+  remove_cleanup_file(m_path.c_str());
+  m_path.clear();
+}
+
+void OutputDataSocket::append_output(ceph::buffer::list& bl)
+{
+  std::lock_guard l(m_lock);
+
+  if (data_size + bl.length() > data_max_backlog) {
+    if (skipped % 100 == 0) {
+      ldout(m_cct, 0) << "dropping data output, max backlog reached (skipped=="
+		      << skipped << ")"
+		      << dendl;
+      skipped = 1;
+    } else
+      ++skipped;
+
+    cond.notify_all();
+    return;
+  }
+
+  data.push_back(bl);
+  data_size += bl.length();
+  cond.notify_all();
+}
diff --git a/src/common/OutputDataSocket.h b/src/common/OutputDataSocket.h
new file mode 100644
index 000000000..397d93f16
--- /dev/null
+++ b/src/common/OutputDataSocket.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_COMMON_OUTPUTDATASOCKET_H
+#define CEPH_COMMON_OUTPUTDATASOCKET_H
+
+#include "common/ceph_mutex.h"
+#include "common/Thread.h"
+#include "include/common_fwd.h"
+#include "include/buffer.h"
+
+
+class OutputDataSocket : public Thread
+{
+public:
+  OutputDataSocket(CephContext *cct, uint64_t _backlog);
+  ~OutputDataSocket() override;
+
+  bool init(const std::string &path);
+  
+  void append_output(ceph::buffer::list& bl);
+
+protected:
+  virtual void init_connection(ceph::buffer::list& bl) {}
+  void shutdown();
+
+  std::string create_shutdown_pipe(int *pipe_rd, int *pipe_wr);
+  std::string bind_and_listen(const std::string &sock_path, int *fd);
+
+  void *entry() override;
+  bool do_accept();
+
+  void handle_connection(int fd);
+  void close_connection(int fd);
+
+  int dump_data(int fd);
+
+  CephContext *m_cct;
+  uint64_t data_max_backlog;
+  std::string m_path;
+  int m_sock_fd;
+  int m_shutdown_rd_fd;
+  int m_shutdown_wr_fd;
+  bool going_down;
+
+  uint64_t data_size;
+  uint32_t skipped;
+
+  std::vector<ceph::buffer::list> data;
+
+  ceph::mutex m_lock = ceph::make_mutex("OutputDataSocket::m_lock");
+  ceph::condition_variable cond;
+  ceph::buffer::list delim;
+};
+
+#endif
diff --git a/src/common/PluginRegistry.cc b/src/common/PluginRegistry.cc
new file mode 100644
index 000000000..dd85d64fd
--- /dev/null
+++ b/src/common/PluginRegistry.cc
@@ -0,0 +1,232 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include "PluginRegistry.h"
+#include "ceph_ver.h"
+#include "common/ceph_context.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "include/dlfcn_compat.h"
+
+#define PLUGIN_PREFIX "libceph_"
+#define PLUGIN_SUFFIX SHARED_LIB_SUFFIX
+#define PLUGIN_INIT_FUNCTION "__ceph_plugin_init"
+#define PLUGIN_VERSION_FUNCTION "__ceph_plugin_version"
+
+#define dout_subsys ceph_subsys_context
+
+using std::map;
+using std::string;
+
+namespace ceph {
+
+PluginRegistry::PluginRegistry(CephContext *cct) :
+  cct(cct),
+  loading(false),
+  disable_dlclose(false)
+{
+}
+
+PluginRegistry::~PluginRegistry()
+{
+  // With disable_dlclose set, neither the plugins nor their libraries are
+  // released here.
+  if (disable_dlclose)
+    return;
+
+  for (auto& type_entry : plugins) {
+    for (auto& plugin_entry : type_entry.second) {
+      // Copy the handle out first: it is stored inside the object that is
+      // about to be deleted.
+      void *handle = plugin_entry.second->library;
+      delete plugin_entry.second;
+      dlclose(handle);
+    }
+  }
+}
+
+int PluginRegistry::remove(const std::string& type, const std::string& name)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+
+  std::map<std::string,std::map<std::string,Plugin*> >::iterator i =
+    plugins.find(type);
+  if (i == plugins.end())
+    return -ENOENT;
+  std::map<std::string,Plugin*>::iterator j = i->second.find(name);
+  if (j == i->second.end())
+    return -ENOENT;
+
+  ldout(cct, 1) << __func__ << " " << type << " " << name << dendl;
+  void *library = j->second->library;
+  delete j->second;
+  dlclose(library);
+  i->second.erase(j);
+  if (i->second.empty())
+    plugins.erase(i);
+
+  return 0;
+}
+
+int PluginRegistry::add(const std::string& type,
+			const std::string& name,
+			Plugin* plugin)
+{
+  // Caller holds `lock`; -EEXIST (registry untouched) if (type, name) is taken.
+  ceph_assert(ceph_mutex_is_locked(lock));
+  const auto by_type = plugins.find(type);
+  if (by_type != plugins.end() && by_type->second.count(name))
+    return -EEXIST;
+  ldout(cct, 1) << __func__ << " " << type << " " << name
+		<< " " << plugin << dendl;
+  plugins[type][name] = plugin;
+  return 0;
+}
+
+Plugin *PluginRegistry::get_with_load(const std::string& type,
+          const std::string& name)
+{
+  std::lock_guard l(lock);
+  Plugin* ret = get(type, name);
+  if (!ret) {
+    int err = load(type, name);
+    if (err == 0)
+      ret = get(type, name);
+  } 
+  return ret;
+}
+
+Plugin *PluginRegistry::get(const std::string& type,
+			    const std::string& name)
+{
+  // Look up a registered plugin by (type, name) without loading anything.
+  // The caller must already hold `lock`; returns a null pointer when either
+  // the type or the name is unknown.  Every lookup is logged at level 1.
+  ceph_assert(ceph_mutex_is_locked(lock));
+  Plugin *found = nullptr;
+
+  const auto by_type = plugins.find(type);
+  if (by_type != plugins.end()) {
+    const auto by_name = by_type->second.find(name);
+    if (by_name != by_type->second.end()) {
+      found = by_name->second;
+    }
+  }
+
+  ldout(cct, 1) << __func__ << " " << type << " " << name
+		<< " = " << found << dendl;
+  return found;
+}
+
+int PluginRegistry::load(const std::string &type,
+			 const std::string &name)
+{
+  // dlopen() the plugin for (type, name), check its version, run its init hook.
+  // Returns 0 on success, else a negative errno or the hook's non-zero result.
+  ceph_assert(ceph_mutex_is_locked(lock));
+  ldout(cct, 1) << __func__ << " " << type << " " << name << dendl;
+
+  std::string fname = cct->_conf.get_val<std::string>("plugin_dir") + "/" + type + "/" + PLUGIN_PREFIX
+      + name + PLUGIN_SUFFIX;
+  void *library = dlopen(fname.c_str(), RTLD_NOW);
+  if (!library) {
+    string err1(dlerror());
+    // fall back to plugin_dir
+    fname = cct->_conf.get_val<std::string>("plugin_dir") + "/" + PLUGIN_PREFIX +
+      name + PLUGIN_SUFFIX;
+    library = dlopen(fname.c_str(), RTLD_NOW);
+    if (!library) {
+      lderr(cct) << __func__
+		 << " failed dlopen(): \"" << err1.c_str()
+		 << "\" or \"" << dlerror() << "\""
+		 << dendl;
+      return -EIO;
+    }
+  }
+
+  const char * (*code_version)() =
+    (const char *(*)())dlsym(library, PLUGIN_VERSION_FUNCTION);
+  if (code_version == NULL) {
+    lderr(cct) << __func__ << " code_version == NULL: " << dlerror() << dendl;
+    dlclose(library);  // every failure past dlopen() must release the handle
+    return -EXDEV;
+  }
+  if (code_version() != string(CEPH_GIT_NICE_VER)) {
+    lderr(cct) << __func__ << " plugin " << fname << " version "
+	       << code_version() << " != expected "
+	       << CEPH_GIT_NICE_VER << dendl;
+    dlclose(library);
+    return -EXDEV;
+  }
+
+  // Init hook; it is expected to register the plugin (verified via get() below).
+  typedef int (*init_fn_t)(CephContext *, const std::string&, const std::string&);
+  init_fn_t code_init = (init_fn_t)dlsym(library, PLUGIN_INIT_FUNCTION);
+  if (code_init) {
+    int r = code_init(cct, type, name);
+    if (r != 0) {
+      lderr(cct) << __func__ << " " << fname << " "
+		 << PLUGIN_INIT_FUNCTION << "(" << cct
+		 << "," << type << "," << name << "): " << cpp_strerror(r)
+		 << dendl;
+      dlclose(library);
+      return r;
+    }
+  } else {
+    lderr(cct) << __func__ << " " << fname << " dlsym(" << PLUGIN_INIT_FUNCTION
+	       << "): " << dlerror() << dendl;
+    dlclose(library);
+    return -ENOENT;
+  }
+
+  Plugin *plugin = get(type, name);
+  if (plugin == 0) {
+    lderr(cct) << __func__ << " " << fname << " "
+	       << PLUGIN_INIT_FUNCTION << "()"
+	       << " did not register plugin type " << type << " name " << name
+	       << dendl;
+    dlclose(library);
+    return -EBADF;
+  }
+
+  plugin->library = library;
+
+  ldout(cct, 1) << __func__ << ": " << type << " " << name
+		<< " loaded and registered" << dendl;
+  return 0;
+}
+}
+
+/*
+int ErasureCodePluginRegistry::preload(const std::string &plugins,
+				       const std::string &directory,
+				       ostream &ss)
+{
+  std::lock_guard l(lock);
+  list<string> plugins_list;
+  get_str_list(plugins, plugins_list);
+  for (list<string>::iterator i = plugins_list.begin();
+       i != plugins_list.end();
+       ++i) {
+    ErasureCodePlugin *plugin;
+    int r = load(*i, directory, &plugin, ss);
+    if (r)
+      return r;
+  }
+  return 0;
+}
+*/
diff --git a/src/common/PluginRegistry.h b/src/common/PluginRegistry.h
new file mode 100644
index 000000000..938fb6e16
--- /dev/null
+++ b/src/common/PluginRegistry.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#ifndef CEPH_COMMON_PLUGINREGISTRY_H
+#define CEPH_COMMON_PLUGINREGISTRY_H
+
+#include <map>
+#include <string>
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+
+extern "C" {
+  const char *__ceph_plugin_version();
+  int __ceph_plugin_init(CephContext *cct,
+			 const std::string& type,
+			 const std::string& name);
+}
+
+namespace ceph {
+
+  class Plugin {
+  public:
+    void *library;
+    CephContext *cct;
+
+    explicit Plugin(CephContext *cct) : library(NULL), cct(cct) {}
+    virtual ~Plugin() {}
+  };
+
+  class PluginRegistry {
+  public:
+    CephContext *cct;
+    ceph::mutex lock = ceph::make_mutex("PluginRegistery::lock");
+    bool loading;
+    bool disable_dlclose;
+    std::map<std::string,std::map<std::string,Plugin*> > plugins;
+
+    explicit PluginRegistry(CephContext *cct);
+    ~PluginRegistry();
+
+    int add(const std::string& type, const std::string& name,
+	    Plugin *factory);
+    int remove(const std::string& type, const std::string& name);
+    Plugin *get(const std::string& type, const std::string& name);
+    Plugin *get_with_load(const std::string& type, const std::string& name);
+
+    int load(const std::string& type,
+	     const std::string& name);
+    int preload();
+    int preload(const std::string& type);
+  };
+}
+
+#endif
diff --git a/src/common/Preforker.h b/src/common/Preforker.h
new file mode 100644
index 000000000..d34179b40
--- /dev/null
+++ b/src/common/Preforker.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_COMMON_PREFORKER_H
+#define CEPH_COMMON_PREFORKER_H
+
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sstream>
+
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+/**
+ * pre-fork fork/daemonize helper class
+ *
+ * Hide the details of letting a process fork early, do a bunch of
+ * initialization work that may spam stdout or exit with an error, and
+ * then daemonize.  The exit() method will either exit directly (if we
+ * haven't forked) or pass a message to the parent with the error if
+ * we have.
+ */
+class Preforker {
+  pid_t childpid;
+  bool forked;
+  int fd[2];  // parent's, child's
+
+public:
+  Preforker() : childpid(0), forked(false) {}
+
+  /// Create the socketpair, ignore SIGHUP and fork.
+  /// @return 0 on success (in both processes); -1 with errno and err set.
+  int prefork(std::string &err) {
+    ceph_assert(!forked);
+    std::ostringstream oss;
+    int r = socketpair_cloexec(AF_UNIX, SOCK_STREAM, 0, fd);
+    if (r < 0) {
+      int e = errno;
+      oss << "[" << getpid() << "]: unable to create socketpair: " << cpp_strerror(e);
+      err = oss.str();
+      return (errno = e, -1);
+    }
+
+    struct sigaction sa;
+    sa.sa_handler = SIG_IGN;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = 0;
+    if (sigaction(SIGHUP, &sa, nullptr) != 0) {
+      int e = errno;
+      oss << "[" << getpid() << "]: unable to ignore SIGHUP: " << cpp_strerror(e);
+      err = oss.str();
+      return (errno = e, -1);
+    }
+
+    forked = true;
+
+    childpid = fork();
+    if (childpid < 0) {
+      int e = errno;
+      oss << "[" << getpid() << "]: unable to fork: " << cpp_strerror(e);
+      err = oss.str();
+      return (errno = e, -1);
+    }
+    if (is_child()) {
+      ::close(fd[0]);
+    } else {
+      ::close(fd[1]);
+    }
+    return 0;
+  }
+
+  /// fd[1] (the child's end of the socketpair) once `forked` is set, else 0.
+  int get_signal_fd() const {
+    return forked ? fd[1] : 0;
+  }
+
+  // childpid is 0 in the child, and also as long as no fork has happened.
+  bool is_child() { return childpid == 0; }
+  bool is_parent() { return childpid != 0; }
+
+  /// Parent side: read the child's report from the socketpair.
+  /// @return 0 if the child daemonized or exited with status 0; otherwise
+  ///         non-zero, with a description stored in err_msg.
+  int parent_wait(std::string &err_msg) {
+    ceph_assert(forked);
+
+    int r = -1;
+    std::ostringstream oss;
+    int err = safe_read_exact(fd[0], &r, sizeof(r));
+    if (err == 0 && r == -1) {
+      // daemonize
+      ::close(0);
+      ::close(1);
+      ::close(2);
+    } else if (err) {
+      oss << "[" << getpid() << "]: " << cpp_strerror(err);
+    } else {
+      int status;
+      err = waitpid(childpid, &status, 0);  // wait for child to exit
+      if (err < 0) {
+        err = -errno;  // waitpid() itself only returns -1; errno has the cause
+        oss << "[" << getpid() << "]" << " waitpid error: " << cpp_strerror(err);
+      } else if (WIFSIGNALED(status)) {
+        oss << "[" << getpid() << "]" << " exited with a signal";
+      } else if (!WIFEXITED(status)) {
+        oss << "[" << getpid() << "]" << " did not exit normally";
+      } else {
+        err = WEXITSTATUS(status);
+        if (err != 0)
+         oss << "[" << getpid() << "]" << " returned exit_status " << cpp_strerror(err);
+      }
+    }
+    err_msg = oss.str();
+    return err;
+  }
+
+  /// Send r to the waiting parent over fd[1] (only if forked) and return r.
+  int signal_exit(int r) {
+    if (forked) {
+      /* If we get an error here, it's too late to do anything reasonable about it. */
+      [[maybe_unused]] auto n = safe_write(fd[1], &r, sizeof(r));
+    }
+    return r;
+  }
+  void exit(int r) {
+    if (is_child())
+        signal_exit(r);
+    ::exit(r);
+  }
+
+  /// Send the -1 sentinel that makes parent_wait() close stdio and return 0.
+  void daemonize() {
+    ceph_assert(forked);
+    // Must stay -1 on every call, so it is no longer a mutated static.
+    const int r = -1;
+    [[maybe_unused]] auto n = ::write(fd[1], &r, sizeof(r));
+  }
+};
+
+#endif
diff --git a/src/common/PrioritizedQueue.h b/src/common/PrioritizedQueue.h
new file mode 100644
index 000000000..9adf21aaf
--- /dev/null
+++ b/src/common/PrioritizedQueue.h
@@ -0,0 +1,352 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef PRIORITY_QUEUE_H
+#define PRIORITY_QUEUE_H
+
+#include "include/ceph_assert.h"
+
+#include "common/Formatter.h"
+#include "common/OpQueue.h"
+
+/**
+ * Manages queue for normal and strict priority items
+ *
+ * On dequeue, the queue will select the lowest priority queue
+ * whose token bucket exceeds the cost of that queue's front item.
+ *
+ * If there is no such queue, we choose the next queue item for
+ * the highest priority queue.
+ *
+ * Before returning a dequeued item, we place into each bucket
+ * cost * (priority/total_priority) tokens.
+ *
+ * enqueue_strict and enqueue_strict_front queue items into queues
+ * which are serviced in strict priority order before items queued
+ * with enqueue and enqueue_front
+ *
+ * Within a priority class, we schedule round robin based on the class
+ * of type K used to enqueue items.  e.g. you could use entity_inst_t
+ * to provide fairness for different clients.
+ */
+template <typename T, typename K>
+class PrioritizedQueue : public OpQueue <T, K> {
+  int64_t total_priority;
+  int64_t max_tokens_per_subqueue;
+  int64_t min_cost;
+
+  typedef std::list<std::pair<unsigned, T> > ListPairs;
+
+  struct SubQueue {
+  private:
+    typedef std::map<K, ListPairs> Classes;
+    Classes q;
+    unsigned tokens, max_tokens;
+    int64_t size;
+    typename Classes::iterator cur;
+  public:
+    SubQueue(const SubQueue &other)
+      : q(other.q),
+	tokens(other.tokens),
+	max_tokens(other.max_tokens),
+	size(other.size),
+	cur(q.begin()) {}
+    SubQueue()
+      : tokens(0),
+	max_tokens(0),
+	size(0), cur(q.begin()) {}
+    void set_max_tokens(unsigned mt) {
+      max_tokens = mt;
+    }
+    unsigned get_max_tokens() const {
+      return max_tokens;
+    }
+    unsigned num_tokens() const {
+      return tokens;
+    }
+    void put_tokens(unsigned t) {
+      tokens += t;
+      if (tokens > max_tokens) {
+	tokens = max_tokens;
+      }
+    }
+    void take_tokens(unsigned t) {
+      if (tokens > t) {
+	tokens -= t;
+      } else {
+	tokens = 0;
+      }
+    }
+    void enqueue(K cl, unsigned cost, T &&item) {
+      q[cl].push_back(std::make_pair(cost, std::move(item)));
+      if (cur == q.end())
+	cur = q.begin();
+      size++;
+    }
+    void enqueue_front(K cl, unsigned cost, T &&item) {
+      q[cl].push_front(std::make_pair(cost, std::move(item)));
+      if (cur == q.end())
+	cur = q.begin();
+      size++;
+    }
+    std::pair<unsigned, T> &front() const {
+      ceph_assert(!(q.empty()));
+      ceph_assert(cur != q.end());
+      return cur->second.front();
+    }
+    T pop_front() {
+      ceph_assert(!(q.empty()));
+      ceph_assert(cur != q.end());
+      T ret = std::move(cur->second.front().second);
+      cur->second.pop_front();
+      if (cur->second.empty()) {
+	q.erase(cur++);
+      } else {
+	++cur;
+      }
+      if (cur == q.end()) {
+	cur = q.begin();
+      }
+      size--;
+      return ret;
+    }
+    unsigned length() const {
+      ceph_assert(size >= 0);
+      return (unsigned)size;
+    }
+    bool empty() const {
+      return q.empty();
+    }
+    void remove_by_class(K k, std::list<T> *out) {
+      typename Classes::iterator i = q.find(k);
+      if (i == q.end()) {
+	return;
+      }
+      size -= i->second.size();
+      if (i == cur) {
+	++cur;
+      }
+      if (out) {
+	for (typename ListPairs::reverse_iterator j =
+	       i->second.rbegin();
+	     j != i->second.rend();
+	     ++j) {
+	  out->push_front(std::move(j->second));
+	}
+      }
+      q.erase(i);
+      if (cur == q.end()) {
+	cur = q.begin();
+      }
+    }
+
+    void dump(ceph::Formatter *f) const {
+      f->dump_int("tokens", tokens);
+      f->dump_int("max_tokens", max_tokens);
+      f->dump_int("size", size);
+      f->dump_int("num_keys", q.size());
+      if (!empty()) {
+	f->dump_int("first_item_cost", front().first);
+      }
+    }
+  };
+
+  typedef std::map<unsigned, SubQueue> SubQueues;
+  SubQueues high_queue;
+  SubQueues queue;
+
+  SubQueue *create_queue(unsigned priority) {
+    typename SubQueues::iterator p = queue.find(priority);
+    if (p != queue.end()) {
+      return &p->second;
+    }
+    total_priority += priority;
+    SubQueue *sq = &queue[priority];
+    sq->set_max_tokens(max_tokens_per_subqueue);
+    return sq;
+  }
+
+  void remove_queue(unsigned priority) {
+    ceph_assert(queue.count(priority));
+    queue.erase(priority);
+    total_priority -= priority;
+    ceph_assert(total_priority >= 0);
+  }
+
+  void distribute_tokens(unsigned cost) {
+    if (total_priority == 0) {
+      return;
+    }
+    for (typename SubQueues::iterator i = queue.begin();
+	 i != queue.end();
+	 ++i) {
+      i->second.put_tokens(((i->first * cost) / total_priority) + 1);
+    }
+  }
+
+public:
+  PrioritizedQueue(unsigned max_per, unsigned min_c)
+    : total_priority(0),
+      max_tokens_per_subqueue(max_per),
+      min_cost(min_c)
+  {}
+
+  unsigned length() const {
+    unsigned total = 0;
+    for (typename SubQueues::const_iterator i = queue.begin();
+	 i != queue.end();
+	 ++i) {
+      ceph_assert(i->second.length());
+      total += i->second.length();
+    }
+    for (typename SubQueues::const_iterator i = high_queue.begin();
+	 i != high_queue.end();
+	 ++i) {
+      ceph_assert(i->second.length());
+      total += i->second.length();
+    }
+    return total;
+  }
+
+  void remove_by_class(K k, std::list<T> *out = 0) final {
+    for (typename SubQueues::iterator i = queue.begin();
+	 i != queue.end();
+	 ) {
+      i->second.remove_by_class(k, out);
+      if (i->second.empty()) {
+	unsigned priority = i->first;
+	++i;
+	remove_queue(priority);
+      } else {
+	++i;
+      }
+    }
+    for (typename SubQueues::iterator i = high_queue.begin();
+	 i != high_queue.end();
+	 ) {
+      i->second.remove_by_class(k, out);
+      if (i->second.empty()) {
+	high_queue.erase(i++);
+      } else {
+	++i;
+      }
+    }
+  }
+
+  void enqueue_strict(K cl, unsigned priority, T&& item) final {
+    high_queue[priority].enqueue(cl, 0, std::move(item));
+  }
+
+  void enqueue_strict_front(K cl, unsigned priority, T&& item) final {
+    high_queue[priority].enqueue_front(cl, 0, std::move(item));
+  }
+
+  void enqueue(K cl, unsigned priority, unsigned cost, T&& item) final {
+    if (cost < min_cost)
+      cost = min_cost;
+    if (cost > max_tokens_per_subqueue)
+      cost = max_tokens_per_subqueue;
+    create_queue(priority)->enqueue(cl, cost, std::move(item));
+  }
+
+  void enqueue_front(K cl, unsigned priority, unsigned cost, T&& item) final {
+    if (cost < min_cost)
+      cost = min_cost;
+    if (cost > max_tokens_per_subqueue)
+      cost = max_tokens_per_subqueue;
+    create_queue(priority)->enqueue_front(cl, cost, std::move(item));
+  }
+
+  bool empty() const final {
+    ceph_assert(total_priority >= 0);
+    ceph_assert((total_priority == 0) || !(queue.empty()));
+    return queue.empty() && high_queue.empty();
+  }
+
+  T dequeue() final {
+    ceph_assert(!empty());
+
+    if (!(high_queue.empty())) {
+      T ret = std::move(high_queue.rbegin()->second.front().second);
+      high_queue.rbegin()->second.pop_front();
+      if (high_queue.rbegin()->second.empty()) {
+	high_queue.erase(high_queue.rbegin()->first);
+      }
+      return ret;
+    }
+
+    // if there are multiple buckets/subqueues with sufficient tokens,
+    // we behave like a strict priority queue among all subqueues that
+    // are eligible to run.
+    for (typename SubQueues::iterator i = queue.begin();
+	 i != queue.end();
+	 ++i) {
+      ceph_assert(!(i->second.empty()));
+      if (i->second.front().first < i->second.num_tokens()) {
+	unsigned cost = i->second.front().first;
+	i->second.take_tokens(cost);
+	T ret = std::move(i->second.front().second);
+	i->second.pop_front();
+	if (i->second.empty()) {
+	  remove_queue(i->first);
+	}
+	distribute_tokens(cost);
+	return ret;
+      }
+    }
+
+    // if no subqueues have sufficient tokens, we behave like a strict
+    // priority queue.
+    unsigned cost = queue.rbegin()->second.front().first;
+    T ret = std::move(queue.rbegin()->second.front().second);
+    queue.rbegin()->second.pop_front();
+    if (queue.rbegin()->second.empty()) {
+      remove_queue(queue.rbegin()->first);
+    }
+    distribute_tokens(cost);
+    return ret;
+  }
+
+  void dump(ceph::Formatter *f) const final {
+    f->dump_int("total_priority", total_priority);
+    f->dump_int("max_tokens_per_subqueue", max_tokens_per_subqueue);
+    f->dump_int("min_cost", min_cost);
+    f->open_array_section("high_queues");
+    for (typename SubQueues::const_iterator p = high_queue.begin();
+	 p != high_queue.end();
+	 ++p) {
+      f->open_object_section("subqueue");
+      f->dump_int("priority", p->first);
+      p->second.dump(f);
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("queues");
+    for (typename SubQueues::const_iterator p = queue.begin();
+	 p != queue.end();
+	 ++p) {
+      f->open_object_section("subqueue");
+      f->dump_int("priority", p->first);
+      p->second.dump(f);
+      f->close_section();
+    }
+    f->close_section();
+  }
+
+  void print(std::ostream &ostream) const final {
+    ostream << "PrioritizedQueue";
+  }
+};
+
+#endif
diff --git a/src/common/PriorityCache.cc b/src/common/PriorityCache.cc
new file mode 100644
index 000000000..0fe781b3e
--- /dev/null
+++ b/src/common/PriorityCache.cc
@@ -0,0 +1,406 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "PriorityCache.h"
+#include "common/dout.h"
+#include "perfglue/heap_profiler.h"
+#define dout_context cct
+#define dout_subsys ceph_subsys_prioritycache
+#undef dout_prefix
+#define dout_prefix *_dout << "prioritycache "
+
+namespace PriorityCache
+{
+  // Compute the size to reserve for a cache currently holding `usage` bytes,
+  // given `total_bytes` of memory overall: usage plus 64MB of headroom,
+  // rounded up to a multiple of a chunk size derived from total_bytes.
+  int64_t get_chunk(uint64_t usage, uint64_t total_bytes)
+  {
+    constexpr uint64_t min_chunk = 4ul*1024*1024;
+    constexpr uint64_t max_chunk = 64ul*1024*1024;
+
+    // Round total_bytes up to a power of 2 by smearing the top set bit of
+    // (total_bytes - 1) into every lower position and then adding one.
+    uint64_t pow2 = total_bytes - 1;
+    for (unsigned shift = 1; shift <= 32; shift *= 2) {
+      pow2 |= pow2 >> shift;
+    }
+    pow2 += 1;
+
+    // The chunk is 1/256 of that rounded size, clamped to [4MB, 64MB].
+    uint64_t chunk = pow2 / 256;
+    if (chunk < min_chunk) {
+      chunk = min_chunk;
+    } else if (chunk > max_chunk) {
+      chunk = max_chunk;
+    }
+
+    /* FIXME: the headroom is hardcoded to 64MB.  If RocksDB is used, it's a
+     * good idea to have N MB of headroom where N is the target_file_size_base
+     * value: compaction reads SST files into the block cache, which can evict
+     * everything else and leave the cache empty once compaction finishes.
+     * Enough headroom lets the kv cache keep growing under heavy compaction.
+     */
+    const uint64_t padded = usage + 64*1024*1024;
+    const uint64_t rem = padded % chunk;
+    // Round up to the next chunk boundary unless already aligned.
+    return (rem > 0) ? padded + chunk - rem : padded;
+  }
+
+  Manager::Manager(CephContext *c,      // used for logging and perf counters
+                   uint64_t min,        // lower bound for the tuned memory
+                   uint64_t max,        // upper bound for the tuned memory
+                   uint64_t target,     // mapped-memory target tune_memory() steers by
+                   bool reserve_extra,  // balance() holds back a chunk per cache
+		   const std::string& name) :
+      cct(c),
+      caches{},
+      min_mem(min),
+      max_mem(max),
+      target_mem(target),
+      tuned_mem(min),  // tuning starts from the minimum
+      reserve_extra(reserve_extra),
+      name(name.empty() ? "prioritycache" : name)  // default perf counter name
+  {
+    PerfCountersBuilder b(cct, this->name, MallocStats::M_FIRST, MallocStats::M_LAST);
+    // Manager-wide counters; tune_memory() is what sets their values.
+    b.add_u64(MallocStats::M_TARGET_BYTES, "target_bytes",
+              "target process memory usage in bytes", "t",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(MallocStats::M_MAPPED_BYTES, "mapped_bytes",
+              "total bytes mapped by the process", "m",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(MallocStats::M_UNMAPPED_BYTES, "unmapped_bytes",
+              "unmapped bytes that the kernel has yet to reclaim", "u",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(MallocStats::M_HEAP_BYTES, "heap_bytes",
+              "aggregate bytes in use by the heap", "h",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(MallocStats::M_CACHE_BYTES, "cache_bytes",
+              "current memory available for caches.", "c",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    logger = b.create_perf_counters();
+    cct->get_perfcounters_collection()->add(logger);
+    // Take a first measurement so tuned_mem and the counters are populated.
+    tune_memory();
+  }
+
+  Manager::~Manager()
+  {
+    clear();  // unregisters and deletes the per-cache loggers
+    cct->get_perfcounters_collection()->remove(logger);  // then the manager's own
+    delete logger;
+  }
+
+  // Re-measure the heap and move tuned_mem towards max_mem while the mapped
+  // size is below target_mem, or towards min_mem once it is not.
+  void Manager::tune_memory()
+  {
+    size_t heap_bytes = 0;
+    size_t unmapped_bytes = 0;
+    ceph_heap_release_free_memory();
+    ceph_heap_get_numeric_property("generic.heap_size", &heap_bytes);
+    ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped_bytes);
+    const uint64_t mapped_bytes = heap_bytes - unmapped_bytes;
+
+    // Start from the previous result, clamped to max_mem and then to min_mem.
+    uint64_t updated = tuned_mem;
+    if (updated >= max_mem) {
+      updated = max_mem;
+    }
+    if (updated <= min_mem) {
+      updated = min_mem;
+    }
+
+    // Approach the min/max slowly, but bounce away quickly.
+    if (mapped_bytes < target_mem) {
+      double ratio = 1 - ((double) mapped_bytes / target_mem);
+      updated += ratio * (max_mem - updated);
+    } else {
+      double ratio = 1 - ((double) target_mem / mapped_bytes);
+      updated -= ratio * (updated - min_mem);
+    }
+
+    ldout(cct, 5) << __func__ << " target: " << target_mem
+                  << " mapped: " << mapped_bytes << " unmapped: " << unmapped_bytes
+                  << " heap: " << heap_bytes << " old mem: " << tuned_mem
+                  << " new mem: " << updated << dendl;
+    tuned_mem = updated;
+    logger->set(MallocStats::M_TARGET_BYTES, target_mem);
+    logger->set(MallocStats::M_MAPPED_BYTES, mapped_bytes);
+    logger->set(MallocStats::M_UNMAPPED_BYTES, unmapped_bytes);
+    logger->set(MallocStats::M_HEAP_BYTES, heap_bytes);
+    logger->set(MallocStats::M_CACHE_BYTES, updated);
+  }
+
+  void Manager::insert(const std::string& name, std::shared_ptr<PriCache> c,
+                       bool enable_perf_counters)
+  {
+    ceph_assert(!caches.count(name));  // a name may only be registered once
+    ceph_assert(!indexes.count(name));
+
+    caches.emplace(name, c);
+
+    if (!enable_perf_counters) {
+      return;  // cache is registered but gets no logger/indexes entry
+    }
+
+    // TODO: slots are never recycled, so after handing out more than
+    // PERF_COUNTER_MAX_BOUND - PERF_COUNTER_LOWER_BOUND of them we run out
+    // (the assert below fires).  start and end are *exclusive* bounds: the
+    // counters registered here use keys cur_index .. cur_index + E_LAST.
+    int start = cur_index++;
+    int end = cur_index + Extra::E_LAST + 1;
+
+    ceph_assert(end < PERF_COUNTER_MAX_BOUND);
+    indexes.emplace(name, std::vector<int>(Extra::E_LAST + 1));
+
+    PerfCountersBuilder b(cct, this->name + ":" + name, start, end);
+
+    b.add_u64(cur_index + Priority::PRI0, "pri0_bytes",
+              "bytes allocated to pri0", "p0",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI1, "pri1_bytes",
+              "bytes allocated to pri1", "p1",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI2, "pri2_bytes",
+              "bytes allocated to pri2", "p2",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI3, "pri3_bytes",
+              "bytes allocated to pri3", "p3",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI4, "pri4_bytes",
+              "bytes allocated to pri4", "p4",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI5, "pri5_bytes",
+              "bytes allocated to pri5", "p5",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI6, "pri6_bytes",
+              "bytes allocated to pri6", "p6",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI7, "pri7_bytes",
+              "bytes allocated to pri7", "p7",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI8, "pri8_bytes",
+              "bytes allocated to pri8", "p8",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI9, "pri9_bytes",
+              "bytes allocated to pri9", "p9",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI10, "pri10_bytes",
+              "bytes allocated to pri10", "p10",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI11, "pri11_bytes",
+              "bytes allocated to pri11", "p11",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Extra::E_RESERVED, "reserved_bytes",
+              "bytes reserved for future growth.", "r",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Extra::E_COMMITTED, "committed_bytes",
+              "total bytes committed,", "c",
+              PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+
+    for (int i = 0; i < Extra::E_LAST+1; i++) {
+      indexes[name][i] = cur_index + i;  // enum value -> this cache's counter key
+    }
+
+    auto l = b.create_perf_counters();
+    loggers.emplace(name, l);
+    cct->get_perfcounters_collection()->add(l);
+
+    cur_index = end;  // the next insert() starts from this bound
+  }
+
+  // Forget one cache; its logger, if any, is unregistered and freed first.
+  void Manager::erase(const std::string& name)
+  {
+    if (auto found = loggers.find(name); found != loggers.end()) {
+      cct->get_perfcounters_collection()->remove(found->second);
+      delete found->second;
+      loggers.erase(found);
+    }
+    indexes.erase(name);
+    caches.erase(name);
+  }
+
+  // Unregister and free every per-cache logger, then forget all caches.
+  void Manager::clear()
+  {
+    for (auto &entry : loggers) {
+      cct->get_perfcounters_collection()->remove(entry.second);
+      delete entry.second;
+    }
+    loggers.clear();
+    indexes.clear();
+    caches.clear();
+  }
+
+  // Split tuned_mem between the caches, one priority level at a time, then
+  // commit the result on every cache that has a perf counter logger.
+  void Manager::balance()
+  {
+    int64_t mem_avail = tuned_mem;
+    // Each cache is going to get a little extra from get_chunk, so shrink the
+    // available memory here to compensate.
+    if (reserve_extra) {
+      mem_avail -= get_chunk(1, tuned_mem) * caches.size();
+    }
+
+    // If a chunk per cache already exceeds the budget, continue with 0 so
+    // balance_priority still zeroes each priority's byte counts.
+    if (mem_avail < 0) {
+      mem_avail = 0;
+    }
+
+    // Assign memory for each priority level
+    for (int level = 0; level <= Priority::LAST; ++level) {
+      ldout(cct, 10) << __func__ << " assigning cache bytes for PRI: " << level << dendl;
+      const auto pri = static_cast<Priority>(level);
+      balance_priority(&mem_avail, pri);
+
+      // Publish what each logged cache now holds at this priority.
+      for (auto &[cache_name, counters] : loggers) {
+        const auto found = caches.find(cache_name);
+        ceph_assert(found != caches.end());
+        counters->set(indexes[cache_name][pri],
+                      found->second->get_cache_bytes(pri));
+      }
+    }
+    // assert if we assigned more memory than is available.
+    ceph_assert(mem_avail >= 0);
+
+    for (auto &[cache_name, counters] : loggers) {
+      const auto found = caches.find(cache_name);
+      ceph_assert(found != caches.end());
+      PriCache &cache = *found->second;
+      // Commit, then report reserved = committed - allocated.  NOTE(review):
+      // caches inserted without perf counters have no logger and are skipped
+      // by this loop (never committed) -- confirm that is intended.
+      const int64_t committed = cache.commit_cache_size(tuned_mem);
+      const int64_t alloc = cache.get_cache_bytes();
+      auto &slots = indexes[cache_name];
+      counters->set(slots[Extra::E_RESERVED], committed - alloc);
+      counters->set(slots[Extra::E_COMMITTED], committed);
+    }
+  }
+
+  void Manager::shift_bins() {  // rotate bins of every cache that has a logger
+    for (auto &l : loggers) {
+      auto it = caches.find(l.first);
+      ceph_assert(it != caches.end());  // invariant balance() also asserts
+      it->second->shift_bins();
+    }
+  }
+
+  void Manager::balance_priority(int64_t *mem_avail, Priority pri)
+  {
+    std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
+    double cur_ratios = 0;  // summed ratios of the caches still competing
+    double new_ratios = 0;  // same sum, accumulated for the next round
+    uint64_t round = 0;     // only used in the debug log
+
+    // First, zero this priority's bytes and sum the initial ratios.
+    for (auto it = caches.begin(); it != caches.end(); it++) {
+      it->second->set_cache_bytes(pri, 0);
+      cur_ratios += it->second->get_cache_ratio();
+    }
+
+    // Hand out memory in rounds until every cache is satisfied or we run out
+    // of memory (stop if we can't guarantee a full byte per remaining cache).
+    while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
+      uint64_t total_assigned = 0;
+      for (auto it = tmp_caches.begin(); it != tmp_caches.end();) {
+        int64_t cache_wants = it->second->request_cache_bytes(pri, tuned_mem);
+        // Usually the ratio should be set to the fraction of the current caches'
+        // assigned ratio compared to the total ratio of all caches that still
+        // want memory.  There is a special case where the only caches left are
+        // all assigned 0% ratios but still want memory.  In that case, give
+        // them an equal shot at the remaining memory for this priority.
+        double ratio = 1.0 / tmp_caches.size();
+        if (cur_ratios > 0) {
+          ratio = it->second->get_cache_ratio() / cur_ratios;
+        }
+        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
+
+        ldout(cct, 10) << __func__ << " " << it->first
+                       << " pri: " << (int) pri
+                       << " round: " << round
+                       << " wanted: " << cache_wants
+                       << " ratio: " << it->second->get_cache_ratio()
+                       << " cur_ratios: " << cur_ratios
+                       << " fair_share: " << fair_share
+                       << " mem_avail: " << *mem_avail
+                       << dendl;
+
+        if (cache_wants > fair_share) {
+          // If we want too much, take what we can get but stick around for more
+          it->second->add_cache_bytes(pri, fair_share);
+          total_assigned += fair_share;
+          new_ratios += it->second->get_cache_ratio();
+          ++it;
+        } else {
+          // Otherwise assign only what we want
+          if (cache_wants > 0) {
+            it->second->add_cache_bytes(pri, cache_wants);
+            total_assigned += cache_wants;
+          }
+          // Either the cache didn't want anything or got what it wanted, so
+          // remove it from the tmp list.
+          it = tmp_caches.erase(it);
+        }
+      }
+      // Charge this round's grants; only still-waiting caches keep a ratio.
+      *mem_avail -= total_assigned;
+      cur_ratios = new_ratios;
+      new_ratios = 0;
+      ++round;
+    }
+
+    // If this is the last priority, divide up any remaining memory based
+    // solely on the ratios (each share is computed from the same *mem_avail).
+    if (pri == Priority::LAST) {
+      uint64_t total_assigned = 0;
+      for (auto it = caches.begin(); it != caches.end(); it++) {
+        double ratio = it->second->get_cache_ratio();
+        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
+        it->second->set_cache_bytes(Priority::LAST, fair_share);  // NOTE(review): set_, not add_ -- replaces the loop's LAST grant? confirm
+        total_assigned += fair_share;
+      }
+      *mem_avail -= total_assigned;
+      return;
+    }
+  }
+
+  PriCache::~PriCache()  // out-of-line definition of the virtual destructor
+  {
+  }
+}
diff --git a/src/common/PriorityCache.h b/src/common/PriorityCache.h
new file mode 100644
index 000000000..8233d0ecf
--- /dev/null
+++ b/src/common/PriorityCache.h
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PRIORITY_CACHE_H
+#define CEPH_PRIORITY_CACHE_H
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include <memory>
+#include <unordered_map>
+#include "common/perf_counters.h"
+#include "include/ceph_assert.h"
+
+namespace PriorityCache {
+  // Reserve 16384 slots for PriorityCache perf counters (MAX - LOWER = 16384)
+  const int PERF_COUNTER_LOWER_BOUND = 1073741824;  // 2^30
+  const int PERF_COUNTER_MAX_BOUND = 1073758208;    // 2^30 + 16384
+
+  enum MallocStats {  // keys of the Manager-wide perf counters
+    M_FIRST = PERF_COUNTER_LOWER_BOUND,  // builder's lower bound, not a counter
+    M_TARGET_BYTES,
+    M_MAPPED_BYTES,
+    M_UNMAPPED_BYTES,
+    M_HEAP_BYTES,
+    M_CACHE_BYTES,
+    M_LAST,  // builder's upper bound; Manager::cur_index starts here
+  };
+
+  enum Priority {  // priority levels, PRI0 = 0 up to LAST
+    PRI0,
+    PRI1,
+    PRI2,
+    PRI3,
+    PRI4,
+    PRI5,
+    PRI6,
+    PRI7,
+    PRI8,
+    PRI9,
+    PRI10,
+    PRI11,
+    LAST = PRI11,  // alias of the final level, used as the loop bound
+  };
+
+  enum Extra {  // per-cache counter slots that follow the Priority slots
+    E_RESERVED = Priority::LAST+1,
+    E_COMMITTED,
+    E_LAST = E_COMMITTED,  // highest per-cache slot
+  };
+
+  int64_t get_chunk(uint64_t usage, uint64_t total_bytes);
+
+  struct PriCache {  // interface a cache implements to be balanced by Manager
+    virtual ~PriCache();
+
+    /* Ask the cache to request memory for the given priority. Note that the
+     * cache may ultimately be allocated less memory than it requests here.
+     */
+    virtual int64_t request_cache_bytes(PriorityCache::Priority pri, uint64_t total_cache) const = 0;
+
+    // Get the number of bytes currently allocated to the given priority.
+    virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const = 0;
+
+    // Get the number of bytes currently allocated to all priorities.
+    virtual int64_t get_cache_bytes() const = 0;
+
+    // Set the number of bytes allocated to a given priority.
+    virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) = 0;
+
+    // Allocate additional bytes for a given priority.
+    virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) = 0;
+
+    /* Commit the current number of bytes allocated to the cache.  Space is
+     * allocated in chunks based on the allocation size and current total size
+     * of memory available for caches.  Returns the committed size. */
+    virtual int64_t commit_cache_size(uint64_t total_cache) = 0;
+
+    /* Get the current number of bytes allocated to the cache.  This may be
+     * larger than the value returned by get_cache_bytes as it includes extra
+     * space for future growth. */
+    virtual int64_t get_committed_size() const = 0;
+
+    // Get the ratio of available memory this cache should target.
+    virtual double get_cache_ratio() const = 0;
+
+    // Set the ratio of available memory this cache should target.
+    virtual void set_cache_ratio(double ratio) = 0;
+
+    // Get the name of this cache.
+    virtual std::string get_cache_name() const = 0;
+
+    // Rotate the bins (Manager::shift_bins() calls this on each logged cache).
+    virtual void shift_bins() = 0;
+
+    // Import user bins (from PRI1 to LAST-1)
+    virtual void import_bins(const std::vector<uint64_t> &bins) = 0;
+
+    // Set bins (PRI0 and LAST should be ignored)
+    virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) = 0;
+
+    // Get the bins value previously set for the given priority.
+    virtual uint64_t get_bins(PriorityCache::Priority pri) const = 0;
+  };
+
+  class Manager {  // tunes a memory budget and divides it between PriCaches
+    CephContext* cct = nullptr;
+    PerfCounters* logger;  // manager-wide counters, created in the constructor
+    std::unordered_map<std::string, PerfCounters*> loggers;  // per-cache, owned
+    std::unordered_map<std::string, std::vector<int>> indexes;  // enum -> counter key
+    std::unordered_map<std::string, std::shared_ptr<PriCache>> caches;
+
+    // Start perf counter slots after the malloc stats.
+    int cur_index = MallocStats::M_LAST;
+
+    uint64_t min_mem = 0;
+    uint64_t max_mem = 0;
+    uint64_t target_mem = 0;
+    uint64_t tuned_mem = 0;  // current budget, updated by tune_memory()
+    bool reserve_extra;
+    std::string name;
+  public:
+    Manager(CephContext *c, uint64_t min, uint64_t max, uint64_t target,
+            bool reserve_extra, const std::string& name = std::string());
+    ~Manager();
+    void set_min_memory(uint64_t min) {  // takes effect on the next tune_memory()
+      min_mem = min;
+    }
+    void set_max_memory(uint64_t max) {
+      max_mem = max;
+    }
+    void set_target_memory(uint64_t target) {
+      target_mem = target;
+    }
+    uint64_t get_tuned_mem() const {
+      return tuned_mem;
+    }
+    void insert(const std::string& name, const std::shared_ptr<PriCache> c,
+                bool enable_perf_counters);
+    void erase(const std::string& name);
+    void clear();
+    void tune_memory();
+    void balance();
+    void shift_bins();
+  private:
+    void balance_priority(int64_t *mem_avail, Priority pri);  // one level of balance()
+  };
+}
+
+#endif
diff --git a/src/common/QueueRing.h b/src/common/QueueRing.h
new file mode 100644
index 000000000..af5c47be2
--- /dev/null
+++ b/src/common/QueueRing.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef QUEUE_RING_H
+#define QUEUE_RING_H
+
+#include "common/ceph_mutex.h"
+
+#include <list>
+#include <atomic>
+#include <vector>
+
+// Ring of independently locked FIFO buckets; each call advances to the next.
+template <class T>
+class QueueRing {
+  struct QueueBucket {
+    ceph::mutex lock = ceph::make_mutex("QueueRing::QueueBucket::lock");
+    ceph::condition_variable cond;
+    typename std::list<T> entries;
+
+    QueueBucket() {}
+    QueueBucket(const QueueBucket& rhs) {  // copies entries only; rhs is not locked
+      entries = rhs.entries;
+    }
+
+    void enqueue(const T& entry) {
+      // RAII guard: the mutex is released even if push_back() throws.
+      std::lock_guard l(lock);
+      if (entries.empty()) {
+        cond.notify_all();  // wake consumers blocked on an empty bucket
+      }
+      entries.push_back(entry);
+    }
+
+    void dequeue(T *entry) {  // blocks until this bucket has an entry
+      std::unique_lock l(lock);
+      while (entries.empty()) {
+        cond.wait(l);
+      }
+      ceph_assert(!entries.empty());
+      *entry = entries.front();
+      entries.pop_front();
+    }
+  };
+
+  std::vector<QueueBucket> buckets;
+  int num_buckets;
+
+  std::atomic<int64_t> cur_read_bucket = { 0 };   // consumers' cursor
+  std::atomic<int64_t> cur_write_bucket = { 0 };  // producers' cursor
+
+public:
+  QueueRing(int n) : buckets(n), num_buckets(n) {}  // n buckets, fixed for life
+
+  void enqueue(const T& entry) {  // append to the next bucket in write order
+    buckets[++cur_write_bucket % num_buckets].enqueue(entry);
+  }
+
+  void dequeue(T *entry) {  // pop from the next bucket in read order; may block
+    buckets[++cur_read_bucket % num_buckets].dequeue(entry);
+  }
+};
+
+#endif
diff --git a/src/common/RWLock.h b/src/common/RWLock.h
new file mode 100644
index 000000000..08c8edc7b
--- /dev/null
+++ b/src/common/RWLock.h
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef CEPH_RWLock_Posix__H
+#define CEPH_RWLock_Posix__H
+
+#include <pthread.h>
+#include <string>
+#include "include/ceph_assert.h"
+#include "acconfig.h"
+#include "lockdep.h"
+#include "common/valgrind.h"
+
+#include <atomic>
+
+class RWLock final  // pthread rwlock wrapper with optional hold tracking and lockdep hooks
+{
+  mutable pthread_rwlock_t L;  // the underlying POSIX lock
+  std::string name;            // name passed to the lockdep calls
+  mutable int id;              // lockdep id; -1 until registered
+  mutable std::atomic<unsigned> nrlock = { 0 }, nwlock = { 0 };  // holder counts, kept only if track
+  bool track, lockdep;         // enable holder counting / lockdep calls
+
+  std::string unique_name(const char* name) const;
+
+public:
+  RWLock(const RWLock& other) = delete;
+  const RWLock& operator=(const RWLock& other) = delete;
+
+  RWLock(const std::string &n, bool track_lock=true, bool ld=true, bool prioritize_write=false)
+    : name(n), id(-1), track(track_lock),
+      lockdep(ld) {
+#if defined(HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP)
+    if (prioritize_write) {
+      pthread_rwlockattr_t attr;
+      pthread_rwlockattr_init(&attr);
+      // PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP
+      //   Setting the lock kind to this avoids writer starvation as long as
+      //   any read locking is not done in a recursive fashion.
+      pthread_rwlockattr_setkind_np(&attr,
+          PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+      pthread_rwlock_init(&L, &attr);
+      pthread_rwlockattr_destroy(&attr);
+    } else 
+#endif 
+    // The braces let this block act as the else-branch of the if above
+    {
+      pthread_rwlock_init(&L, NULL);
+    }
+    ANNOTATE_BENIGN_RACE_SIZED(&id, sizeof(id), "RWLock lockdep id");
+    ANNOTATE_BENIGN_RACE_SIZED(&nrlock, sizeof(nrlock), "RWlock nrlock");
+    ANNOTATE_BENIGN_RACE_SIZED(&nwlock, sizeof(nwlock), "RWlock nwlock");
+    if (lockdep && g_lockdep) id = lockdep_register(name.c_str());
+  }
+
+  bool is_locked() const {  // only valid when tracking is enabled
+    ceph_assert(track);
+    return (nrlock > 0) || (nwlock > 0);
+  }
+
+  bool is_wlocked() const {  // only valid when tracking is enabled
+    ceph_assert(track);
+    return (nwlock > 0);
+  }
+  ~RWLock() {
+    // The following check is racy but we are about to destroy
+    // the object and we assume that there are no other users.
+    if (track)
+      ceph_assert(!is_locked());
+    pthread_rwlock_destroy(&L);
+    if (lockdep && g_lockdep) {
+      lockdep_unregister(id);
+    }
+  }
+  // Release the lock in whichever mode it is held.  The parameter shadows the
+  // member: lockdep is notified only if both the argument and the member are set.
+  void unlock(bool lockdep=true) const {
+    if (track) {
+      if (nwlock > 0) {  // a tracked writer is assumed to be the one unlocking
+        nwlock--;
+      } else {
+        ceph_assert(nrlock > 0);
+        nrlock--;
+      }
+    }
+    if (lockdep && this->lockdep && g_lockdep)
+      id = lockdep_will_unlock(name.c_str(), id);
+    int r = pthread_rwlock_unlock(&L);
+    ceph_assert(r == 0);
+  }
+
+  // read
+  void get_read() const {  // blocking shared acquire
+    if (lockdep && g_lockdep) id = lockdep_will_lock(name.c_str(), id);
+    int r = pthread_rwlock_rdlock(&L);
+    ceph_assert(r == 0);
+    if (lockdep && g_lockdep) id = lockdep_locked(name.c_str(), id);
+    if (track)
+      nrlock++;
+  }
+  bool try_get_read() const {  // non-blocking; true if the read lock was taken
+    if (pthread_rwlock_tryrdlock(&L) == 0) {
+      if (track)
+         nrlock++;
+      if (lockdep && g_lockdep) id = lockdep_locked(name.c_str(), id);
+      return true;
+    }
+    return false;
+  }
+  void put_read() const {
+    unlock();
+  }
+  void lock_shared() {  // same as get_read()
+    get_read();
+  }
+  void unlock_shared() {  // same as put_read()
+    put_read();
+  }
+  // write
+  void get_write(bool lockdep=true) {  // blocking exclusive acquire
+    if (lockdep && this->lockdep && g_lockdep)
+      id = lockdep_will_lock(name.c_str(), id);
+    int r = pthread_rwlock_wrlock(&L);
+    ceph_assert(r == 0);
+    if (lockdep && this->lockdep && g_lockdep)
+      id = lockdep_locked(name.c_str(), id);
+    if (track)
+      nwlock++;
+
+  }
+  bool try_get_write(bool lockdep=true) {  // non-blocking; true if the write lock was taken
+    if (pthread_rwlock_trywrlock(&L) == 0) {
+      if (lockdep && this->lockdep && g_lockdep)
+	id = lockdep_locked(name.c_str(), id);
+      if (track)
+         nwlock++;
+      return true;
+    }
+    return false;
+  }
+  void put_write() {
+    unlock();
+  }
+  void lock() {  // same as get_write()
+    get_write();
+  }
+  void get(bool for_write) {  // acquire in the mode selected by for_write
+    if (for_write) {
+      get_write();
+    } else {
+      get_read();
+    }
+  }
+
+public:
+  class RLocker {  // scoped read lock: taken in the constructor, released in the destructor
+    const RWLock &m_lock;
+
+    bool locked;  // false after an explicit unlock()
+
+  public:
+   explicit  RLocker(const RWLock& lock) : m_lock(lock) {
+      m_lock.get_read();
+      locked = true;
+    }
+    void unlock() {  // early release; must still be locked
+      ceph_assert(locked);
+      m_lock.unlock();
+      locked = false;
+    }
+    ~RLocker() {
+      if (locked) {
+        m_lock.unlock();
+      }
+    }
+  };
+
+  class WLocker {  // scoped write lock: taken in the constructor, released in the destructor
+    RWLock &m_lock;
+
+    bool locked;  // false after an explicit unlock()
+
+  public:
+    explicit WLocker(RWLock& lock) : m_lock(lock) {
+      m_lock.get_write();
+      locked = true;
+    }
+    void unlock() {  // early release; must still be locked
+      ceph_assert(locked);
+      m_lock.unlock();
+      locked = false;
+    }
+    ~WLocker() {
+      if (locked) {
+        m_lock.unlock();
+      }
+    }
+  };
+
+  class Context {  // records how a lock is held; has no destructor, so nothing is released automatically
+    RWLock& lock;
+
+  public:
+    enum LockState {
+      Untaken = 0,
+      TakenForRead = 1,
+      TakenForWrite = 2,
+    };
+
+  private:
+    LockState state;
+
+  public:
+    explicit Context(RWLock& l) : lock(l), state(Untaken) {}
+    Context(RWLock& l, LockState s) : lock(l), state(s) {}  // adopt a state without locking
+
+    void get_write() {
+      ceph_assert(state == Untaken);
+
+      lock.get_write();
+      state = TakenForWrite;
+    }
+
+    void get_read() {
+      ceph_assert(state == Untaken);
+
+      lock.get_read();
+      state = TakenForRead;
+    }
+
+    void unlock() {
+      ceph_assert(state != Untaken);
+      lock.unlock();
+      state = Untaken;
+    }
+
+    void promote() {  // not atomic: the read lock is dropped before the write lock is taken
+      ceph_assert(state == TakenForRead);
+      unlock();
+      get_write();
+    }
+
+    LockState get_state() { return state; }
+    void set_state(LockState s) {  // changes the recorded state only; the lock is untouched
+      state = s;
+    }
+
+    bool is_locked() {
+      return (state != Untaken);
+    }
+
+    bool is_rlocked() {
+      return (state == TakenForRead);
+    }
+
+    bool is_wlocked() {
+      return (state == TakenForWrite);
+    }
+  };
+};
+
+#endif // !CEPH_RWLock_Posix__H
diff --git a/src/common/Readahead.cc b/src/common/Readahead.cc
new file mode 100644
index 000000000..5ce820fed
--- /dev/null
+++ b/src/common/Readahead.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Readahead.h"
+#include "common/Cond.h"
+
+using std::vector;
+
+Readahead::Readahead()
+  : m_trigger_requests(10),  // consecutive sequential reads needed to trigger
+    m_readahead_min_bytes(0),
+    m_readahead_max_bytes(NO_LIMIT),
+    m_alignments(),
+    m_nr_consec_read(0),
+    m_consec_read_bytes(0),
+    m_last_pos(0),  // end offset of the most recently observed read
+    m_readahead_pos(0),
+    m_readahead_trigger_pos(0),
+    m_readahead_size(0),  // 0 means no readahead has been triggered yet
+    m_pending(0) {
+}
+
+Readahead::~Readahead() {  // nothing to release explicitly
+}
+
+// Record every extent in `extents` as a read, then return the readahead
+// extent to issue, or (0, 0) once either position has reached `limit`.
+Readahead::extent_t Readahead::update(const vector<extent_t>& extents, uint64_t limit) {
+  std::lock_guard locker(m_lock);
+  for (const extent_t& ext : extents) {
+    _observe_read(ext.first, ext.second);
+  }
+  const bool at_limit = (m_readahead_pos >= limit) || (m_last_pos >= limit);
+  if (at_limit) {
+    return extent_t(0, 0);
+  }
+  return _compute_readahead(limit);
+}
+
+// Single-read variant: observe one read of `length` bytes at `offset`.
+Readahead::extent_t Readahead::update(uint64_t offset, uint64_t length, uint64_t limit) {
+  std::lock_guard locker(m_lock);
+  _observe_read(offset, length);
+  // Nothing to read ahead once either position has reached the limit.
+  const bool at_limit = (m_readahead_pos >= limit) || (m_last_pos >= limit);
+  if (at_limit) {
+    return extent_t(0, 0);
+  }
+  return _compute_readahead(limit);
+}
+
+void Readahead::_observe_read(uint64_t offset, uint64_t length) {
+  if (offset != m_last_pos) {  // not contiguous with the previous read
+    m_readahead_pos = 0;       // forget all sequential/readahead state
+    m_readahead_size = 0;
+    m_readahead_trigger_pos = 0;
+    m_consec_read_bytes = 0;
+    m_nr_consec_read = 0;
+  } else {                     // read starts where the last one ended
+    m_consec_read_bytes += length;
+    ++m_nr_consec_read;
+  }
+  m_last_pos = offset + length;  // remember where this read ended
+}
+
+Readahead::extent_t Readahead::_compute_readahead(uint64_t limit) {
+  uint64_t readahead_offset = 0;  // (0, 0) is returned when nothing triggers
+  uint64_t readahead_length = 0;
+  if (m_nr_consec_read >= m_trigger_requests) {
+    // currently reading sequentially
+    if (m_last_pos >= m_readahead_trigger_pos) {
+      // the reader has reached the trigger point: need to read ahead
+      if (m_readahead_size == 0) {
+	// initial readahead trigger: start with the bytes read sequentially so far
+	m_readahead_size = m_consec_read_bytes;
+	m_readahead_pos = m_last_pos;
+      } else {
+	// continuing readahead trigger: double the window each time
+	m_readahead_size *= 2;
+	if (m_last_pos > m_readahead_pos) {
+	  m_readahead_pos = m_last_pos;
+	}
+      }
+      m_readahead_size = std::max(m_readahead_size, m_readahead_min_bytes);
+      m_readahead_size = std::min(m_readahead_size, m_readahead_max_bytes);
+      readahead_offset = m_readahead_pos;
+      readahead_length = m_readahead_size;
+
+      // Snap to the first alignment possible
+      uint64_t readahead_end = readahead_offset + readahead_length;
+      for (vector<uint64_t>::iterator p = m_alignments.begin(); p != m_alignments.end(); ++p) {
+	// Align the end of the readahead, if possible.
+	uint64_t alignment = *p;
+	uint64_t align_prev = readahead_end / alignment * alignment;  // NOTE(review): assumes alignment != 0
+	uint64_t align_next = align_prev + alignment;
+	uint64_t dist_prev = readahead_end - align_prev;
+	uint64_t dist_next = align_next - readahead_end;
+	if (dist_prev < readahead_length / 2 && dist_prev < dist_next) {
+	  // we can snap to the previous alignment point by a less than 50% reduction in size
+	  ceph_assert(align_prev > readahead_offset);
+	  readahead_length = align_prev - readahead_offset;
+	  break;
+	} else if(dist_next < readahead_length / 2) {
+	  // we can snap to the next alignment point by a less than 50% increase in size
+	  ceph_assert(align_next > readahead_offset);
+	  readahead_length = align_next - readahead_offset;
+	  break;
+	}
+	// Note that m_readahead_size should remain unadjusted.
+      }
+
+      if (m_readahead_pos + readahead_length > limit) {
+	readahead_length = limit - m_readahead_pos;  // never read past limit
+      }
+
+      m_readahead_trigger_pos = m_readahead_pos + readahead_length / 2;  // next trigger: halfway into this extent
+      m_readahead_pos += readahead_length;
+    }
+  }
+  return extent_t(readahead_offset, readahead_length);
+}
+
+// Add `count` (which must be positive) to the pending counter.
+void Readahead::inc_pending(int count) {
+  ceph_assert(count > 0);
+  std::lock_guard locker(m_pending_lock);
+  m_pending += count;
+}
+
+// Drop `count` pending ops; at zero, complete all waiters outside the lock.
+void Readahead::dec_pending(int count) {
+  ceph_assert(count > 0);
+  std::list<Context *> ready;
+  {
+    std::lock_guard locker(m_pending_lock);
+    ceph_assert(m_pending >= count);
+    m_pending -= count;
+    if (m_pending == 0) {
+      ready = std::move(m_pending_waiting);
+    }
+  }
+  for (Context *waiter : ready) {
+    waiter->complete(0);
+  }
+}
+
+void Readahead::wait_for_pending() {  // synchronous form of the Context overload
+  C_SaferCond ctx;
+  wait_for_pending(&ctx);  // ctx is completed once nothing is pending
+  ctx.wait();  // presumably blocks until ctx is completed -- see C_SaferCond
+}
+
+void Readahead::wait_for_pending(Context *ctx) {
+  {
+    std::lock_guard locker(m_pending_lock);
+    if (m_pending > 0) {
+      // Queue under the lock: dec_pending() drains this list while holding it.
+      m_pending_waiting.push_back(ctx);  // completed when the count hits zero
+      return;
+    }
+  }
+  ctx->complete(0);  // nothing pending: complete right away, outside the lock
+}
+// Set how many consecutive sequential reads are needed to trigger readahead.
+void Readahead::set_trigger_requests(int trigger_requests) {
+  std::lock_guard locker(m_lock);
+  m_trigger_requests = trigger_requests;
+}
+
+uint64_t Readahead::get_min_readahead_size(void) {  // read under m_lock
+  std::lock_guard lock(m_lock);
+  return m_readahead_min_bytes;
+}
+
+uint64_t Readahead::get_max_readahead_size(void) {  // read under m_lock
+  std::lock_guard lock(m_lock);
+  return m_readahead_max_bytes;
+}
+
+// Set the lower bound applied to the computed readahead size.
+void Readahead::set_min_readahead_size(uint64_t min_readahead_size) {
+  std::lock_guard locker(m_lock);
+  m_readahead_min_bytes = min_readahead_size;
+}
+
+// Set the upper bound applied to the computed readahead size.
+void Readahead::set_max_readahead_size(uint64_t max_readahead_size) {
+  std::lock_guard locker(m_lock);
+  m_readahead_max_bytes = max_readahead_size;
+}
+
+// Replace the alignments that _compute_readahead() tries to snap to.
+void Readahead::set_alignments(const vector<uint64_t> &alignments) {
+  std::lock_guard locker(m_lock);
+  m_alignments = alignments;
+}
diff --git a/src/common/Readahead.h b/src/common/Readahead.h
new file mode 100644
index 000000000..716e58cd3
--- /dev/null
+++ b/src/common/Readahead.h
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_READAHEAD_H
+#define CEPH_READAHEAD_H
+
+#include <list>
+#include <vector>
+
+#include "include/Context.h"
+#include "common/ceph_mutex.h"
+
+/**
+   This class provides common state and logic for code that needs to perform readahead
+   on linear things such as RBD images or files.
+   Unless otherwise specified, all methods are thread-safe.
+
+   Minimum and maximum readahead sizes may be violated by up to 50\% if alignment is enabled.
+   Minimum readahead size may be violated if the end of the readahead target is reached.
+ */
+class Readahead {
+public:
+  /// An (offset, length) pair describing a byte range.
+  typedef std::pair<uint64_t, uint64_t> extent_t;
+
+  // equal to UINT64_MAX
+  static const uint64_t NO_LIMIT = 18446744073709551615ULL;
+
+  Readahead();
+
+  ~Readahead();
+
+  /**
+     Update state with new reads and return readahead to be performed.
+     If the length of the returned extent is 0, no readahead should be performed.
+     The readahead extent is guaranteed not to pass \c limit.
+
+     Note that passing in NO_LIMIT as the limit and truncating the returned extent
+     is not the same as passing in the correct limit, because the internal state
+     will differ in the two cases.
+
+     @param extents read operations since last call to update
+     @param limit size of the thing readahead is being applied to
+   */
+  extent_t update(const std::vector<extent_t>& extents, uint64_t limit);
+
+  /**
+     Update state with a new read and return readahead to be performed.
+     If the length of the returned extent is 0, no readahead should be performed.
+     The readahead extent is guaranteed not to pass \c limit.
+
+     Note that passing in NO_LIMIT as the limit and truncating the returned extent
+     is not the same as passing in the correct limit, because the internal state
+     will differ in the two cases.
+
+     @param offset offset of the read operation
+     @param length length of the read operation
+     @param limit size of the thing readahead is being applied to
+   */
+  extent_t update(uint64_t offset, uint64_t length, uint64_t limit);
+
+  /**
+     Increment the pending counter.
+
+     @param count amount to add; must be greater than 0
+   */
+  void inc_pending(int count = 1);
+
+  /**
+     Decrement the pending counter.
+     The counter must not be decremented below 0.
+     When the counter reaches 0, all contexts registered through
+     wait_for_pending(Context*) are completed with result 0.
+
+     @param count amount to subtract; must be greater than 0
+   */
+  void dec_pending(int count = 1);
+
+  /**
+     Waits until the pending count reaches 0.
+
+     The overload without arguments waits in the calling thread.
+     The Context overload does not wait: \c ctx is completed with result 0
+     immediately if nothing is pending, otherwise when the count reaches 0.
+   */
+  void wait_for_pending();
+  void wait_for_pending(Context *ctx);
+
+  /**
+     Sets the number of sequential requests necessary to trigger readahead.
+   */
+  void set_trigger_requests(int trigger_requests);
+
+  /**
+     Gets the minimum size of a readahead request, in bytes.
+   */
+  uint64_t get_min_readahead_size(void);
+
+  /**
+     Gets the maximum size of a readahead request, in bytes.
+   */
+  uint64_t get_max_readahead_size(void);
+
+  /**
+     Sets the minimum size of a readahead request, in bytes.
+   */
+  void set_min_readahead_size(uint64_t min_readahead_size);
+
+  /**
+     Sets the maximum size of a readahead request, in bytes.
+   */
+  void set_max_readahead_size(uint64_t max_readahead_size);
+
+  /**
+     Sets the alignment units.
+     If the end point of a readahead request can be aligned to an alignment unit
+     by increasing or decreasing the size of the request by 50\% or less, it will.
+     Alignments are tested in order, so larger numbers should almost always come first.
+   */
+  void set_alignments(const std::vector<uint64_t> &alignments);
+
+private:
+  /**
+     Records that a read request has been received.
+     m_lock must be held while calling.
+   */
+  void _observe_read(uint64_t offset, uint64_t length);
+
+  /**
+     Computes the next readahead request.
+     m_lock must be held while calling.
+  */
+  extent_t _compute_readahead(uint64_t limit);
+
+  /// Number of sequential requests necessary to trigger readahead
+  int m_trigger_requests;
+
+  /// Minimum size of a readahead request, in bytes
+  uint64_t m_readahead_min_bytes;
+
+  /// Maximum size of a readahead request, in bytes
+  uint64_t m_readahead_max_bytes;
+
+  /// Alignment units, in bytes
+  std::vector<uint64_t> m_alignments;
+
+  /// Held while reading/modifying any state except m_pending
+  ceph::mutex m_lock = ceph::make_mutex("Readahead::m_lock");
+
+  /// Number of consecutive read requests in the current sequential stream
+  int m_nr_consec_read;
+
+  /// Number of bytes read in the current sequential stream
+  uint64_t m_consec_read_bytes;
+
+  /// Position of the read stream
+  uint64_t m_last_pos;
+
+  /// Position of the readahead stream
+  uint64_t m_readahead_pos;
+
+  /// When readahead is already triggered and the read stream crosses this point, readahead is continued
+  uint64_t m_readahead_trigger_pos;
+
+  /// Size of the next readahead request (barring changes due to alignment, etc.)
+  uint64_t m_readahead_size;
+
+  /// Number of pending readahead requests, as determined by inc_pending() and dec_pending()
+  int m_pending;
+
+  /// Lock for m_pending
+  ceph::mutex m_pending_lock = ceph::make_mutex("Readahead::m_pending_lock");
+
+  /// Waiters for pending readahead, completed when m_pending drops to 0
+  std::list<Context *> m_pending_waiting;
+};
+
+#endif
diff --git a/src/common/RefCountedObj.cc b/src/common/RefCountedObj.cc
new file mode 100644
index 000000000..e4c3fef58
--- /dev/null
+++ b/src/common/RefCountedObj.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+//
+#include "include/ceph_assert.h"
+
+#include "common/RefCountedObj.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/valgrind.h"
+
+namespace TOPNSPC::common {
+// Asserts that the object is only destroyed once its reference count is 0.
+RefCountedObject::~RefCountedObject()
+{
+  ceph_assert(nref == 0);
+}
+
+// Drops one reference and deletes the object when the count reaches zero.
+// If a CephContext is attached, the transition is logged on the "refs"
+// subsystem at level 1.
+void RefCountedObject::put() const {
+  // cct is copied before the decrement and only the copy is used afterwards.
+  // NOTE(review): presumably because another thread may free *this as soon
+  // as our reference has been dropped -- confirm.
+  CephContext *local_cct = cct;
+  auto v = --nref;
+  if (local_cct) {
+    lsubdout(local_cct, refs, 1) << "RefCountedObject::put " << this << " "
+		   << (v + 1) << " -> " << v
+		   << dendl;
+  }
+  if (v == 0) {
+    // last reference gone: emit the annotations from common/valgrind.h and
+    // self-destruct
+    ANNOTATE_HAPPENS_AFTER(&nref);
+    ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(&nref);
+    delete this;
+  } else {
+    ANNOTATE_HAPPENS_BEFORE(&nref);
+  }
+}
+
+// Takes one additional reference. The caller must already hold a reference,
+// which the assertion below enforces. Logs the transition when a CephContext
+// is attached.
+void RefCountedObject::_get() const {
+  auto v = ++nref;
+  ceph_assert(v > 1); /* it should never happen that _get() sees nref == 0 */
+  if (cct) {
+    lsubdout(cct, refs, 1) << "RefCountedObject::get " << this << " "
+	     << (v - 1) << " -> " << v << dendl;
+  }
+}
+
+}
diff --git a/src/common/RefCountedObj.h b/src/common/RefCountedObj.h
new file mode 100644
index 000000000..2de1cf14e
--- /dev/null
+++ b/src/common/RefCountedObj.h
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_REFCOUNTEDOBJ_H
+#define CEPH_REFCOUNTEDOBJ_H
+ 
+#include "common/ceph_mutex.h"
+#include "common/ref.h"
+#include "include/common_fwd.h"
+
+#include <atomic>
+
+/* This class provides mechanisms to make a sub-class work with
+ * boost::intrusive_ptr (aka ceph::ref_t).
+ *
+ * Generally, you'll want to inherit from RefCountedObjectSafe and not from
+ * RefCountedObject directly. This is because the ::get and ::put methods are
+ * public and can be used to create/delete references outside of the
+ * ceph::ref_t pointers with the potential to leak memory.
+ *
+ * It is also suggested that you make constructors and destructors private in
+ * your final class. This prevents instantiation of the object with assignment
+ * to a raw pointer. Consequently, you'll want to use ceph::make_ref<> to
+ * create a ceph::ref_t<> holding your object:
+ *
+ *    auto ptr = ceph::make_ref<Foo>(...);
+ *
+ * Use FRIEND_MAKE_REF(ClassName) to allow ceph::make_ref to call the private
+ * constructors.
+ *
+ */
+namespace TOPNSPC::common {
+/// Intrusively reference-counted base class. A new object starts with a
+/// reference count of 1; put() deletes the object when the count reaches 0.
+class RefCountedObject {
+public:
+  /// Attach the CephContext used for reference-count debug logging.
+  void set_cct(CephContext *c) {
+    cct = c;
+  }
+
+  /// Current reference count.
+  uint64_t get_nref() const {
+    return nref;
+  }
+
+  /// Take an additional reference and return this object.
+  const RefCountedObject *get() const {
+    _get();
+    return this;
+  }
+  RefCountedObject *get() {
+    _get();
+    return this;
+  }
+  /// Drop a reference; the object is deleted when the count reaches 0.
+  void put() const;
+
+protected:
+  RefCountedObject() = default;
+  // Copying carries over only cct; the copy gets its own count starting at 1.
+  RefCountedObject(const RefCountedObject& o) : cct(o.cct) {}
+  RefCountedObject& operator=(const RefCountedObject& o) = delete;
+  RefCountedObject(RefCountedObject&&) = delete;
+  RefCountedObject& operator=(RefCountedObject&&) = delete;
+  RefCountedObject(CephContext* c) : cct(c) {}
+
+  virtual ~RefCountedObject();
+
+private:
+  /// Increment the count; asserts that a reference was already held.
+  void _get() const;
+
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+  // crimson is single threaded at the moment
+  mutable uint64_t nref{1};
+#else
+  mutable std::atomic<uint64_t> nref{1};
+#endif
+  /// Optional context used for logging; may be null.
+  CephContext *cct{nullptr};
+};
+
+/// Variant of RefCountedObject with get() and put() deleted, so they cannot
+/// be called directly on this type.
+class RefCountedObjectSafe : public RefCountedObject {
+public:
+  RefCountedObject *get() = delete;
+  const RefCountedObject *get() const = delete;
+  void put() const = delete;
+protected:
+// Forwards all constructor arguments to RefCountedObject.
+template<typename... Args>
+  RefCountedObjectSafe(Args&&... args) : RefCountedObject(std::forward<Args>(args)...) {}
+  virtual ~RefCountedObjectSafe() override {}
+};
+
+#if !defined(WITH_SEASTAR)|| defined(WITH_ALIEN)
+
+/**
+ * RefCountedCond
+ *
+ *  a refcounted condition, will be removed when all references are dropped
+ */
+struct RefCountedCond : public RefCountedObject {
+  RefCountedCond() = default;
+  ~RefCountedCond() = default;
+
+  /// Block until done() has been called, then return the value passed to it.
+  int wait() {
+    std::unique_lock l(lock);
+    // loop so that a wakeup without 'complete' being set keeps waiting
+    while (!complete) {
+      cond.wait(l);
+    }
+    return rval;
+  }
+
+  /// Record result r, mark the condition complete and wake all waiters.
+  void done(int r) {
+    std::lock_guard l(lock);
+    rval = r;
+    complete = true;
+    cond.notify_all();
+  }
+
+  /// Same as done(0).
+  void done() {
+    done(0);
+  }
+
+private:
+  bool complete = false;  ///< set once done() has been called
+  ceph::mutex lock = ceph::make_mutex("RefCountedCond::lock");
+  ceph::condition_variable cond;
+  int rval = 0;           ///< result handed to done()
+};
+
+/**
+ * RefCountedWaitObject
+ *
+ * refcounted object that allows waiting for the object's last reference.
+ * Any referrer can either put or put_wait(). A simple put() will return
+ * immediately, a put_wait() will return only when the object is destroyed.
+ * e.g., useful when we want to wait for a specific event completion. We
+ * use RefCountedCond, as the condition can be referenced after the object
+ * destruction. 
+ *    
+ */
+struct RefCountedWaitObject {
+  std::atomic<uint64_t> nref = { 1 };  ///< reference count, starts at 1
+  RefCountedCond *c;                   ///< signalled when the last reference is dropped
+
+  RefCountedWaitObject() {
+    c = new RefCountedCond;
+  }
+  virtual ~RefCountedWaitObject() {
+    // drop the reference on the condition that was created in the constructor
+    c->put();
+  }
+
+  /// Take an additional reference and return this object.
+  RefCountedWaitObject *get() {
+    nref++;
+    return this;
+  }
+
+  /// Drop a reference without waiting.
+  /// @returns true if this call dropped the last reference and deleted the object
+  bool put() {
+    bool ret = false;
+    // Hold our own reference on the condition in a local pointer: it is
+    // still used after 'delete this', when the member c is no longer valid.
+    RefCountedCond *cond = c;
+    cond->get();
+    if (--nref == 0) {
+      cond->done();
+      delete this;
+      ret = true;
+    }
+    cond->put();
+    return ret;
+  }
+
+  /// Drop a reference and, unless it was the last one, block until the
+  /// last reference has been dropped.
+  void put_wait() {
+    // local pointer plus extra reference for the same reason as in put()
+    RefCountedCond *cond = c;
+
+    cond->get();
+    if (--nref == 0) {
+      cond->done();
+      delete this;
+    } else {
+      cond->wait();
+    }
+    cond->put();
+  }
+};
+
+#endif // !defined(WITH_SEASTAR)|| defined(WITH_ALIEN)
+
+// Hook for boost::intrusive_ptr (see the header comment above): take a reference.
+static inline void intrusive_ptr_add_ref(const RefCountedObject *p) {
+  p->get();
+}
+// Hook for boost::intrusive_ptr: drop a reference.
+static inline void intrusive_ptr_release(const RefCountedObject *p) {
+  p->put();
+}
+// Deleter functor that releases an object by calling put() on it.
+// NOTE(review): going by the name, intended for std::unique_ptr -- confirm.
+struct UniquePtrDeleter
+{
+  void operator()(RefCountedObject *p) const
+  {
+    // Don't expect a call to `get()` in the ctor as we manually set nref to 1
+    p->put();
+  }
+};
+}
+using RefCountedPtr = ceph::ref_t<TOPNSPC::common::RefCountedObject>;
+
+#endif
diff --git a/src/common/Semaphore.h b/src/common/Semaphore.h
new file mode 100644
index 000000000..88aa9c840
--- /dev/null
+++ b/src/common/Semaphore.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_Sem_Posix__H
+#define CEPH_Sem_Posix__H
+
+#include "common/ceph_mutex.h"
+
+/// Counting semaphore built from a mutex and a condition variable.
+class Semaphore
+{
+  ceph::mutex m = ceph::make_mutex("Semaphore::m");
+  ceph::condition_variable c;
+  int count = 0;
+
+public:
+  /// Add one unit and wake every thread blocked in Get().
+  void Put()
+  {
+    std::lock_guard locker(m);
+    ++count;
+    c.notify_all();
+  }
+
+  /// Block until at least one unit is available, then consume it.
+  void Get()
+  {
+    std::unique_lock locker(m);
+    c.wait(locker, [this] { return count > 0; });
+    --count;
+  }
+};
+
+#endif // CEPH_Sem_Posix__H
diff --git a/src/common/SloppyCRCMap.cc b/src/common/SloppyCRCMap.cc
new file mode 100644
index 000000000..ec9cbdf53
--- /dev/null
+++ b/src/common/SloppyCRCMap.cc
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "common/SloppyCRCMap.h"
+#include "common/Formatter.h"
+
+using namespace std;
+using ceph::bufferlist;
+
+// Update the map for a write of len bytes (taken from bl) at offset.
+// Blocks only partially covered at the head or tail are invalidated; every
+// fully covered block gets the CRC of its new content. If out is non-null,
+// each action is logged to it.
+void SloppyCRCMap::write(uint64_t offset, uint64_t len, const bufferlist& bl,
+			 std::ostream *out)
+{
+  int64_t remaining = len;
+  uint64_t cursor = offset;
+  const unsigned head = offset % block_size;
+  if (head) {
+    // unaligned start: drop the CRC of the block the write begins in
+    const uint64_t block_start = offset - head;
+    const unsigned skip = block_size - head;
+    crc_map.erase(block_start);
+    if (out)
+      *out << "write invalidate " << block_start << "\n";
+    cursor += skip;
+    remaining -= skip;
+  }
+  for (; remaining >= block_size; cursor += block_size, remaining -= block_size) {
+    bufferlist block;
+    block.substr_of(bl, cursor - offset, block_size);
+    const uint32_t crc = block.crc32c(crc_iv);
+    crc_map[cursor] = crc;
+    if (out)
+      *out << "write set " << cursor << " " << crc << "\n";
+  }
+  if (remaining > 0) {
+    // partial trailing block
+    crc_map.erase(cursor);
+    if (out)
+      *out << "write invalidate " << cursor << "\n";
+  }
+}
+
+// Validate the data in bl, read from [offset, offset+len), against the
+// stored CRCs. Only blocks fully covered by the read are checked, and only
+// when a CRC is stored for them. Returns the number of mismatching blocks;
+// each mismatch is described on err if it is non-null.
+int SloppyCRCMap::read(uint64_t offset, uint64_t len, const bufferlist& bl,
+		       std::ostream *err)
+{
+  int errors = 0;
+  int64_t left = len;
+  uint64_t pos = offset;
+  unsigned o = offset % block_size;
+  if (o) {
+    // skip the partial leading block; it cannot be verified
+    pos += (block_size - o);
+    left -= (block_size - o);
+  }
+  while (left >= block_size) {
+    // FIXME: this could be more efficient if we avoid doing a find()
+    // on each iteration
+    std::map<uint64_t,uint32_t>::iterator p = crc_map.find(pos);
+    if (p != crc_map.end()) {
+      bufferlist t;
+      t.substr_of(bl, pos - offset, block_size);
+      uint32_t crc = t.crc32c(crc_iv);
+      if (p->second != crc) {
+	errors++;
+	if (err)
+	  *err << "offset " << pos << " len " << block_size
+	       << " has crc " << crc << " expected " << p->second << "\n";
+      }
+    }
+    pos += block_size;
+    left -= block_size;
+  }
+  return errors;  
+}
+
+// Drop every stored CRC from the block containing offset onwards.
+void SloppyCRCMap::truncate(uint64_t offset)
+{
+  const uint64_t first_block = offset - (offset % block_size);
+  crc_map.erase(crc_map.lower_bound(first_block), crc_map.end());
+}
+
+// Update the map for zeroing len bytes at offset: partially covered blocks
+// are invalidated, fully covered blocks are set to the all-zero block CRC.
+void SloppyCRCMap::zero(uint64_t offset, uint64_t len)
+{
+  int64_t remaining = len;
+  uint64_t cursor = offset;
+  const unsigned head = offset % block_size;
+  if (head) {
+    // unaligned start: the first block is only partially zeroed
+    const unsigned skip = block_size - head;
+    crc_map.erase(offset - head);
+    cursor += skip;
+    remaining -= skip;
+  }
+  for (; remaining >= block_size; cursor += block_size, remaining -= block_size) {
+    crc_map[cursor] = zero_crc;
+  }
+  if (remaining > 0) {
+    // partial trailing block
+    crc_map.erase(cursor);
+  }
+}
+
+// Update the map for len bytes cloned from src at srcoff to offset in this
+// object. A fully covered block inherits the source CRC when both maps use
+// the same block size and the source has an entry at the matching position;
+// every other touched block is invalidated. Actions are logged to out if it
+// is non-null.
+void SloppyCRCMap::clone_range(uint64_t offset, uint64_t len,
+			       uint64_t srcoff, const SloppyCRCMap& src,
+			       std::ostream *out)
+{
+  int64_t remaining = len;
+  uint64_t dst = offset;
+  uint64_t from = srcoff;
+  const unsigned head = offset % block_size;
+  if (head) {
+    // unaligned start: invalidate the partially overwritten block
+    const uint64_t block_start = offset - head;
+    const unsigned skip = block_size - head;
+    crc_map.erase(block_start);
+    if (out)
+      *out << "clone_range invalidate " << block_start << "\n";
+    dst += skip;
+    from += skip;
+    remaining -= skip;
+  }
+  const bool same_block_size = (block_size == src.block_size);
+  for (; remaining >= block_size;
+       dst += block_size, from += block_size, remaining -= block_size) {
+    // FIXME: this could be more efficient.
+    map<uint64_t,uint32_t>::const_iterator p =
+      same_block_size ? src.crc_map.find(from) : src.crc_map.end();
+    if (p == src.crc_map.end()) {
+      crc_map.erase(dst);
+      if (out)
+	*out << "clone_range invalidate " << dst << "\n";
+      continue;
+    }
+    crc_map[dst] = p->second;
+    if (out)
+      *out << "clone_range copy " << dst << " " << p->second << "\n";
+  }
+  if (remaining > 0) {
+    // partial trailing block
+    crc_map.erase(dst);
+    if (out)
+      *out << "clone_range invalidate " << dst << "\n";
+  }
+}
+
+// Serialize block_size followed by crc_map into bl, wrapped in the
+// ENCODE_START(1, 1, ...) / ENCODE_FINISH macros.
+void SloppyCRCMap::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(block_size, bl);
+  encode(crc_map, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Counterpart of encode(): reads the block size and the CRC map from bl.
+void SloppyCRCMap::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  uint32_t bs;
+  decode(bs, bl);
+  // go through set_block_size() so that zero_crc is recomputed as well
+  set_block_size(bs);
+  decode(crc_map, bl);
+  DECODE_FINISH(bl);
+}
+
+// Write the block size and every (offset, crc) entry to the formatter.
+void SloppyCRCMap::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("block_size", block_size);
+  f->open_array_section("crc_map");
+  for (const auto& [off, crc] : crc_map) {
+    f->open_object_section("crc");
+    f->dump_unsigned("offset", off);
+    f->dump_unsigned("crc", crc);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Appends two heap-allocated sample instances to ls: a default-constructed
+// map, and a map with block size 2 that has seen two writes and a zero.
+void SloppyCRCMap::generate_test_instances(list<SloppyCRCMap*>& ls)
+{
+  ls.push_back(new SloppyCRCMap);
+  ls.push_back(new SloppyCRCMap(2));
+  bufferlist bl;
+  bl.append("some data");
+  ls.back()->write(1, bl.length(), bl);
+  ls.back()->write(10, bl.length(), bl);
+  ls.back()->zero(4, 2);
+}
diff --git a/src/common/SloppyCRCMap.h b/src/common/SloppyCRCMap.h
new file mode 100644
index 000000000..6bbfe978a
--- /dev/null
+++ b/src/common/SloppyCRCMap.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_SLOPPYCRCMAP_H
+#define CEPH_COMMON_SLOPPYCRCMAP_H
+
+#include "include/encoding.h"
+
+namespace ceph {
+class Formatter;
+}
+
+/**
+ * SloppyCRCMap
+ *
+ * Opportunistically track CRCs on any reads or writes that cover full
+ * blocks.  Verify read results when we have CRC data available for
+ * the given extent.
+ */
+class SloppyCRCMap {
+  /// initial value passed to crc32c() for every block
+  static const int crc_iv = 0xffffffff;
+
+  std::map<uint64_t, uint32_t> crc_map;  // offset -> crc(-1)
+  uint32_t block_size;
+  /// crc of one block of zeros (crc_iv when block_size is 0); see set_block_size()
+  uint32_t zero_crc;
+
+public:
+  SloppyCRCMap(uint32_t b=0) {
+    set_block_size(b);
+  }
+
+  /// set the block size and recompute the crc of an all-zero block
+  void set_block_size(uint32_t b) {
+    block_size = b;
+    //zero_crc = ceph_crc32c(0xffffffff, NULL, block_size);
+    if (b) {
+      ceph::buffer::list bl;
+      bl.append_zero(block_size);
+      zero_crc = bl.crc32c(crc_iv);
+    } else {
+      zero_crc = crc_iv;
+    }
+  }
+
+  /// update based on a write
+  void write(uint64_t offset, uint64_t len, const ceph::buffer::list& bl,
+	     std::ostream *out = NULL);
+
+  /// update based on a truncate
+  void truncate(uint64_t offset);
+
+  /// update based on a zero/punch_hole
+  void zero(uint64_t offset, uint64_t len);
+
+  /// update based on a clone_range from another object's map
+  void clone_range(uint64_t offset, uint64_t len, uint64_t srcoff, const SloppyCRCMap& src,
+		   std::ostream *out = NULL);
+
+  /**
+   * validate a read result
+   *
+   * @param offset offset
+   * @param len length
+   * @param bl data read
+   * @param err optional ostream to describe errors in detail
+   * @returns error count, 0 for success
+   */
+  int read(uint64_t offset, uint64_t len, const ceph::buffer::list& bl, std::ostream *err);
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<SloppyCRCMap*>& ls);
+};
+WRITE_CLASS_ENCODER(SloppyCRCMap)
+
+#endif
diff --git a/src/common/StackStringStream.h b/src/common/StackStringStream.h
new file mode 100644
index 000000000..3324e7add
--- /dev/null
+++ b/src/common/StackStringStream.h
@@ -0,0 +1,192 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef COMMON_STACKSTRINGSTREAM_H
+#define COMMON_STACKSTRINGSTREAM_H
+
+#include <boost/container/small_vector.hpp>
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <sstream>
+#include <string_view>
+#include <vector>
+
+#include "include/inline_memory.h"
+
+/**
+ * Output-only stream buffer backed by a boost small_vector with SIZE bytes
+ * of inline storage. The put area always spans the whole vector, so the
+ * text written so far is [pbase(), pptr()).
+ */
+template<std::size_t SIZE>
+class StackStringBuf : public std::basic_streambuf<char>
+{
+public:
+  StackStringBuf()
+    : vec{SIZE, boost::container::default_init_t{}}
+  {
+    setp(vec.data(), vec.data() + vec.size());
+  }
+  StackStringBuf(const StackStringBuf&) = delete;
+  StackStringBuf& operator=(const StackStringBuf&) = delete;
+  StackStringBuf(StackStringBuf&& o) = delete;
+  StackStringBuf& operator=(StackStringBuf&& o) = delete;
+  ~StackStringBuf() override = default;
+
+  /// Discard the buffered text and shrink the vector back to SIZE elements.
+  void clear()
+  {
+    vec.resize(SIZE);
+    setp(vec.data(), vec.data() + SIZE);
+  }
+
+  /// View of the text written so far; invalidated by any further write.
+  std::string_view strv() const
+  {
+    return std::string_view(pbase(), pptr() - pbase());
+  }
+
+protected:
+  /// Append n characters, growing the vector when the put area is too small.
+  std::streamsize xsputn(const char *s, std::streamsize n) final
+  {
+    std::streamsize capacity = epptr() - pptr();
+    std::streamsize left = n;
+    if (capacity >= left) {
+      maybe_inline_memcpy(pptr(), s, left, 32);
+      pbump(left);
+    } else {
+      // fill what is left of the put area, then append the rest to the vector
+      maybe_inline_memcpy(pptr(), s, capacity, 64);
+      s += capacity;
+      left -= capacity;
+      vec.insert(vec.end(), s, s + left);
+      // the vector is completely filled with text now
+      setp(vec.data(), vec.data() + vec.size());
+      pbump(vec.size());
+    }
+    return n;
+  }
+
+  /// Append a single character once the put area is full.
+  int overflow(int c) final
+  {
+    // Compare against eof() explicitly: not_eof(c) returns c itself, so using
+    // it as a boolean would reject a '\0' character.
+    if (traits_type::eq_int_type(c, traits_type::eof())) {
+      return traits_type::eof();
+    }
+    vec.push_back(traits_type::to_char_type(c));
+    // push_back() grew the vector and may have moved its storage. Re-sync the
+    // put area so that strv() includes the new character and the put
+    // pointers do not refer to the old storage.
+    setp(vec.data(), vec.data() + vec.size());
+    pbump(vec.size());
+    return c;
+  }
+
+private:
+
+  boost::container::small_vector<char, SIZE> vec;
+};
+
+/// std::ostream that writes into an embedded StackStringBuf<SIZE>.
+template<std::size_t SIZE>
+class StackStringStream : public std::basic_ostream<char>
+{
+public:
+  // remember the format flags in effect at construction so reset() can restore them
+  StackStringStream() : basic_ostream<char>(&ssb), default_fmtflags(flags()) {}
+  StackStringStream(const StackStringStream& o) = delete;
+  StackStringStream& operator=(const StackStringStream& o) = delete;
+  StackStringStream(StackStringStream&& o) = delete;
+  StackStringStream& operator=(StackStringStream&& o) = delete;
+  ~StackStringStream() override = default;
+
+  /// Return the stream to its freshly constructed state: no error flags,
+  /// original format flags, empty buffer.
+  void reset() {
+    clear(); /* reset state flags */
+    flags(default_fmtflags); /* reset fmtflags to constructor defaults */
+    ssb.clear();
+  }
+
+  /// Non-owning view of the text written so far.
+  std::string_view strv() const {
+    return ssb.strv();
+  }
+  /// Copy of the text written so far.
+  std::string str() const {
+    return std::string(ssb.strv());
+  }
+
+private:
+  StackStringBuf<SIZE> ssb;
+  fmtflags const default_fmtflags;
+};
+
+/* In an ideal world, we could use StackStringStream indiscriminately, but alas
+ * it's very expensive to construct/destruct. So, we cache them in a
+ * thread_local vector. DO NOT share these with other threads. The copy/move
+ * constructors are deliberately restrictive to make this more difficult to
+ * accidentally do.
+ */
+class CachedStackStringStream {
+public:
+  using sss = StackStringStream<4096>;
+  using osptr = std::unique_ptr<sss>;
+
+  /// Take a stream from the thread-local cache (resetting it first), or
+  /// allocate a new one when the cache is empty or already destructed.
+  CachedStackStringStream() {
+    if (cache.destructed || cache.c.empty()) {
+      osp = std::make_unique<sss>();
+    } else {
+      osp = std::move(cache.c.back());
+      cache.c.pop_back();
+      osp->reset();
+    }
+  }
+  CachedStackStringStream(const CachedStackStringStream&) = delete;
+  CachedStackStringStream& operator=(const CachedStackStringStream&) = delete;
+  CachedStackStringStream(CachedStackStringStream&&) = delete;
+  CachedStackStringStream& operator=(CachedStackStringStream&&) = delete;
+  /// Hand the stream back to the cache if the cache is still alive and holds
+  /// fewer than max_elems streams; otherwise osp frees it.
+  ~CachedStackStringStream() {
+    if (!cache.destructed && cache.c.size() < max_elems) {
+      cache.c.emplace_back(std::move(osp));
+    }
+  }
+
+  // pointer-like access to the underlying stream
+  sss& operator*() {
+    return *osp;
+  }
+  sss const& operator*() const {
+    return *osp;
+  }
+  sss* operator->() {
+    return osp.get();
+  }
+  sss const* operator->() const {
+    return osp.get();
+  }
+
+  sss const* get() const {
+    return osp.get();
+  }
+  sss* get() {
+    return osp.get();
+  }
+
+private:
+  /// Upper bound on the number of streams kept in each thread's cache.
+  static constexpr std::size_t max_elems = 8;
+
+  /* The thread_local cache may be destructed before other static structures.
+   * If those destructors try to create a CachedStackStringStream (e.g. for
+   * logging) and access this cache, that access will be undefined. So note if
+   * the cache has been destructed and check before use.
+   */
+  struct Cache {
+    using container = std::vector<osptr>;
+
+    Cache() {}
+    ~Cache() { destructed = true; }
+
+    container c;
+    bool destructed = false;
+  };
+
+  inline static thread_local Cache cache;
+  /// The stream currently owned by this object.
+  osptr osp;
+};
+
+#endif
diff --git a/src/common/SubProcess.cc b/src/common/SubProcess.cc
new file mode 100644
index 000000000..1faf33e36
--- /dev/null
+++ b/src/common/SubProcess.cc
@@ -0,0 +1,394 @@
+#include "SubProcess.h"
+
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#include <sys/types.h>
+#include <signal.h>
+#endif
+#include <stdarg.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <iostream>
+
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+// Records the command and what to do with each standard descriptor of the
+// child. Nothing is spawned yet: all pipe fds and the pid start out as -1.
+SubProcess::SubProcess(const char *cmd_, std_fd_op stdin_op_, std_fd_op stdout_op_, std_fd_op stderr_op_) :
+  cmd(cmd_),
+  cmd_args(),
+  stdin_op(stdin_op_),
+  stdout_op(stdout_op_),
+  stderr_op(stderr_op_),
+  stdin_pipe_out_fd(-1),
+  stdout_pipe_in_fd(-1),
+  stderr_pipe_in_fd(-1),
+  pid(-1),
+  errstr() {
+}
+
+// Asserts that no child is still tracked (join() resets pid) and that all
+// pipe descriptors have been closed.
+SubProcess::~SubProcess() {
+  ceph_assert(!is_spawned());
+  ceph_assert(stdin_pipe_out_fd == -1);
+  ceph_assert(stdout_pipe_in_fd == -1);
+  ceph_assert(stderr_pipe_in_fd == -1);
+}
+
+// Appends a NULL-terminated list of C strings to the command's arguments.
+// The list may be empty (arg == NULL), in which case nothing is added.
+// Must be called before spawn().
+void SubProcess::add_cmd_args(const char *arg, ...) {
+  ceph_assert(!is_spawned());
+
+  va_list ap;
+  va_start(ap, arg);
+  // Test the pointer before using it so that a NULL first argument is never
+  // handed to add_cmd_arg().
+  for (const char *p = arg; p != NULL; p = va_arg(ap, const char*)) {
+    add_cmd_arg(p);
+  }
+  va_end(ap);
+}
+
+// Appends a single argument to the command line. Must be called before spawn().
+void SubProcess::add_cmd_arg(const char *arg) {
+  ceph_assert(!is_spawned());
+
+  cmd_args.push_back(arg);
+}
+
+// Returns the parent's end of the child's stdin pipe.
+// Only valid after spawn() and when stdin was set up as PIPE.
+int SubProcess::get_stdin() const {
+  ceph_assert(is_spawned());
+  ceph_assert(stdin_op == PIPE);
+
+  return stdin_pipe_out_fd;
+}
+
+// Returns the parent's end of the child's stdout pipe.
+// Only valid after spawn() and when stdout was set up as PIPE.
+int SubProcess::get_stdout() const {
+  ceph_assert(is_spawned());
+  ceph_assert(stdout_op == PIPE);
+
+  return stdout_pipe_in_fd;
+}
+
+// Returns the parent's end of the child's stderr pipe.
+// Only valid after spawn() and when stderr was set up as PIPE.
+int SubProcess::get_stderr() const {
+  ceph_assert(is_spawned());
+  ceph_assert(stderr_op == PIPE);
+
+  return stderr_pipe_in_fd;
+}
+
+// Closes fd unless it is already -1, and marks it as closed by setting it
+// to -1.
+void SubProcess::close(int &fd) {
+  if (fd != -1) {
+    ::close(fd);
+    fd = -1;
+  }
+}
+
+// Closes the parent's end of the child's stdin pipe.
+void SubProcess::close_stdin() {
+  ceph_assert(is_spawned());
+  ceph_assert(stdin_op == PIPE);
+
+  close(stdin_pipe_out_fd);
+}
+
+// Closes the parent's end of the child's stdout pipe.
+void SubProcess::close_stdout() {
+  ceph_assert(is_spawned());
+  ceph_assert(stdout_op == PIPE);
+
+  close(stdout_pipe_in_fd);
+}
+
+// Closes the parent's end of the child's stderr pipe.
+void SubProcess::close_stderr() {
+  ceph_assert(is_spawned());
+  ceph_assert(stderr_op == PIPE);
+
+  close(stderr_pipe_in_fd);
+}
+
+// Sends signo to the spawned child; a failing ::kill() trips an assertion.
+void SubProcess::kill(int signo) const {
+  ceph_assert(is_spawned());
+
+  int ret = ::kill(pid, signo);
+  ceph_assert(ret == 0);
+}
+
+// Returns the error text accumulated in errstr by spawn() and join().
+const std::string SubProcess::err() const {
+  return errstr.str();
+}
+
+// Unbuffered std::streambuf that forwards everything written to it straight
+// to a file descriptor using write(2). The descriptor is not owned and is
+// never closed here.
+class fd_buf : public std::streambuf {
+  int fd;
+public:
+  fd_buf (int fd) : fd(fd)
+  {}
+protected:
+  // Single-character path; reached for every sputc() because no put area is
+  // ever set up. Returns EOF on failure (or when c is EOF), otherwise c.
+  int_type overflow (int_type c) override {
+    if (c == EOF) return EOF;
+    char buf = c;
+    if (xsputn(&buf, 1) != 1) {
+      return EOF;
+    }
+    return c;
+  }
+  // Writes count bytes from s, retrying after EINTR and short writes.
+  // Returns the number of bytes actually written, which is less than count
+  // only if write() failed; it is never negative.
+  std::streamsize xsputn (const char* s, std::streamsize count) override {
+    std::streamsize done = 0;
+    while (done < count) {
+      ssize_t r = ::write(fd, s + done, count - done);
+      if (r < 0) {
+	if (errno == EINTR)
+	  continue;
+	break;
+      }
+      if (r == 0)
+	break;
+      done += r;
+    }
+    return done;
+  }
+};
+
+// Forks the child process and wires up its standard descriptors.
+// Returns 0 in the parent on success, or -errno if creating a pipe or
+// forking failed (the reason is appended to errstr). The child never
+// returns from this function: it ends in exec().
+int SubProcess::spawn() {
+  ceph_assert(!is_spawned());
+  ceph_assert(stdin_pipe_out_fd == -1);
+  ceph_assert(stdout_pipe_in_fd == -1);
+  ceph_assert(stderr_pipe_in_fd == -1);
+
+  // index of the read (IN) and write (OUT) end of a pipe pair
+  enum { IN = 0, OUT = 1 };
+
+  int ipipe[2], opipe[2], epipe[2];
+
+  // -1 marks "not opened", which SubProcess::close() silently skips
+  ipipe[0] = ipipe[1] = opipe[0] = opipe[1] = epipe[0] = epipe[1] = -1;
+
+  int ret = 0;
+
+  // create a pipe only for the descriptors configured as PIPE
+  if ((stdin_op == PIPE  && pipe_cloexec(ipipe, 0) == -1) ||
+      (stdout_op == PIPE && pipe_cloexec(opipe, 0) == -1) ||
+      (stderr_op == PIPE && pipe_cloexec(epipe, 0) == -1)) {
+    ret = -errno;
+    errstr << "pipe failed: " << cpp_strerror(errno);
+    goto fail;
+  }
+
+  pid = fork();
+
+  if (pid > 0) { // Parent
+    // keep our ends of the pipes, close the child's ends
+    stdin_pipe_out_fd = ipipe[OUT]; close(ipipe[IN ]);
+    stdout_pipe_in_fd = opipe[IN ]; close(opipe[OUT]);
+    stderr_pipe_in_fd = epipe[IN ]; close(epipe[OUT]);
+    return 0;
+  }
+
+  if (pid == 0) { // Child
+    // close the parent's ends of the pipes
+    close(ipipe[OUT]);
+    close(opipe[IN ]);
+    close(epipe[IN ]);
+
+    // move each pipe end onto the matching standard descriptor
+    if (ipipe[IN] >= 0) {
+      if (ipipe[IN] == STDIN_FILENO) {
+        ::fcntl(STDIN_FILENO, F_SETFD, 0); /* clear FD_CLOEXEC */
+      } else {
+        ::dup2(ipipe[IN], STDIN_FILENO);
+        ::close(ipipe[IN]);
+      }
+    }
+    if (opipe[OUT] >= 0) {
+      if (opipe[OUT] == STDOUT_FILENO) {
+        ::fcntl(STDOUT_FILENO, F_SETFD, 0); /* clear FD_CLOEXEC */
+      } else {
+        ::dup2(opipe[OUT], STDOUT_FILENO);
+        ::close(opipe[OUT]);
+        // route std::cout through an unbuffered fd_buf on the new stdout
+        static fd_buf buf(STDOUT_FILENO);
+        std::cout.rdbuf(&buf);
+      }
+    }
+    if (epipe[OUT] >= 0) {
+      if (epipe[OUT] == STDERR_FILENO) {
+        ::fcntl(STDERR_FILENO, F_SETFD, 0); /* clear FD_CLOEXEC */
+      } else {
+        ::dup2(epipe[OUT], STDERR_FILENO);
+        ::close(epipe[OUT]);
+        // route std::cerr through an unbuffered fd_buf on the new stderr
+        static fd_buf buf(STDERR_FILENO);
+        std::cerr.rdbuf(&buf);
+      }
+    }
+
+    // close every other descriptor; a standard descriptor is kept unless
+    // its op is CLOSE
+    int maxfd = sysconf(_SC_OPEN_MAX);
+    if (maxfd == -1)
+      maxfd = 16384;
+    for (int fd = 0; fd <= maxfd; fd++) {
+      if (fd == STDIN_FILENO && stdin_op != CLOSE)
+	continue;
+      if (fd == STDOUT_FILENO && stdout_op != CLOSE)
+	continue;
+      if (fd == STDERR_FILENO && stderr_op != CLOSE)
+	continue;
+      ::close(fd);
+    }
+
+    exec();
+    ceph_abort(); // Never reached
+  }
+
+  // fork() returned -1
+  ret = -errno;
+  errstr << "fork failed: " << cpp_strerror(errno);
+
+fail:
+  // release whatever pipe ends were opened before the failure
+  close(ipipe[0]);
+  close(ipipe[1]);
+  close(opipe[0]);
+  close(opipe[1]);
+  close(epipe[0]);
+  close(epipe[1]);
+
+  return ret;
+}
+
+// Child side only: replaces the process image with the configured command.
+// If execvp() fails, the error is reported on std::cerr and the child exits
+// with EXIT_FAILURE.
+void SubProcess::exec() {
+  ceph_assert(is_child());
+
+  // argv layout for execvp(): command name, the arguments, NULL terminator
+  std::vector<const char *> argv;
+  argv.push_back(cmd.c_str());
+  for (const std::string &a : cmd_args) {
+    argv.push_back(a.c_str());
+  }
+  argv.push_back(NULL);
+
+  int ret = execvp(cmd.c_str(), const_cast<char * const *>(argv.data()));
+  // execvp() only ever returns on failure
+  ceph_assert(ret == -1);
+
+  std::cerr << cmd << ": exec failed: " << cpp_strerror(errno) << "\n";
+  _exit(EXIT_FAILURE);
+}
+
+// Closes any pipe ends still open, waits for the child to terminate and
+// resets pid. Returns the child's exit status, 128 + signal number if it
+// was killed by a signal, or EXIT_FAILURE for any other wait status. A
+// description of a non-successful outcome is appended to errstr.
+int SubProcess::join() {
+  ceph_assert(is_spawned());
+
+  close(stdin_pipe_out_fd);
+  close(stdout_pipe_in_fd);
+  close(stderr_pipe_in_fd);
+
+  int status;
+
+  // retry when interrupted by a signal; any other waitpid error is fatal
+  while (waitpid(pid, &status, 0) == -1)
+    ceph_assert(errno == EINTR);
+
+  pid = -1;
+
+  if (WIFEXITED(status)) {
+    if (WEXITSTATUS(status) != EXIT_SUCCESS)
+      errstr << cmd << ": exit status: " << WEXITSTATUS(status);
+    return WEXITSTATUS(status);
+  }
+  if (WIFSIGNALED(status)) {
+    errstr << cmd << ": got signal: " << WTERMSIG(status);
+    return 128 + WTERMSIG(status);
+  }
+  errstr << cmd << ": waitpid: unknown status returned\n";
+  return EXIT_FAILURE;
+}
+
+// Same as SubProcess, plus a timeout in seconds (values <= 0 disable it,
+// see exec()) and the signal sent to the command's process group when the
+// timeout expires.
+SubProcessTimed::SubProcessTimed(const char *cmd, std_fd_op stdin_op,
+				 std_fd_op stdout_op, std_fd_op stderr_op,
+				 int timeout_, int sigkill_) :
+  SubProcess(cmd, stdin_op, stdout_op, stderr_op),
+  timeout(timeout_),
+  sigkill(sigkill_) {
+}
+
+// Flag set from the SIGALRM handler; only used after fork. It is a
+// volatile sig_atomic_t because that is the only kind of plain object a
+// signal handler may portably write to.
+static volatile sig_atomic_t timedout = 0;
+// SIGALRM handler: just records that the timeout fired.
+void timeout_sighandler(int sig) {
+  timedout = 1;
+}
+// No-op handler, installed so that the signal has a handler at all.
+static void dummy_sighandler(int sig) {}
+
+// Child side only. Without a timeout this is plain SubProcess::exec().
+// With a timeout the process forks once more: the grandchild executes the
+// command in its own process group, while this process supervises it. It
+// forwards SIGINT/SIGTERM, sends 'sigkill' to the command's process group
+// when the alarm fires, and finally exits with the command's exit status
+// (or 128 + signal number). This function never returns.
+void SubProcessTimed::exec() {
+  ceph_assert(is_child());
+
+  if (timeout <= 0) {
+    SubProcess::exec();
+    ceph_abort(); // Never reached
+  }
+
+  sigset_t mask, oldmask;
+  int pid;
+
+  // Restore default action for SIGTERM in case the parent process decided
+  // to ignore it.
+  if (signal(SIGTERM, SIG_DFL) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Because SIGCHLD is ignored by default, setup dummy handler for it,
+  // so we can mask it.
+  if (signal(SIGCHLD, dummy_sighandler) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Setup timeout handler.
+  if (signal(SIGALRM, timeout_sighandler) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Block interesting signals.
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGINT);
+  sigaddset(&mask, SIGTERM);
+  sigaddset(&mask, SIGCHLD);
+  sigaddset(&mask, SIGALRM);
+  if (sigprocmask(SIG_SETMASK, &mask, &oldmask) == -1) {
+    std::cerr << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  pid = fork();
+
+  if (pid == -1) {
+    std::cerr << cmd << ": fork failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  if (pid == 0) { // Child
+    // Restore old sigmask.
+    if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) {
+      std::cerr << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n";
+      goto fail_exit;
+    }
+    (void)setpgid(0, 0); // Become process group leader.
+    SubProcess::exec();
+    ceph_abort(); // Never reached
+  }
+
+  // Parent
+  (void)alarm(timeout);
+
+  // Consume the blocked signals synchronously, one at a time.
+  for (;;) {
+    int signo;
+    if (sigwait(&mask, &signo) == -1) {
+      std::cerr << cmd << ": sigwait failed: " << cpp_strerror(errno) << "\n";
+      goto fail_exit;
+    }
+    switch (signo) {
+    case SIGCHLD: {
+      int status = 0;
+      const pid_t reaped = waitpid(pid, &status, WNOHANG);
+      if (reaped == -1) {
+	std::cerr << cmd << ": waitpid failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      if (reaped == 0) {
+	// SIGCHLD is also raised when the child is stopped or continued. In
+	// that case waitpid() reports nothing and leaves status untouched, so
+	// keep waiting instead of interpreting it.
+	continue;
+      }
+      if (WIFEXITED(status))
+	_exit(WEXITSTATUS(status));
+      if (WIFSIGNALED(status))
+	_exit(128 + WTERMSIG(status));
+      std::cerr << cmd << ": unknown status returned\n";
+      goto fail_exit;
+    }
+    case SIGINT:
+    case SIGTERM:
+      // Pass SIGINT and SIGTERM, which are usually used to terminate
+      // a process, to the child.
+      if (::kill(pid, signo) == -1) {
+	std::cerr << cmd << ": kill failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      continue;
+    case SIGALRM:
+      // Timeout: signal the whole process group of the command, then keep
+      // looping until its termination is reported through SIGCHLD.
+      std::cerr << cmd << ": timed out (" << timeout << " sec)\n";
+      if (::killpg(pid, sigkill) == -1) {
+	std::cerr << cmd << ": kill failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      continue;
+    default:
+      std::cerr << cmd << ": sigwait: invalid signal: " << signo << "\n";
+      goto fail_exit;
+    }
+  }
+
+fail_exit:
+  _exit(EXIT_FAILURE);
+}
diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h
new file mode 100644
index 000000000..ea81000d6
--- /dev/null
+++ b/src/common/SubProcess.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2015 Mirantis Inc
+ *
+ * Author: Mykola Golub <mgolub@mirantis.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef SUB_PROCESS_H
+#define SUB_PROCESS_H
+
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#include <signal.h>
+#endif
+
+#ifndef _WIN32
+#include <sys/wait.h>
+#endif
+#include <sstream>
+#include <vector>
+
+#include "include/compat.h"
+
+/**
+ * SubProcess:
+ * A helper class to spawn a subprocess.
+ *
+ * Example:
+ *
+ *   SubProcess cat("cat", SubProcess::PIPE, SubProcess::PIPE);
+ *   if (cat.spawn() != 0) {
+ *     std::cerr << "cat failed: " << cat.err() << std::endl;
+ *     return false;
+ *   }
+ *   write_to_fd(cat.get_stdin(), "hello world!\n");
+ *   cat.close_stdin();
+ *   read_from_fd(cat.get_stdout(), buf);
+ *   if (cat.join() != 0) {
+ *     std::cerr << cat.err() << std::endl;
+ *     return false;
+ *   }
+ */
+
+// Helper that runs `cmd` (plus any added arguments) as a child process and
+// gives the caller access to the child's standard streams.  Only the
+// declarations are visible in this header; notes below are limited to what
+// the declarations and inline bodies show.
+class SubProcess {
+public:
+  // Per-stream disposition chosen at construction time.
+  // NOTE(review): presumably KEEP leaves the inherited descriptor alone,
+  // CLOSE closes it for the child and PIPE connects it to the parent --
+  // confirm against SubProcess.cc.
+  enum std_fd_op{
+    KEEP,
+    CLOSE,
+    PIPE
+  };
+public:
+  // All three standard streams default to CLOSE.
+  SubProcess(const char *cmd,
+             std_fd_op stdin_op = CLOSE,
+             std_fd_op stdout_op = CLOSE,
+             std_fd_op stderr_op = CLOSE);
+  virtual ~SubProcess();
+
+  // Append one or several arguments to the command line.
+  // NOTE(review): the variadic form presumably expects a NULL-terminated
+  // list of const char* -- confirm in SubProcess.cc.
+  void add_cmd_args(const char *arg, ...);
+  void add_cmd_arg(const char *arg);
+
+  virtual int spawn(); // Returns 0 on success or -errno on failure.
+  virtual int join();  // Returns exit code (0 on success).
+
+  // True once `pid` holds a positive value.
+  bool is_spawned() const { return pid > 0; }
+
+  // Accessors for the descriptors associated with the child's streams.
+  // NOTE(review): presumably these return the parent-side pipe ends and are
+  // only meaningful for streams constructed with PIPE -- confirm.
+  int get_stdin() const;
+  int get_stdout() const;
+  int get_stderr() const;
+
+  void close_stdin();
+  void close_stdout();
+  void close_stderr();
+
+  // Signal the child; SIGTERM unless another signal number is given.
+  void kill(int signo = SIGTERM) const;
+
+  // NOTE(review): presumably returns the text accumulated in `errstr`.
+  const std::string err() const;
+
+protected:
+  // True when `pid` is 0, the value fork() hands to the child.
+  bool is_child() const { return pid == 0; }
+  virtual void exec();
+
+  void close(int &fd);
+
+#ifdef _WIN32
+  void close_h(HANDLE &handle);
+#endif
+
+protected:
+  std::string cmd;
+  std::vector<std::string> cmd_args;
+  std_fd_op stdin_op;
+  std_fd_op stdout_op;
+  std_fd_op stderr_op;
+  int stdin_pipe_out_fd;
+  int stdout_pipe_in_fd;
+  int stderr_pipe_in_fd;
+  int pid;
+  std::ostringstream errstr;
+
+#ifdef _WIN32
+  HANDLE proc_handle = INVALID_HANDLE_VALUE;
+#endif
+};
+
+// SubProcess variant that carries a timeout and the signal to use once the
+// timeout expires.  On non-Windows builds only exec() is overridden; the
+// Windows build also replaces spawn()/join() and keeps a waiter thread.
+class SubProcessTimed : public SubProcess {
+public:
+  // `timeout` defaults to 0 and `sigkill` to SIGKILL.
+  // NOTE(review): the exec() path in SubProcess.cc passes `timeout` to
+  // alarm(), which suggests seconds with 0 meaning "no timeout" -- confirm.
+  SubProcessTimed(const char *cmd, std_fd_op stdin_op = CLOSE,
+		  std_fd_op stdout_op = CLOSE, std_fd_op stderr_op = CLOSE,
+		  int timeout = 0, int sigkill = SIGKILL);
+
+#ifdef _WIN32
+  int spawn() override;
+  int join() override;
+#endif
+
+protected:
+  void exec() override;
+
+private:
+  int timeout;
+  int sigkill;
+
+#ifdef _WIN32
+  std::thread waiter;
+#endif
+};
+
+// Signal handler declared for the timeout machinery; its definition is not
+// in this header.  NOTE(review): confirm it is still installed anywhere.
+void timeout_sighandler(int sig);
+
+#endif
diff --git a/src/common/TextTable.cc b/src/common/TextTable.cc
new file mode 100644
index 000000000..e17e7eb8d
--- /dev/null
+++ b/src/common/TextTable.cc
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "TextTable.h"
+
+using namespace std;
+
+// Append a column definition.  The column starts out as wide as its heading;
+// hd_align applies to the heading and col_align to the data cells.
+void TextTable::define_column(const string &heading,
+			      enum TextTable::Align hd_align,
+			      enum TextTable::Align col_align)
+{
+  col.push_back(TextTableColumn(heading, heading.length(),
+				hd_align, col_align));
+}
+
+// Drop all row data and rendering state but keep the column definitions.
+void TextTable::clear() {
+  row.clear();
+  currow = curcol = indent = 0;
+
+  // With no data left, each column is only as wide as its heading.
+  for (auto &c : col)
+    c.width = c.heading.size();
+}
+
+/**
+ * Pad s with space to appropriate alignment
+ *
+ * @param s string to pad
+ * @param width width of field to contain padded string
+ * @param align desired alignment (LEFT, CENTER, RIGHT)
+ *
+ * @return padded string; s is returned unpadded (never truncated) when it
+ *         is already at least width characters long
+ */
+static string
+pad(string s, int width, TextTable::Align align)
+{
+  // Do the arithmetic in int.  Mixing int with size_t made "width - length"
+  // wrap around for strings wider than the field, which then asked
+  // std::string for an enormous run of spaces.
+  const int len = static_cast<int>(s.length());
+  int lpad = 0;
+  int rpad = 0;
+  switch (align) {
+    case TextTable::LEFT:
+      rpad = width - len;
+      break;
+    case TextTable::CENTER:
+      lpad = width / 2 - len / 2;
+      rpad = width - lpad - len;
+      break;
+    case TextTable::RIGHT:
+      lpad = width - len;
+      break;
+  }
+
+  // A field narrower than its content simply gets no padding.
+  if (lpad < 0)
+    lpad = 0;
+  if (rpad < 0)
+    rpad = 0;
+
+  return string(lpad, ' ') + s + string(rpad, ' ');
+}
+
+// Render the table: one heading line followed by one line per row.  Cells
+// are padded to their column width, each cell is preceded by the table
+// indent, and cells after the first are preceded by the column separator.
+std::ostream &operator<<(std::ostream &out, const TextTable &t)
+{
+  // The indent is identical for every cell, so build it once.
+  const string indent(t.indent, ' ');
+
+  for (unsigned int i = 0; i < t.col.size(); i++) {
+    // Bind by reference; copying the column would duplicate its heading
+    // string on every iteration.
+    const TextTable::TextTableColumn &col = t.col[i];
+    if (i) {
+      out << t.column_separation;
+    }
+    out << indent << pad(col.heading, col.width, col.hd_align);
+  }
+  out << endl;
+
+  for (unsigned int i = 0; i < t.row.size(); i++) {
+    for (unsigned int j = 0; j < t.row[i].size(); j++) {
+      const TextTable::TextTableColumn &col = t.col[j];
+      if (j) {
+	out << t.column_separation;
+      }
+      out << indent << pad(t.row[i][j], col.width, col.col_align);
+    }
+    out << endl;
+  }
+  return out;
+}
diff --git a/src/common/TextTable.h b/src/common/TextTable.h
new file mode 100644
index 000000000..6702c0057
--- /dev/null
+++ b/src/common/TextTable.h
@@ -0,0 +1,175 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef TEXT_TABLE_H_
+#define TEXT_TABLE_H_
+
+#include <vector>
+#include <sstream>
+#include "include/ceph_assert.h"
+
+/**
+ * TextTable:
+ * Manage tabular output of data.  Caller defines heading of each column
+ * and alignment of heading and column data,
+ * then inserts rows of data including tuples of
+ * length (ncolumns) terminated by TextTable::endrow.  When all rows
+ * are inserted, caller asks for output with ostream <<
+ * which sizes/pads/dumps the table to ostream.
+ *
+ * Columns autosize to largest heading or datum.  One space is printed
+ * between columns.
+ */
+
+class TextTable {
+
+public:
+  enum Align {LEFT = 1, CENTER, RIGHT};
+
+private:
+  // Definition of one column: heading text, current rendered width and the
+  // alignment of the heading and of the data cells.
+  struct TextTableColumn {
+    std::string heading;
+    int width;
+    Align hd_align;
+    Align col_align;
+
+    // Give every member a defined value so a default-constructed column is
+    // never read uninitialized.
+    TextTableColumn() : width(0), hd_align(LEFT), col_align(LEFT) {}
+    TextTableColumn(const std::string &h, int w, Align ha, Align ca) :
+		    heading(h), width(w), hd_align(ha), col_align(ca) { }
+    ~TextTableColumn() {}
+  };
+
+  std::vector<TextTableColumn> col;	// column definitions
+  unsigned int curcol, currow;		// col, row being inserted into
+  unsigned int indent;			// indent width when rendering
+  std::string column_separation = {"  "};
+
+protected:
+  std::vector<std::vector<std::string> > row;	// row data array
+
+public:
+  TextTable(): curcol(0), currow(0), indent(0) {}
+  ~TextTable() {}
+
+  /**
+   * Define a column in the table.
+   *
+   * @param heading Column heading string (or "")
+   * @param hd_align Alignment for heading in column
+   * @param col_align Data alignment
+   *
+   * @note alignment is of type TextTable::Align; values are
+   * TextTable::LEFT, TextTable::CENTER, or TextTable::RIGHT
+   *
+   */
+  void define_column(const std::string& heading, Align hd_align,
+		     Align col_align);
+
+  /**
+   * Set indent for table.  Only affects table output.
+   *
+   * @param i Number of spaces to indent; negative values are treated as 0
+   *          (stored unchanged they would wrap to a huge unsigned indent)
+   */
+  void set_indent(int i) { indent = (i < 0) ? 0 : i; }
+
+  /**
+   * Set column separation
+   *
+   * @param s String to separate columns
+   */
+  void set_column_separation(const std::string& s) {
+    column_separation = s;
+  }
+
+  /**
+   * Add item to table, perhaps on new row.
+   * table << val1 << val2 << TextTable::endrow;
+   *
+   * @param: value to output.
+   *
+   * @note: Numerics are output in decimal; strings are not truncated.
+   * Output formatting choice is limited to alignment in define_column().
+   *
+   * @return TextTable& for chaining.
+   */
+
+  template<typename T> TextTable& operator<<(const T& item)
+  {
+    if (row.size() < currow + 1)
+      row.resize(currow + 1);
+
+    /**
+     * col.size() is a good guess for how big row[currow] needs to be,
+     * so just expand it out now
+     */
+    if (row[currow].size() < col.size()) {
+      row[currow].resize(col.size());
+    }
+
+    // inserting more items than defined columns is a coding error
+    ceph_assert(curcol + 1 <= col.size());
+
+    // render the item once; the text gives both the width and the cell
+    std::ostringstream oss;
+    oss << item;
+    std::string rendered = oss.str();
+    int width = rendered.length();
+
+    // expand column width if necessary
+    if (width > col[curcol].width) {
+      col[curcol].width = width;
+    }
+
+    // store the rendered item without copying it a second time
+    row[currow][curcol].swap(rendered);
+
+    curcol++;
+    return *this;
+  }
+
+  /**
+   * Degenerate type/variable here is just to allow selection of the
+   * following operator<< for "<< TextTable::endrow"
+   */
+
+  struct endrow_t {};
+  static constexpr endrow_t endrow{};
+
+  /**
+   * Implements TextTable::endrow
+   */
+
+  TextTable &operator<<(endrow_t)
+  {
+    curcol = 0;
+    currow++;
+    return *this;
+  }
+
+  /**
+   * Render table to ostream (i.e. cout << table)
+   */
+
+  friend std::ostream &operator<<(std::ostream &out, const TextTable &t);
+
+  /**
+   * clear: Reset everything in a TextTable except column defs
+   * resize cols to heading widths, clear indent
+   */
+
+  void clear();
+};
+
+#endif
+
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
new file mode 100644
index 000000000..aaee01aef
--- /dev/null
+++ b/src/common/Thread.cc
@@ -0,0 +1,230 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <signal.h>
+#include <unistd.h>
+#ifdef __linux__
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#endif
+
+#ifdef WITH_SEASTAR
+#include "crimson/os/alienstore/alien_store.h"
+#endif
+
+#include "common/Thread.h"
+#include "common/code_environment.h"
+#include "common/debug.h"
+#include "common/signal.h"
+
+#ifdef HAVE_SCHED
+#include <sched.h>
+#endif
+
+
+// Return the caller's thread id as reported by the gettid syscall on Linux;
+// every other platform gets -ENOSYS.
+pid_t ceph_gettid(void)
+{
+#ifndef __linux__
+  return -ENOSYS;
+#else
+  return syscall(SYS_gettid);
+#endif
+}
+
+// Restrict the caller to CPU `id` via sched_setaffinity().  Ids outside
+// [0, CPU_SETSIZE) are ignored, and the whole request is a no-op when
+// HAVE_SCHED is not defined; both cases report success.
+// Returns 0, or -errno if sched_setaffinity() fails.
+static int _set_affinity(int id)
+{
+#ifdef HAVE_SCHED
+  if (id < 0 || id >= CPU_SETSIZE)
+    return 0;
+
+  cpu_set_t mask;
+  CPU_ZERO(&mask);
+  CPU_SET(id, &mask);
+  if (sched_setaffinity(0, sizeof(mask), &mask) < 0)
+    return -errno;
+
+  /* guaranteed to take effect immediately */
+  sched_yield();
+#endif
+  return 0;
+}
+
+// A freshly constructed Thread is not running: no pthread id, no recorded
+// tid and no CPU affinity request (cpuid -1).
+Thread::Thread() : thread_id(0), pid(0), cpuid(-1) {}
+
+// Nothing to release here; the destructor neither joins nor detaches.
+Thread::~Thread() {}
+
+// pthread start routine: `arg` is the Thread object passed to
+// pthread_create(); forward to its entry_wrapper().
+void *Thread::_entry_func(void *arg) {
+  Thread *self = static_cast<Thread*>(arg);
+  return self->entry_wrapper();
+}
+
+// Runs on the new thread before the user's entry(): records the tid,
+// applies a previously requested CPU affinity and names the thread.
+void *Thread::entry_wrapper()
+{
+  // ceph_gettid() may return -ENOSYS on other platforms; keep pid unchanged
+  // in that case.
+  const int tid = ceph_gettid();
+  if (tid > 0) {
+    pid = tid;
+  }
+
+  const bool want_affinity = pid && cpuid >= 0;
+  if (want_affinity) {
+    _set_affinity(cpuid);
+  }
+
+  ceph_pthread_setname(pthread_self(), thread_name.c_str());
+  return entry();
+}
+
+// Expose the stored pthread id (0 while no thread is recorded).
+const pthread_t &Thread::get_thread_id() const
+{
+  const pthread_t &id = thread_id;
+  return id;
+}
+
+// A thread counts as started while a non-zero pthread id is recorded.
+bool Thread::is_started() const
+{
+  return !(thread_id == 0);
+}
+
+// True when the caller is running on the thread represented by this object.
+// pthread_t is an opaque type, so the comparison goes through
+// pthread_equal() rather than operator==.
+bool Thread::am_self() const
+{
+  return pthread_equal(pthread_self(), thread_id) != 0;
+}
+
+// Deliver `signal` to the thread with pthread_kill().
+// Returns -EINVAL when no thread id is recorded, otherwise whatever
+// pthread_kill() returns.
+int Thread::kill(int signal)
+{
+  if (!thread_id)
+    return -EINVAL;
+  return pthread_kill(thread_id, signal);
+}
+
+// Start the thread, running _entry_func(this).
+//
+// `stacksize` is masked with CEPH_PAGE_MASK first; a non-zero result is
+// applied through a pthread attribute object, while zero leaves the
+// attributes NULL (pthread defaults).
+// Returns the result of pthread_create() (0 on success).
+// NOTE(review): the return values of pthread_attr_init() and
+// pthread_attr_setstacksize() are not checked.
+int Thread::try_create(size_t stacksize)
+{
+  pthread_attr_t *thread_attr = NULL;
+  pthread_attr_t thread_attr_loc;
+  
+  stacksize &= CEPH_PAGE_MASK;  // must be multiple of page
+  if (stacksize) {
+    thread_attr = &thread_attr_loc;
+    pthread_attr_init(thread_attr);
+    pthread_attr_setstacksize(thread_attr, stacksize);
+  }
+
+  int r;
+
+  // The child thread will inherit our signal mask.  Set our signal mask to
+  // the set of signals we want to block.  (It's ok to block more signals
+  // than usual for a little while -- they will just be delivered to
+  // another thread or delivered to this thread later.)
+
+  #ifndef _WIN32
+  sigset_t old_sigset;
+  if (g_code_env == CODE_ENVIRONMENT_LIBRARY) {
+    block_signals(NULL, &old_sigset);
+  }
+  else {
+    int to_block[] = { SIGPIPE , 0 };
+    block_signals(to_block, &old_sigset);
+  }
+  r = pthread_create(&thread_id, thread_attr, _entry_func, (void*)this);
+  // Put the creator's own mask back once the child has been created.
+  restore_sigset(&old_sigset);
+  #else
+  r = pthread_create(&thread_id, thread_attr, _entry_func, (void*)this);
+  #endif
+
+  if (thread_attr) {
+    pthread_attr_destroy(thread_attr);	
+  }
+
+  return r;
+}
+
+// Record the thread name and start the thread; failure to start is fatal.
+// `name` must be shorter than 16 characters (asserted).
+void Thread::create(const char *name, size_t stacksize)
+{
+  ceph_assert(strlen(name) < 16);
+  thread_name = name;
+
+  int ret = try_create(stacksize);
+  if (ret == 0)
+    return;
+
+  // Log the pthread_create() error before asserting.
+  char msg[256];
+  snprintf(msg, sizeof(msg), "Thread::try_create(): pthread_create "
+	   "failed with error %d", ret);
+  dout_emergency(msg);
+  ceph_assert(ret == 0);
+}
+
+// Wait for the thread to finish, storing its return value through `prval`
+// (passed straight to pthread_join()).  Joining a thread that was never
+// started aborts; a pthread_join() failure is logged and then asserted.
+// On success the stored thread id is cleared and 0 is returned.
+int Thread::join(void **prval)
+{
+  if (!thread_id) {
+    ceph_abort_msg("join on thread that was never started");
+    return -EINVAL;
+  }
+
+  int status = pthread_join(thread_id, prval);
+  if (status != 0) {
+    char msg[256];
+    snprintf(msg, sizeof(msg), "Thread::join(): pthread_join "
+	     "failed with error %d\n", status);
+    dout_emergency(msg);
+    ceph_assert(status == 0);
+  }
+
+  thread_id = 0;
+  return status;
+}
+
+// Detach the thread so its resources are reclaimed when it exits.
+// Returns -EINVAL when no thread id is recorded (same convention as
+// kill()), otherwise whatever pthread_detach() returns.  Handing a zero
+// pthread_t to pthread_detach() is not a valid call.
+int Thread::detach()
+{
+  if (!thread_id)
+    return -EINVAL;
+  return pthread_detach(thread_id);
+}
+
+// Remember the requested CPU.  The affinity is applied immediately only
+// when a tid has been recorded and the caller is that very thread;
+// otherwise the request is just stored and 0 is returned.
+int Thread::set_affinity(int id)
+{
+  cpuid = id;
+  const bool on_own_thread = pid && ceph_gettid() == pid;
+  return on_own_thread ? _set_affinity(id) : 0;
+}
+
+// Functions for std::thread
+// =========================
+
+// Name the thread behind `t`; a non-zero result from ceph_pthread_setname()
+// is thrown as std::system_error in the generic category.
+void set_thread_name(std::thread& t, const std::string& s) {
+  const int rc = ceph_pthread_setname(t.native_handle(), s.c_str());
+  if (rc == 0) {
+    return;
+  }
+  throw std::system_error(rc, std::generic_category());
+}
+// Fetch the name of the thread behind `t` into a 256-byte scratch buffer and
+// return it trimmed at the first NUL.  A non-zero result from
+// ceph_pthread_getname() is thrown as std::system_error.
+std::string get_thread_name(const std::thread& t) {
+  std::string name(256, '\0');
+
+  // native_handle() is non-const, hence the const_cast.
+  auto handle = const_cast<std::thread&>(t).native_handle();
+  const int rc = ceph_pthread_getname(handle, name.data(), name.length());
+  if (rc != 0) {
+    throw std::system_error(rc, std::generic_category());
+  }
+
+  name.resize(std::strlen(name.data()));
+  return name;
+}
+
+// Send `signal` to the thread behind `t`; a pthread_kill() failure is thrown
+// as std::system_error in the generic category.
+void kill(std::thread& t, int signal)
+{
+  const auto rc = pthread_kill(t.native_handle(), signal);
+  if (rc == 0) {
+    return;
+  }
+  throw std::system_error(rc, std::generic_category());
+}
diff --git a/src/common/Thread.h b/src/common/Thread.h
new file mode 100644
index 000000000..5242fb5f3
--- /dev/null
+++ b/src/common/Thread.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_THREAD_H
+#define CEPH_THREAD_H
+
+#include <functional>
+#include <string_view>
+#include <system_error>
+#include <thread>
+
+#include <pthread.h>
+#include <sys/types.h>
+
+#include "include/compat.h"
+
+extern pid_t ceph_gettid();
+
+// Base class for pthread-backed worker threads: derive from it, implement
+// entry() and call create().  Objects are neither copyable nor assignable.
+class Thread {
+ private:
+  pthread_t thread_id;      // 0 while no thread is recorded
+  pid_t pid;                // tid recorded by the thread itself; 0 until then
+  int cpuid;                // requested CPU, or -1 for no affinity
+  std::string thread_name;  // set by create()
+
+  // Runs on the new thread ahead of entry().
+  void *entry_wrapper();
+
+ public:
+  Thread(const Thread&) = delete;
+  Thread& operator=(const Thread&) = delete;
+
+  Thread();
+  virtual ~Thread();
+
+ protected:
+  // Thread body supplied by the subclass.
+  virtual void *entry() = 0;
+
+ private:
+  // Start routine handed to pthread_create().
+  static void *_entry_func(void *arg);
+
+ public:
+  const pthread_t &get_thread_id() const;
+  pid_t get_pid() const { return pid; }
+  bool is_started() const;   // true while thread_id is non-zero
+  bool am_self() const;      // true when called from this thread
+  int kill(int signal);      // -EINVAL if not started, else pthread_kill()
+  int try_create(size_t stacksize);  // returns pthread_create()'s result
+  // Asserts strlen(name) < 16 and that the thread could be started.
+  void create(const char *name, size_t stacksize = 0);
+  int join(void **prval = 0);
+  int detach();
+  int set_affinity(int cpuid);
+};
+
+// Functions for use with std::thread
+
+void set_thread_name(std::thread& t, const std::string& s);
+std::string get_thread_name(const std::thread& t);
+void kill(std::thread& t, int signal);
+
+// Start a std::thread that first names itself `n` and then invokes
+// fun(args...).  The name is copied into the closure as a std::string, so
+// the string_view only has to stay valid for the duration of this call.
+// NOTE(review): inside the lambda the parameters are forwarded with the
+// outer Fun/Args types rather than the lambda's own deduced types --
+// confirm this is intended for lvalue arguments.
+template<typename Fun, typename... Args>
+std::thread make_named_thread(std::string_view n,
+			      Fun&& fun,
+			      Args&& ...args) {
+
+  return std::thread([n = std::string(n)](auto&& fun, auto&& ...args) {
+		       ceph_pthread_setname(pthread_self(), n.data());
+		       std::invoke(std::forward<Fun>(fun),
+				   std::forward<Args>(args)...);
+		     }, std::forward<Fun>(fun), std::forward<Args>(args)...);
+}
+#endif
diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc
new file mode 100644
index 000000000..7af940279
--- /dev/null
+++ b/src/common/Throttle.cc
@@ -0,0 +1,887 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/scope_guard.h"
+
+#include "common/Throttle.h"
+#include "common/ceph_time.h"
+#include "common/perf_counters.h"
+
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_throttle
+
+#undef dout_prefix
+#define dout_prefix *_dout << "throttle(" << name << " " << (void*)this << ") "
+
+using std::list;
+using std::ostream;
+using std::string;
+
+using ceph::mono_clock;
+using ceph::mono_time;
+using ceph::timespan;
+
+// Perf counter indices for Throttle.  l_throttle_first and l_throttle_last
+// are the bounds handed to PerfCountersBuilder in the Throttle constructor;
+// the values in between identify the individual counters.
+enum {
+  l_throttle_first = 532430,
+  l_throttle_val,
+  l_throttle_max,
+  l_throttle_get_started,
+  l_throttle_get,
+  l_throttle_get_sum,
+  l_throttle_get_or_fail_fail,
+  l_throttle_get_or_fail_success,
+  l_throttle_take,
+  l_throttle_take_sum,
+  l_throttle_put,
+  l_throttle_put_sum,
+  l_throttle_wait,
+  l_throttle_last,
+};
+
+// Build a throttle named `n` with limit `m` (asserted to be >= 0).  Perf
+// counters are registered only when both `_use_perf` and the
+// throttler_perf_counter config option are set.
+Throttle::Throttle(CephContext *cct, const std::string& n, int64_t m,
+		   bool _use_perf)
+  : cct(cct), name(n), max(m),
+    use_perf(_use_perf)
+{
+  ceph_assert(m >= 0);
+
+  if (!use_perf || !cct->_conf->throttler_perf_counter)
+    return;
+
+  PerfCountersBuilder builder(cct, string("throttle-") + name,
+			      l_throttle_first, l_throttle_last);
+  builder.add_u64(l_throttle_val, "val", "Currently taken slots");
+  builder.add_u64(l_throttle_max, "max", "Max value for throttle");
+  builder.add_u64_counter(l_throttle_get_started, "get_started",
+			  "Number of get calls, increased before wait");
+  builder.add_u64_counter(l_throttle_get, "get", "Gets");
+  builder.add_u64_counter(l_throttle_get_sum, "get_sum", "Got data");
+  builder.add_u64_counter(l_throttle_get_or_fail_fail, "get_or_fail_fail",
+			  "Get blocked during get_or_fail");
+  builder.add_u64_counter(l_throttle_get_or_fail_success,
+			  "get_or_fail_success",
+			  "Successful get during get_or_fail");
+  builder.add_u64_counter(l_throttle_take, "take", "Takes");
+  builder.add_u64_counter(l_throttle_take_sum, "take_sum", "Taken data");
+  builder.add_u64_counter(l_throttle_put, "put", "Puts");
+  builder.add_u64_counter(l_throttle_put_sum, "put_sum", "Put data");
+  builder.add_time_avg(l_throttle_wait, "wait", "Waiting latency");
+
+  logger = { builder.create_perf_counters(), cct };
+  cct->get_perfcounters_collection()->add(logger.get());
+  logger->set(l_throttle_max, max);
+}
+
+// Destroying a throttle that still has waiters queued is a bug.
+Throttle::~Throttle()
+{
+  std::lock_guard guard(lock);
+  ceph_assert(conds.empty());
+}
+
+// Change the limit to `m`; the lock must be held by the caller.  When the
+// value actually changes, the first waiter (if any) is notified and the
+// "max" perf counter is updated.
+void Throttle::_reset_max(int64_t m)
+{
+  if (max != m) {
+    if (!conds.empty()) {
+      conds.front().notify_one();
+    }
+    if (logger) {
+      logger->set(l_throttle_max, m);
+    }
+    max = m;
+  }
+}
+
+// Block until `c` units may be taken and this caller is at the head of the
+// waiter queue.  `l` is the lock the condition variable waits on.
+// Returns true if the caller actually had to wait.
+bool Throttle::_wait(int64_t c, std::unique_lock<std::mutex>& l)
+{
+  mono_time start;
+  bool waited = false;
+  if (_should_wait(c) || !conds.empty()) { // always wait behind other waiters.
+    {
+      // Queue our own condition variable at the tail; the scope guard
+      // removes it again when this inner block is left.
+      auto cv = conds.emplace(conds.end());
+      auto w = make_scope_guard([this, cv]() {
+	  conds.erase(cv);
+	});
+      waited = true;
+      ldout(cct, 2) << "_wait waiting..." << dendl;
+      if (logger)
+	start = mono_clock::now();
+
+      // Proceed only when the request fits AND we are first in line.
+      cv->wait(l, [this, c, cv]() { return (!_should_wait(c) &&
+					    cv == conds.begin()); });
+      ldout(cct, 2) << "_wait finished waiting" << dendl;
+      if (logger) {
+	logger->tinc(l_throttle_wait, mono_clock::now() - start);
+      }
+    }
+    // wake up the next guy
+    if (!conds.empty())
+      conds.front().notify_one();
+  }
+  return waited;
+}
+
+// Wait until the throttle has room, without taking anything.  A non-zero
+// `m` (asserted positive) first becomes the new limit.  Returns true if the
+// caller had to block; returns false immediately when both the current
+// limit and `m` are zero.
+bool Throttle::wait(int64_t m)
+{
+  if (0 == max && 0 == m)
+    return false;
+
+  std::unique_lock guard(lock);
+  if (m != 0) {
+    ceph_assert(m > 0);
+    _reset_max(m);
+  }
+  ldout(cct, 10) << "wait" << dendl;
+  return _wait(0, guard);
+}
+
+// Add `c` to the count unconditionally, without waiting for room.
+// Returns the count after the addition, or 0 without touching anything when
+// the limit is zero.
+int64_t Throttle::take(int64_t c)
+{
+  if (0 == max)
+    return 0;
+
+  ceph_assert(c >= 0);
+  ldout(cct, 10) << "take " << c << dendl;
+
+  count += c;
+  if (logger) {
+    logger->inc(l_throttle_take);
+    logger->inc(l_throttle_take_sum, c);
+    logger->set(l_throttle_val, count);
+  }
+  return count;
+}
+
+// Take `c` units, blocking until they fit.  A non-zero `m` (asserted
+// positive) first becomes the new limit.  Returns true if the caller had to
+// wait.  When both the limit and `m` are zero the count is simply bumped
+// and false is returned.
+bool Throttle::get(int64_t c, int64_t m)
+{
+  if (0 == max && 0 == m) {
+    count += c;
+    return false;
+  }
+
+  ceph_assert(c >= 0);
+  ldout(cct, 10) << "get " << c << " (" << count.load() << " -> " << (count.load() + c) << ")" << dendl;
+  if (logger) {
+    logger->inc(l_throttle_get_started);
+  }
+
+  bool waited;
+  {
+    std::unique_lock guard(lock);
+    if (m != 0) {
+      ceph_assert(m > 0);
+      _reset_max(m);
+    }
+    waited = _wait(c, guard);
+    count += c;
+  }
+
+  if (logger) {
+    logger->inc(l_throttle_get);
+    logger->inc(l_throttle_get_sum, c);
+    logger->set(l_throttle_val, count);
+  }
+  return waited;
+}
+
+/* Returns true if it successfully got the requested amount,
+ * or false if it would block.
+ *
+ * The request fails when it does not fit or when other callers are already
+ * queued.  With a zero limit the count is bumped and true is returned.
+ */
+bool Throttle::get_or_fail(int64_t c)
+{
+  if (0 == max) {
+    count += c;
+    return true;
+  }
+
+  // ceph_assert, like every other check in this file
+  ceph_assert(c >= 0);
+  bool result = false;
+  {
+    std::lock_guard l(lock);
+    if (_should_wait(c) || !conds.empty()) {
+      ldout(cct, 10) << "get_or_fail " << c << " failed" << dendl;
+    } else {
+      ldout(cct, 10) << "get_or_fail " << c << " success (" << count.load()
+	<< " -> " << (count.load() + c) << ")" << dendl;
+      count += c;
+      result = true;
+    }
+  }
+
+  if (logger) {
+    if (result) {
+      logger->inc(l_throttle_get_or_fail_success);
+      logger->inc(l_throttle_get);
+      logger->inc(l_throttle_get_sum, c);
+      logger->set(l_throttle_val, count);
+    } else {
+      logger->inc(l_throttle_get_or_fail_fail);
+    }
+  }
+  return result;
+}
+
+// Give back `c` units and notify the first waiter, if any.  Returns the
+// count after the release.  With a zero limit the count is decremented and
+// 0 is returned.
+int64_t Throttle::put(int64_t c)
+{
+  if (0 == max) {
+    count -= c;
+    return 0;
+  }
+
+  ceph_assert(c >= 0);
+  ldout(cct, 10) << "put " << c << " (" << count.load() << " -> "
+		 << (count.load()-c) << ")" << dendl;
+
+  int64_t remaining;
+  {
+    std::lock_guard guard(lock);
+    if (c == 0) {
+      remaining = count;
+    } else {
+      if (!conds.empty()) {
+	conds.front().notify_one();
+      }
+      // if count goes negative, we failed somewhere!
+      ceph_assert(count >= c);
+      remaining = (count -= c);
+    }
+  }
+
+  if (logger) {
+    logger->inc(l_throttle_put);
+    logger->inc(l_throttle_put_sum, c);
+    logger->set(l_throttle_val, count);
+  }
+  return remaining;
+}
+
+// Zero the count, notifying the first waiter (if any) beforehand and
+// updating the "val" perf counter afterwards.
+void Throttle::reset()
+{
+  std::lock_guard guard(lock);
+  if (!conds.empty()) {
+    conds.front().notify_one();
+  }
+  count = 0;
+  if (logger)
+    logger->set(l_throttle_val, 0);
+}
+
+// Perf counter indices for BackoffThrottle, numbered right after the
+// Throttle range.  The first/last values are the bounds handed to
+// PerfCountersBuilder in the BackoffThrottle constructor.
+enum {
+  l_backoff_throttle_first = l_throttle_last + 1,
+  l_backoff_throttle_val,
+  l_backoff_throttle_max,
+  l_backoff_throttle_get,
+  l_backoff_throttle_get_sum,
+  l_backoff_throttle_take,
+  l_backoff_throttle_take_sum,
+  l_backoff_throttle_put,
+  l_backoff_throttle_put_sum,
+  l_backoff_throttle_wait,
+  l_backoff_throttle_last,
+};
+
+// Build a backoff throttle named `n`.  Perf counters are registered only
+// when both `_use_perf` and the throttler_perf_counter config option are
+// set.
+BackoffThrottle::BackoffThrottle(CephContext *cct, const std::string& n,
+				 unsigned expected_concurrency, bool _use_perf)
+  : name(n),
+    conds(expected_concurrency),///< [in] determines size of conds
+    use_perf(_use_perf)
+{
+  if (!use_perf || !cct->_conf->throttler_perf_counter)
+    return;
+
+  PerfCountersBuilder builder(cct, string("throttle-") + name,
+			      l_backoff_throttle_first,
+			      l_backoff_throttle_last);
+  builder.add_u64(l_backoff_throttle_val, "val",
+		  "Currently available throttle");
+  builder.add_u64(l_backoff_throttle_max, "max", "Max value for throttle");
+  builder.add_u64_counter(l_backoff_throttle_get, "get", "Gets");
+  builder.add_u64_counter(l_backoff_throttle_get_sum, "get_sum", "Got data");
+  builder.add_u64_counter(l_backoff_throttle_take, "take", "Takes");
+  builder.add_u64_counter(l_backoff_throttle_take_sum, "take_sum",
+			  "Taken data");
+  builder.add_u64_counter(l_backoff_throttle_put, "put", "Puts");
+  builder.add_u64_counter(l_backoff_throttle_put_sum, "put_sum", "Put data");
+  builder.add_time_avg(l_backoff_throttle_wait, "wait", "Waiting latency");
+
+  logger = { builder.create_perf_counters(), cct };
+  cct->get_perfcounters_collection()->add(logger.get());
+  logger->set(l_backoff_throttle_max, max);
+}
+
+// Destroying a throttle that still has waiters queued is a bug.
+BackoffThrottle::~BackoffThrottle()
+{
+  std::lock_guard guard(lock);
+  ceph_assert(waiters.empty());
+}
+
+// Validate and install the backoff parameters.
+//
+// Every violated constraint is reported on `errstream` (when non-null); if
+// any check fails the function returns false without modifying the
+// throttle.  On success the thresholds, per-count delays, limit and the two
+// slopes (s0, s1) are updated under the lock, waiters are kicked and true is
+// returned.
+//
+// NOTE(review): `_expected_throughput == 0` passes validation (only
+// negative values are rejected) but is used as a divisor below -- confirm
+// callers never pass 0.
+bool BackoffThrottle::set_params(
+  double _low_threshold,
+  double _high_threshold,
+  double _expected_throughput,
+  double _high_multiple,
+  double _max_multiple,
+  uint64_t _throttle_max,
+  ostream *errstream)
+{
+  bool valid = true;
+  if (_low_threshold > _high_threshold) {
+    valid = false;
+    if (errstream) {
+      *errstream << "low_threshold (" << _low_threshold
+		 << ") > high_threshold (" << _high_threshold
+		 << ")" << std::endl;
+    }
+  }
+
+  if (_high_multiple > _max_multiple) {
+    valid = false;
+    if (errstream) {
+      *errstream << "_high_multiple (" << _high_multiple
+		 << ") > _max_multiple (" << _max_multiple
+		 << ")" << std::endl;
+    }
+  }
+
+  // both thresholds must lie in [0, 1]
+  if (_low_threshold > 1 || _low_threshold < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid low_threshold (" << _low_threshold << ")"
+		 << std::endl;
+    }
+  }
+
+  if (_high_threshold > 1 || _high_threshold < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid high_threshold (" << _high_threshold << ")"
+		 << std::endl;
+    }
+  }
+
+  if (_max_multiple < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid _max_multiple ("
+		 << _max_multiple << ")"
+		 << std::endl;
+    }
+  }
+
+  if (_high_multiple < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid _high_multiple ("
+		 << _high_multiple << ")"
+		 << std::endl;
+    }
+  }
+
+  if (_expected_throughput < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid _expected_throughput("
+		 << _expected_throughput << ")"
+		 << std::endl;
+    }
+  }
+
+  if (!valid)
+    return false;
+
+  locker l(lock);
+  low_threshold = _low_threshold;
+  high_threshold = _high_threshold;
+  high_delay_per_count = _high_multiple / _expected_throughput;
+  max_delay_per_count = _max_multiple / _expected_throughput;
+  max = _throttle_max;
+
+  if (logger)
+    logger->set(l_backoff_throttle_max, max);
+
+  // s0: slope of the delay between low_threshold and high_threshold
+  if (high_threshold - low_threshold > 0) {
+    s0 = high_delay_per_count / (high_threshold - low_threshold);
+  } else {
+    low_threshold = high_threshold;
+    s0 = 0;
+  }
+
+  // s1: slope of the delay between high_threshold and 1
+  if (1 - high_threshold > 0) {
+    s1 = (max_delay_per_count - high_delay_per_count)
+      / (1 - high_threshold);
+  } else {
+    high_threshold = 1;
+    s1 = 0;
+  }
+
+  _kick_waiters();
+  return true;
+}
+
+// Delay to impose on a request for `c` units at the current fill ratio
+// (current / max): zero below low_threshold, rising with slope s0 up to
+// high_threshold, then high_delay_per_count plus slope s1 beyond it.  A
+// zero limit always yields no delay.
+ceph::timespan BackoffThrottle::_get_delay(uint64_t c) const
+{
+  if (max == 0)
+    return ceph::timespan(0);
+
+  const double ratio = ((double)current) / ((double)max);
+  if (ratio < low_threshold)
+    return ceph::timespan(0);
+
+  if (ratio < high_threshold) {
+    const double per_count = (ratio - low_threshold) * s0;
+    return c * ceph::make_timespan(per_count);
+  }
+
+  const double per_count =
+    high_delay_per_count + ((ratio - high_threshold) * s1);
+  return c * ceph::make_timespan(per_count);
+}
+
+// Take `c` units, waiting in FIFO order and applying the backoff delay.
+//
+// Fast path: no delay is due, nobody is queued and the request fits (or the
+// throttle is unlimited or empty) -- take the units and return zero.
+// Otherwise the caller queues a ticket, waits to reach the head of the
+// queue, then waits until the request fits and the delay has elapsed.
+// Returns the time spent since reaching the head of the queue.
+ceph::timespan BackoffThrottle::get(uint64_t c)
+{
+  locker l(lock);
+  auto delay = _get_delay(c);
+
+  if (logger) {
+    logger->inc(l_backoff_throttle_get);
+    logger->inc(l_backoff_throttle_get_sum, c);
+  }
+
+  // fast path
+  if (delay.count() == 0 &&
+      waiters.empty() &&
+      ((max == 0) || (current == 0) || ((current + c) <= max))) {
+    current += c;
+
+    if (logger) {
+      logger->set(l_backoff_throttle_val, current);
+    }
+
+    return ceph::make_timespan(0);
+  }
+
+  auto ticket = _push_waiter();
+  auto wait_from = mono_clock::now();
+  bool waited = false;
+
+  // Phase 1: wait for our ticket to reach the head of the queue.
+  while (waiters.begin() != ticket) {
+    (*ticket)->wait(l);
+    waited = true;
+  }
+
+  // Phase 2: at the head; wait for room, then for the remaining delay.
+  auto start = mono_clock::now();
+  delay = _get_delay(c);
+  while (true) {
+    if (max != 0 && current != 0 && (current + c) > max) {
+      (*ticket)->wait(l);
+      waited = true;
+    } else if (delay.count() > 0) {
+      (*ticket)->wait_for(l, delay);
+      waited = true;
+    } else {
+      break;
+    }
+    ceph_assert(ticket == waiters.begin());
+    // Recompute the delay and subtract the time already spent at the head.
+    delay = _get_delay(c);
+    auto elapsed = mono_clock::now() - start;
+    if (delay <= elapsed) {
+      delay = timespan::zero();
+    } else {
+      delay -= elapsed;
+    }
+  }
+  waiters.pop_front();
+  _kick_waiters();
+
+  current += c;
+
+  if (logger) {
+    logger->set(l_backoff_throttle_val, current);
+    if (waited) {
+      logger->tinc(l_backoff_throttle_wait, mono_clock::now() - wait_from);
+    }
+  }
+
+  return mono_clock::now() - start;
+}
+
+// Give back `c` units (asserting that at least that many are held), kick
+// the waiters and return the remaining total.
+uint64_t BackoffThrottle::put(uint64_t c)
+{
+  locker guard(lock);
+
+  ceph_assert(current >= c);
+  current -= c;
+  _kick_waiters();
+
+  if (logger) {
+    logger->inc(l_backoff_throttle_put);
+    logger->inc(l_backoff_throttle_put_sum, c);
+    logger->set(l_backoff_throttle_val, current);
+  }
+  return current;
+}
+
+// Add `c` units unconditionally, with no waiting and no delay, and return
+// the new total.
+uint64_t BackoffThrottle::take(uint64_t c)
+{
+  locker guard(lock);
+
+  current += c;
+  if (logger) {
+    logger->inc(l_backoff_throttle_take);
+    logger->inc(l_backoff_throttle_take_sum, c);
+    logger->set(l_backoff_throttle_val, current);
+  }
+  return current;
+}
+
+// Snapshot of the units currently held, read under the lock.
+uint64_t BackoffThrottle::get_current()
+{
+  locker guard(lock);
+  const uint64_t held = current;
+  return held;
+}
+
+// Snapshot of the configured limit, read under the lock.
+uint64_t BackoffThrottle::get_max()
+{
+  locker guard(lock);
+  const uint64_t limit = max;
+  return limit;
+}
+
+// `max` bounds the number of concurrently started ops; `ignore_enoent`
+// makes end_op() skip -ENOENT when recording the first error.
+SimpleThrottle::SimpleThrottle(uint64_t max, bool ignore_enoent)
+  : m_max(max),
+    m_ignore_enoent(ignore_enoent)
+{
+}
+
+// All ops must have ended and nobody may still be blocked on the throttle.
+SimpleThrottle::~SimpleThrottle()
+{
+  std::lock_guard guard(m_lock);
+  ceph_assert(m_current == 0);
+  ceph_assert(waiters == 0);
+}
+
+// Block while the number of running ops equals the limit, then count this
+// op as running.  `waiters` tracks callers currently blocked here.
+void SimpleThrottle::start_op()
+{
+  std::unique_lock guard(m_lock);
+
+  ++waiters;
+  while (m_max == m_current) {
+    m_cond.wait(guard);
+  }
+  --waiters;
+
+  ++m_current;
+}
+
+// Mark one op as finished and wake everybody blocked on the throttle.
+// The first negative result is remembered for wait_for_ret(); -ENOENT is
+// skipped when the throttle was built with ignore_enoent.
+void SimpleThrottle::end_op(int r)
+{
+  std::lock_guard l(m_lock);
+  // An end_op() without a matching start_op() would wrap the counter;
+  // catch it here, as OrderedThrottle::end_op() does.
+  ceph_assert(m_current > 0);
+  --m_current;
+  if (r < 0 && !m_ret && !(r == -ENOENT && m_ignore_enoent))
+    m_ret = r;
+  m_cond.notify_all();
+}
+
+bool SimpleThrottle::pending_error() const
+{
+  std::lock_guard l(m_lock);
+  return (m_ret < 0);
+}
+
+// Block until no op is running any more, then return the recorded result
+// (m_ret).
+int SimpleThrottle::wait_for_ret()
+{
+  std::unique_lock guard(m_lock);
+
+  ++waiters;
+  while (m_current != 0) {
+    m_cond.wait(guard);
+  }
+  --waiters;
+
+  return m_ret;
+}
+
+void C_OrderedThrottle::finish(int r) {
+  m_ordered_throttle->finish_op(m_tid, r);  // record result r under this op's tid
+}
+
+OrderedThrottle::OrderedThrottle(uint64_t max, bool ignore_enoent)
+  : m_max(max), m_ignore_enoent(ignore_enoent) {}
+
+OrderedThrottle::~OrderedThrottle() {
+  std::lock_guard guard{m_lock};
+  ceph_assert(waiters == 0);  // nobody may still be blocked in start_op()/wait_for_ret()
+}
+
+C_OrderedThrottle *OrderedThrottle::start_op(Context *on_finish) {
+  ceph_assert(on_finish);
+  // Register the op under the next tid before (possibly) blocking for a slot.
+  std::unique_lock l(m_lock);
+  uint64_t tid = m_next_tid++;
+  m_tid_result[tid] = Result(on_finish);
+  auto ctx = std::make_unique<C_OrderedThrottle>(this, tid);
+  // Run callbacks that are ready, then wait while all m_max slots are in use.
+  complete_pending_ops(l);
+  while (m_max == m_current) {
+    ++waiters;
+    m_cond.wait(l);
+    --waiters;
+    complete_pending_ops(l);
+  }
+  ++m_current;
+  // The completion context is handed to the caller as a raw pointer.
+  return ctx.release();
+}
+
+void OrderedThrottle::end_op(int r) {
+  std::lock_guard guard(m_lock);
+  ceph_assert(m_current > 0);
+  const bool ignored = (r == -ENOENT && m_ignore_enoent);  // -ENOENT may be configured away
+  if (r < 0 && !ignored && m_ret_val == 0) {
+    m_ret_val = r;  // only the first error is kept
+  }
+  --m_current;
+  m_cond.notify_all();
+}
+
+void OrderedThrottle::finish_op(uint64_t tid, int r) {
+  std::lock_guard guard(m_lock);
+  // The tid has to be registered still: start_op() inserted it.
+  const auto entry = m_tid_result.find(tid);
+  ceph_assert(entry != m_tid_result.end());
+  Result &result = entry->second;
+  result.finished = true;
+  result.ret_val = r;
+  m_cond.notify_all();  // wake start_op()/wait_for_ret() so they can run the callback
+}
+
+bool OrderedThrottle::pending_error() const {
+  std::lock_guard guard(m_lock);  // m_lock is mutable
+  return m_ret_val < 0;
+}
+
+int OrderedThrottle::wait_for_ret() {
+  std::unique_lock guard(m_lock);
+  complete_pending_ops(guard);
+  // Sleep until no op is in flight, running ready callbacks after every wakeup.
+  while (m_current > 0) {
+    ++waiters;
+    m_cond.wait(guard);
+    --waiters;
+    complete_pending_ops(guard);
+  }
+  return m_ret_val;
+}
+
+void OrderedThrottle::complete_pending_ops(std::unique_lock<std::mutex>& l) {
+  while (true) {
+    auto it = m_tid_result.begin();
+    if (it == m_tid_result.end() || it->first != m_complete_tid ||
+        !it->second.finished) {
+      break;
+    }
+    // Copy the entry out: it is erased before the callback runs.
+    Result result = it->second;
+    m_tid_result.erase(it);
+    // Invoke the callback with the lock released.
+    l.unlock();
+    result.on_finish->complete(result.ret_val);
+    l.lock();
+    // Move on to the next tid only after the callback has returned.
+    ++m_complete_tid;
+  }
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "TokenBucketThrottle(" << m_name << " " \
+                           << (void*)this << ") "
+
+uint64_t TokenBucketThrottle::Bucket::get(uint64_t c) {
+  // Take up to c tokens out of the bucket.
+  //
+  // Returns the number of tokens actually handed out: c when at least c are
+  // available, otherwise everything that is currently available. Both
+  // `available` and `remain` shrink by the amount handed out.
+
+  // A max of zero yields no tokens at all.
+  if (0 == max) {
+    return 0;
+  }
+
+  // The full request if it fits, else whatever is available.
+  const uint64_t granted = (c <= available) ? c : available;
+  available -= granted;
+  remain -= granted;
+
+  return granted;
+}
+
+uint64_t TokenBucketThrottle::Bucket::put(uint64_t tokens, double burst_ratio) {
+  // Refill the bucket with `tokens` and return the resulting `remain`.
+  //
+  // `remain` grows by `tokens` but is capped at `capacity`. `available` grows
+  // by `tokens` (scaled by burst_ratio when that is above 1) and is capped at
+  // the smaller of `max` and the updated `remain`.
+
+  if (0 == max) {
+    return 0;
+  }
+  if (0 == tokens) {
+    return remain;
+  }
+
+  // put tokens into bucket, never beyond its capacity
+  const uint64_t refilled = remain + tokens;
+  remain = (refilled <= capacity) ? refilled : capacity;
+
+  // available tokens increase at burst speed
+  uint64_t step = tokens;
+  if (burst_ratio > 1) {
+    step = (uint64_t)(tokens * burst_ratio);
+  }
+  const uint64_t ceiling = (remain > max) ? max : remain;
+  const uint64_t grown = available + step;
+  available = (grown <= ceiling) ? grown : ceiling;
+
+  return remain;
+}
+
+void TokenBucketThrottle::Bucket::set_max(uint64_t max, uint64_t burst_seconds) {
+  // Capacity is max * burst_seconds with burst_seconds treated as at least 1,
+  // so the capacity of the bucket is never less than max.
+  const uint64_t seconds = (burst_seconds < 1) ? 1 : burst_seconds;
+  const uint64_t wanted = max * seconds;
+  if (wanted != capacity) {
+    // a resized bucket starts out full
+    capacity = wanted;
+    remain = wanted;
+  }
+  // what can be handed out right now may not exceed the new max
+  if (0 == max || available > max) {
+    available = max;
+  }
+  this->max = max;
+}
+
+TokenBucketThrottle::TokenBucketThrottle(
+    CephContext *cct,
+    const std::string &name,  // also used to derive the bucket and lock names
+    uint64_t burst,           // initial size of the bucket
+    uint64_t avg,
+    SafeTimer *timer,         // timer that schedule_timer() arms
+    ceph::mutex *timer_lock)  // taken around timer calls in set_limit() and the destructor
+  : m_cct(cct), m_name(name),
+    m_throttle(m_cct, name + "_bucket", burst),
+    m_burst(burst), m_avg(avg), m_timer(timer), m_timer_lock(timer_lock),
+    m_lock(ceph::make_mutex(name + "_lock"))
+{}
+
+TokenBucketThrottle::~TokenBucketThrottle() {
+  // cancel the timer events.
+  {
+    std::lock_guard timer_locker(*m_timer_lock);
+    cancel_timer();
+  }
+  // Detach every still-blocked request while holding m_lock ...
+  list<Blocker> pending;
+  {
+    std::lock_guard blockers_lock(m_lock);
+    pending.swap(m_blockers);
+  }
+  // ... and complete them only after the lock has been dropped.
+  for (auto blocker : pending) {
+    blocker.ctx->complete(0);
+  }
+}
+
+int TokenBucketThrottle::set_limit(uint64_t average, uint64_t burst, uint64_t burst_seconds) {
+  {
+    std::lock_guard lock{m_lock};
+    // A non-zero burst below the average is rejected before any state changes.
+    if (0 < burst && burst < average) {
+      // the burst should never be less than the average.
+      return -EINVAL;
+    }
+
+    m_avg = average;
+    m_burst = burst;
+
+    if (0 == average) {
+      // The limit is not set, and no tokens will be put into the bucket.
+      // So, we can schedule the timer slowly, or even cancel it.
+      m_tick = 1000;
+    } else {
+      // calculate the tick (ms), but never go below the minimum.
+      m_tick = 1000 / average;
+      if (m_tick < m_tick_min) {
+        m_tick = m_tick_min;
+      }
+      // NOTE(review): a m_tick_min above 1000 makes m_ticks_per_second 0 here.
+      // ticks per second, needed because avg may not divide evenly per tick.
+      m_ticks_per_second = 1000 / m_tick;
+      m_current_tick = 0;
+
+      // with no explicit burst, the bucket is sized from the average.
+      m_throttle.set_max(0 == burst ? average : burst, burst_seconds);
+    }
+    // turn millisecond to second
+    m_schedule_tick = m_tick / 1000.0;
+  }
+
+  // The schedule period will be changed when the average rate is set.
+  {
+    std::lock_guard timer_locker{*m_timer_lock};
+    cancel_timer();
+    schedule_timer();
+  }
+  return 0;
+}
+
+void TokenBucketThrottle::set_schedule_tick_min(uint64_t tick) {
+  std::lock_guard guard(m_lock);
+  // zero is ignored; the current minimum stays in effect
+  if (0 == tick) return;
+  m_tick_min = tick;
+}
+
+uint64_t TokenBucketThrottle::tokens_filled(double tick) {
+  return (0 == m_avg) ? 0 : (tick / m_ticks_per_second * m_avg);  // running total after `tick` ticks, truncated
+}
+
+uint64_t TokenBucketThrottle::tokens_this_tick() {
+  // no average rate configured: nothing to add
+  if (0 == m_avg) {
+    return 0;
+  }
+  // start over once a full second worth of ticks has been handed out
+  const uint64_t previous = (m_current_tick >= m_ticks_per_second) ? 0 : m_current_tick;
+  m_current_tick = previous + 1;
+  // the difference of two running totals absorbs the per-tick rounding
+  return tokens_filled(m_current_tick) - tokens_filled(previous);
+}
+
+void TokenBucketThrottle::add_tokens() {
+  list<Blocker> tmp_blockers;
+  {
+    std::lock_guard lock(m_lock);
+    // put tokens into bucket; burst_ratio is max/avg when max exceeds avg, else 1.
+    double burst_ratio = 1.0;
+    if (m_throttle.max > m_avg && m_avg > 0){
+      burst_ratio = (double)m_throttle.max/m_avg;
+    }
+    m_throttle.put(tokens_this_tick(), burst_ratio);
+    if (0 == m_avg || 0 == m_throttle.max)
+      tmp_blockers.swap(m_blockers);  // no limit in effect: release every blocker
+    // check the m_blockers from head to tail, if blocker can get
+    // enough tokens, let it go.
+    while (!m_blockers.empty()) {
+      Blocker &blocker = m_blockers.front();
+      uint64_t got = m_throttle.get(blocker.tokens_requested);
+      if (got == blocker.tokens_requested) {
+        // got enough tokens for front.
+        tmp_blockers.splice(tmp_blockers.end(), m_blockers, m_blockers.begin());
+      } else {
+        // out of tokens: keep what is still owed for the next tick and stop.
+        blocker.tokens_requested -= got;
+        break;
+      }
+    }
+  }
+  // run the released callbacks after m_lock has been dropped.
+  for (auto b : tmp_blockers) {
+    b.ctx->complete(0);
+  }
+}
+
+void TokenBucketThrottle::schedule_timer() {
+  // Re-arm the refill timer, then add this tick's tokens (timer lock must be held).
+  // add_event_after() deletes the callback and returns nullptr once the timer
+  // is shutting down, so keep its return value rather than the raw pointer:
+  // cancel_timer() must never be handed a dangling Context.
+  m_token_ctx = m_timer->add_event_after(
+      m_schedule_tick, new LambdaContext([this](int r) { schedule_timer(); }));
+  add_tokens();
+}
+
+void TokenBucketThrottle::cancel_timer() {
+  m_timer->cancel_event(m_token_ctx);  // the timer deletes the event if it is still queued
+}
diff --git a/src/common/Throttle.h b/src/common/Throttle.h
new file mode 100644
index 000000000..e190b946c
--- /dev/null
+++ b/src/common/Throttle.h
@@ -0,0 +1,469 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_THROTTLE_H
+#define CEPH_THROTTLE_H
+
+#include <atomic>
+#include <chrono>
+#include <iostream>
+#include <list>
+#include <map>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "common/ThrottleInterface.h"
+#include "common/Timer.h"
+#include "common/convenience.h"
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+#include "crimson/common/perf_counters_collection.h"
+#else
+#include "common/perf_counters_collection.h"
+#endif
+
+/**
+ * @class Throttle
+ * Throttles the maximum number of active requests.
+ *
+ * This class bounds the number of slots that may be taken at one time.
+ * Requests that would push the count past the limit are delayed until enough
+ * slots are put back (see _should_wait(); a max of 0 disables the limit).
+ */
+class Throttle final : public ThrottleInterface {
+  CephContext *cct;
+  const std::string name;
+  PerfCountersRef logger;
+  std::atomic<int64_t> count = { 0 }, max = { 0 };  // slots taken / limit (0 = no limit)
+  std::mutex lock;
+  std::list<std::condition_variable> conds;  // presumably one per blocked caller -- see Throttle.cc
+  const bool use_perf;
+
+public:
+  Throttle(CephContext *cct, const std::string& n, int64_t m = 0, bool _use_perf = true);
+  ~Throttle() override;
+
+private:
+  void _reset_max(int64_t m);
+  bool _should_wait(int64_t c) const {  // true when taking c more slots has to block
+    int64_t m = max;
+    int64_t cur = count;
+    return
+      m &&
+      ((c <= m && cur + c > m) || // normally stay under max
+       (c >= m && cur > m));     // except for large c
+  }
+
+  bool _wait(int64_t c, std::unique_lock<std::mutex>& l);
+
+public:
+  /**
+   * gets the number of currently taken slots
+   * @returns the number of taken slots
+   */
+  int64_t get_current() const {
+    return count;
+  }
+
+  /**
+   * get the max number of slots
+   * @returns the max number of slots
+   */
+  int64_t get_max() const { return max; }
+
+  /**
+   * return true if at least half of the max number of slots is taken
+   */
+  bool past_midpoint() const {
+    return count >= max / 2;
+  }
+
+  /**
+   * set the new max number, and wait until the number of taken slots drains
+   * and drops below this limit.
+   *
+   * @param m the new max number
+   * @returns true if this method is blocked, false if it returns immediately
+   */
+  bool wait(int64_t m = 0);
+
+  /**
+   * take the specified number of slots from the stock regardless of the throttling
+   * @param c number of slots to take
+   * @returns the total number of taken slots
+   */
+  int64_t take(int64_t c = 1) override;
+
+  /**
+   * get the specified amount of slots from the stock, but will wait if the
+   * total number taken by consumer would exceed the maximum number.
+   * @param c number of slots to get
+   * @param m new maximum number to set, ignored if it is 0
+   * @returns true if this request is blocked due to the throttling, false
+   * otherwise
+   */
+  bool get(int64_t c = 1, int64_t m = 0);
+
+  /**
+   * the non-blocking version of @p get()
+   * @returns true if it successfully got the requested amount,
+   * or false if it would block.
+   */
+  bool get_or_fail(int64_t c = 1);
+
+  /**
+   * put slots back to the stock
+   * @param c number of slots to return
+   * @returns number of requests being held after this
+   */
+  int64_t put(int64_t c = 1) override;
+  /**
+   * reset the number of taken slots (presumably to zero -- see Throttle.cc)
+   */
+  void reset();
+  // set a new max number of slots; takes the lock and calls _reset_max()
+  void reset_max(int64_t m) {
+    std::lock_guard l(lock);
+    _reset_max(m);
+  }
+};
+
+/**
+ * BackoffThrottle
+ *
+ * Creates a throttle which gradually induces delays when get() is called
+ * based on params low_threshold, high_threshold, expected_throughput,
+ * high_multiple, and max_multiple.
+ *
+ * In [0, low_threshold), we want no delay.
+ *
+ * In [low_threshold, high_threshold), delays should be injected based
+ * on a line from 0 at low_threshold to
+ * high_multiple * (1/expected_throughput) at high_threshold.
+ *
+ * In [high_threshold, 1), we want delays injected based on a line from
+ * (high_multiple * (1/expected_throughput)) at high_threshold to
+ * (high_multiple * (1/expected_throughput)) +
+ * (max_multiple * (1/expected_throughput)) at 1.
+ * NOTE(review): the third formula below gives m, not e + m, at r = 1 -- confirm.
+ * Let the current throttle ratio (current/max) be r, low_threshold be l,
+ * high_threshold be h, high_delay (high_multiple / expected_throughput) be e,
+ * and max_delay (max_multiple / expected_throughput) be m.
+ *
+ * delay = 0, r \in [0, l)
+ * delay = (r - l) * (e / (h - l)), r \in [l, h)
+ * delay = e + (r - h)((m - e)/(1 - h)), r \in [h, 1)
+ */
+class BackoffThrottle {
+  const std::string name;
+  PerfCountersRef logger;
+
+  std::mutex lock;
+  using locker = std::unique_lock<std::mutex>;
+
+  unsigned next_cond = 0;  // index into conds handed to the next waiter
+
+  /// allocated once to avoid constantly allocating new ones
+  std::vector<std::condition_variable> conds;
+
+  const bool use_perf;
+
+  /// pointers into conds
+  std::list<std::condition_variable*> waiters;
+
+  std::list<std::condition_variable*>::iterator _push_waiter() {  // queue a waiter on the next cond, round-robin
+    unsigned next = next_cond++;
+    if (next_cond == conds.size())
+      next_cond = 0;
+    return waiters.insert(waiters.end(), &(conds[next]));
+  }
+
+  void _kick_waiters() {  // wake only the waiter at the head of the queue
+    if (!waiters.empty())
+      waiters.front()->notify_all();
+  }
+
+  /// see above, values are in [0, 1].
+  double low_threshold = 0;
+  double high_threshold = 1;
+
+  /// see above, values are in seconds
+  double high_delay_per_count = 0;
+  double max_delay_per_count = 0;
+
+  /// Filled in by set_params
+  double s0 = 0; ///< e / (h - l), l != h, 0 otherwise
+  double s1 = 0; ///< (m - e)/(1 - h), 1 != h, 0 otherwise
+
+  /// limit and amount currently taken
+  uint64_t max = 0;
+  uint64_t current = 0;
+
+  ceph::timespan _get_delay(uint64_t c) const;
+
+public:
+  /**
+   * set_params
+   *
+   * Sets params.  If the params are invalid, returns false
+   * and populates errstream (if non-null) with a user comprehensible
+   * explanation.
+   */
+  bool set_params(
+    double _low_threshold,
+    double _high_threshold,
+    double expected_throughput,
+    double high_multiple,
+    double max_multiple,
+    uint64_t throttle_max,
+    std::ostream *errstream);
+
+  ceph::timespan get(uint64_t c = 1);
+  ceph::timespan wait() {  // same as get(0): takes nothing
+    return get(0);
+  }
+  uint64_t put(uint64_t c = 1);   // gives c back, returns the new current
+  uint64_t take(uint64_t c = 1);  // adds c without waiting, returns the new current
+  uint64_t get_current();
+  uint64_t get_max();
+
+  BackoffThrottle(CephContext *cct, const std::string& n,
+    unsigned expected_concurrency, ///< [in] determines size of conds
+    bool _use_perf = true);
+  ~BackoffThrottle();
+};
+
+
+/**
+ * @class SimpleThrottle
+ * This is a simple way to bound the number of concurrent operations.
+ *
+ * It tracks the first error encountered, and makes it available
+ * when all requests are complete. wait_for_ret() should be called
+ * before the instance is destroyed.
+ *
+ * Re-using the same instance isn't safe if you want to check each set
+ * of operations for errors, since the return value is not reset.
+ */
+class SimpleThrottle {
+public:
+  SimpleThrottle(uint64_t max, bool ignore_enoent);
+  ~SimpleThrottle();         // asserts that no op is in flight and nobody is waiting
+  void start_op();           // blocks while m_max ops are in flight
+  void end_op(int r);        // ends one op; the first r < 0 is remembered
+  bool pending_error() const; // true once an error has been remembered
+  int wait_for_ret();        // blocks until no op is in flight; returns m_ret
+private:
+  mutable std::mutex m_lock;
+  std::condition_variable m_cond;
+  uint64_t m_max;
+  uint64_t m_current = 0;    // ops started but not yet ended
+  int m_ret = 0;             // first remembered error, 0 if none
+  bool m_ignore_enoent;      // when set, -ENOENT is not remembered as an error
+  uint32_t waiters = 0;      // threads blocked in start_op()/wait_for_ret()
+};
+
+
+class OrderedThrottle;
+
+class C_OrderedThrottle : public Context {
+public:
+  C_OrderedThrottle(OrderedThrottle *ordered_throttle, uint64_t tid)
+    : m_ordered_throttle(ordered_throttle), m_tid(tid) {
+  }
+  // Completion context created by OrderedThrottle::start_op() for one op.
+protected:
+  void finish(int r) override;  // forwards (m_tid, r) to OrderedThrottle::finish_op()
+
+private:
+  OrderedThrottle *m_ordered_throttle;
+  uint64_t m_tid;
+};
+
+/**
+ * @class OrderedThrottle
+ * Throttles the maximum number of active requests and completes them in order
+ *
+ * Operations can complete out-of-order but their associated Context callbacks
+ * will be completed in order during invocation of start_op() and wait_for_ret()
+ */
+class OrderedThrottle {
+public:
+  OrderedThrottle(uint64_t max, bool ignore_enoent);
+  ~OrderedThrottle();
+  // start_op() may block for a free slot; end_op() frees the slot again.
+  C_OrderedThrottle *start_op(Context *on_finish);
+  void end_op(int r);
+  // first remembered error: a quick check, and a blocking wait for all ops.
+  bool pending_error() const;
+  int wait_for_ret();
+
+protected:
+  friend class C_OrderedThrottle;
+  // marks tid as finished with result r; its callback runs later, in tid order
+  void finish_op(uint64_t tid, int r);
+
+private:
+  struct Result {
+    bool finished;
+    int ret_val;
+    Context *on_finish;
+
+    Result(Context *_on_finish = NULL)
+      : finished(false), ret_val(0), on_finish(_on_finish) {
+    }
+  };
+
+  typedef std::map<uint64_t, Result> TidResult;
+
+  mutable std::mutex m_lock;
+  std::condition_variable m_cond;
+  uint64_t m_max;
+  uint64_t m_current = 0;
+  int m_ret_val = 0;
+  bool m_ignore_enoent;
+
+  uint64_t m_next_tid = 0;      // tid handed to the next start_op()
+  uint64_t m_complete_tid = 0;  // tid whose callback has to run next
+
+  TidResult m_tid_result;
+  // runs ready callbacks in tid order; drops l around each callback
+  void complete_pending_ops(std::unique_lock<std::mutex>& l);
+  uint32_t waiters = 0;
+};
+
+
+class TokenBucketThrottle {  // token-bucket rate limiter refilled from a SafeTimer
+  struct Bucket {
+    CephContext *cct;
+    const std::string name;
+
+    uint64_t remain;     // tokens left in the bucket, at most capacity
+    uint64_t max;        // 0 means get()/put() do nothing
+    uint64_t capacity;   // max * burst_seconds, see set_max()
+    uint64_t available;  // tokens that get() may hand out right now
+
+    Bucket(CephContext *cct, const std::string &name, uint64_t m)
+      : cct(cct), name(name), remain(m), max(m), capacity(m), available(m) {}
+
+    uint64_t get(uint64_t c);
+    uint64_t put(uint64_t tokens, double burst_ratio);
+    void set_max(uint64_t max, uint64_t burst_seconds);
+  };
+
+  struct Blocker {
+    uint64_t tokens_requested;  // tokens still owed to this request
+    Context *ctx;               // completed once the request may proceed
+
+    Blocker(uint64_t _tokens_requested, Context* _ctx)
+      : tokens_requested(_tokens_requested), ctx(_ctx) {}
+  };
+
+  CephContext *m_cct;
+  const std::string m_name;
+  Bucket m_throttle;
+  uint64_t m_burst = 0;
+  uint64_t m_avg = 0;
+  SafeTimer *m_timer;
+  ceph::mutex *m_timer_lock;
+  Context *m_token_ctx = nullptr;
+  std::list<Blocker> m_blockers;
+  ceph::mutex m_lock;
+
+  // minimum of the filling period, in milliseconds.
+  uint64_t m_tick_min = 50;
+  // tokens filling period, its unit is millisecond.
+  uint64_t m_tick = 0;
+  /**
+   * These variables are used to calculate how many tokens need to be put into
+   * the bucket within each tick.
+   *
+   * In actual use, the tokens to be put per tick(m_avg / m_ticks_per_second)
+   * may be a floating point number, but we need an 'uint64_t' to put into the
+   * bucket.
+   *
+   * For example, we set the value of rate to be 950, means 950 iops(or bps).
+   *
+   * In this case, the filling period(m_tick) should be 1000 / 950 = 1.052,
+   * which is too small for the SafeTimer. So we should set the period(m_tick)
+   * to be 50(m_tick_min), and 20 ticks in one second(m_ticks_per_second).
+   * The tokens filled in bucket per tick is 950 / 20 = 47.5, not an integer.
+   *
+   * To resolve this, we use a method called tokens_filled(m_current_tick) to
+   * calculate how many tokens have been put so far (up to m_current_tick):
+   *
+   *   tokens_filled = m_current_tick / m_ticks_per_second * m_avg
+   *
+   * And the difference between two ticks will be the result we expect.
+   *   tokens in tick 1: (1 / 20 * 950) - (0 / 20 * 950) =  47 -   0 = 47
+   *   tokens in tick 2: (2 / 20 * 950) - (1 / 20 * 950) =  95 -  47 = 48
+   *   tokens in tick 3: (3 / 20 * 950) - (2 / 20 * 950) = 142 -  95 = 47
+   *
+   * As a result, the tokens filled in one second are distributed like this:
+   *   tick    | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16|17|18|19|20|
+   *   tokens  |47|48|47|48|47|48|47|48|47|48|47|48|47|48|47|48|47|48|47|48|
+   */
+  uint64_t m_ticks_per_second = 0;
+  uint64_t m_current_tick = 0;
+
+  // period for the bucket filling tokens, its unit is seconds.
+  double m_schedule_tick = 1.0;
+
+public:
+  TokenBucketThrottle(CephContext *cct, const std::string &name,
+                      uint64_t burst, uint64_t avg,
+                      SafeTimer *timer, ceph::mutex *timer_lock);
+
+  ~TokenBucketThrottle();
+
+  const std::string &get_name() {
+    return m_name;
+  }
+  // Queues (t->*mf)(item, flag) for c tokens; takes no lock itself (get() holds m_lock).
+  template <typename T, typename MF, typename I>
+  void add_blocker(uint64_t c, T&& t, MF&& mf, I&& item, uint64_t flag) {
+    auto ctx = new LambdaContext(
+      [t, mf, item=std::forward<I>(item), flag](int) mutable {
+        (t->*mf)(std::forward<I>(item), flag);
+      });
+    m_blockers.emplace_back(c, ctx);
+  }
+  // Asks for c tokens. Returns false to proceed now, true if the request was queued.
+  template <typename T, typename MF, typename I>
+  bool get(uint64_t c, T&& t, MF&& mf, I&& item, uint64_t flag) {
+    bool wait = false;
+    uint64_t got = 0;
+    std::lock_guard lock(m_lock);
+    if (!m_blockers.empty()) {
+      // Keep the order of requests, add item after previous blocked requests.
+      wait = true;
+    } else {
+      if (0 == m_throttle.max || 0 == m_avg)
+        return false;
+      // no limit configured above means no throttling; otherwise try the bucket.
+      got = m_throttle.get(c);
+      if (got < c) {
+        // Not enough tokens, add a blocker for it.
+        wait = true;
+      }
+    }
+    // only the tokens not obtained yet are still owed to the blocker.
+    if (wait) {
+      add_blocker(c - got, std::forward<T>(t), std::forward<MF>(mf),
+                  std::forward<I>(item), flag);
+    }
+
+    return wait;
+  }
+
+  int set_limit(uint64_t average, uint64_t burst, uint64_t burst_seconds);
+  void set_schedule_tick_min(uint64_t tick);
+
+private:
+  uint64_t tokens_filled(double tick);
+  uint64_t tokens_this_tick();
+  void add_tokens();
+  void schedule_timer();
+  void cancel_timer();
+};
+
+#endif
diff --git a/src/common/ThrottleInterface.h b/src/common/ThrottleInterface.h
new file mode 100644
index 000000000..49182a117
--- /dev/null
+++ b/src/common/ThrottleInterface.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+
+class ThrottleInterface {
+public:
+  virtual ~ThrottleInterface() {}
+  /**
+   * take the specified number of slots from the stock regardless of the throttling
+   * @param c number of slots to take
+   * @returns the total number of taken slots
+   */
+  virtual int64_t take(int64_t c = 1) = 0;
+  /**
+   * put slots back to the stock
+   * @param c number of slots to return
+   * @returns number of requests being held after this
+   */
+  virtual int64_t put(int64_t c = 1) = 0;
+};
diff --git a/src/common/Timer.cc b/src/common/Timer.cc
new file mode 100644
index 000000000..48c79d613
--- /dev/null
+++ b/src/common/Timer.cc
@@ -0,0 +1,225 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "Cond.h"
+#include "Timer.h"
+
+
+#define dout_subsys ceph_subsys_timer
+#undef dout_prefix
+#define dout_prefix *_dout << "timer(" << this << ")."
+
+using std::pair;
+
+using ceph::operator <<;
+
+template <class Mutex>
+class CommonSafeTimerThread : public Thread {
+  CommonSafeTimer<Mutex> *parent;  // timer whose loop this thread runs
+public:
+  explicit CommonSafeTimerThread(CommonSafeTimer<Mutex> *s) : parent{s} {}
+  void *entry() override {
+    parent->timer_thread();  // loops until the timer is told to stop
+    return nullptr;
+  }
+};
+
+template <class Mutex>
+CommonSafeTimer<Mutex>::CommonSafeTimer(CephContext *cct_, Mutex &l, bool safe_callbacks)
+  : cct(cct_), lock(l),
+    safe_callbacks(safe_callbacks),
+    thread(NULL),
+    stopping(false)
+{
+}
+
+template <class Mutex>
+CommonSafeTimer<Mutex>::~CommonSafeTimer()
+{
+  ceph_assert(thread == nullptr);  // thread was never started, or shutdown() removed it
+}
+
+template <class Mutex>
+void CommonSafeTimer<Mutex>::init()
+{
+  ldout(cct,10) << "init" << dendl;
+  thread = new CommonSafeTimerThread<Mutex>(this);  // freed again in shutdown()
+  thread->create("safe_timer");  // "safe_timer" is the name passed to the new thread
+}
+
+template <class Mutex>
+void CommonSafeTimer<Mutex>::shutdown()
+{
+  ldout(cct,10) << "shutdown" << dendl;
+  if (!thread) return;  // never started, or already shut down
+  ceph_assert(ceph_mutex_is_locked(lock));
+  cancel_all_events();
+  stopping = true;
+  cond.notify_all();
+  // the timer thread needs the lock to see `stopping` and leave its loop
+  lock.unlock();
+  thread->join();
+  lock.lock();
+  delete thread;
+  thread = nullptr;
+}
+
+template <class Mutex>
+void CommonSafeTimer<Mutex>::timer_thread()
+{
+  std::unique_lock l{lock};
+  ldout(cct,10) << "timer_thread starting" << dendl;
+  while (!stopping) {
+    auto now = clock_t::now();
+    // run every event whose deadline is not after `now`, earliest first
+    while (!schedule.empty()) {
+      auto p = schedule.begin();
+
+      // is the future now?
+      if (p->first > now)
+	break;
+      // unregister before running, so cancel_event() no longer finds it
+      Context *callback = p->second;
+      events.erase(callback);
+      schedule.erase(p);
+      ldout(cct,10) << "timer_thread executing " << callback << dendl;
+      // without safe_callbacks the lock is released around the callback
+      if (!safe_callbacks) {
+	l.unlock();
+	callback->complete(0);
+	l.lock();
+      } else {
+	callback->complete(0);
+      }
+    }
+
+    // recheck stopping if we dropped the lock
+    if (!safe_callbacks && stopping)
+      break;
+    // sleep until the earliest deadline, or until notified if nothing is queued
+    ldout(cct,20) << "timer_thread going to sleep" << dendl;
+    if (schedule.empty()) {
+      cond.wait(l);
+    } else {
+      auto when = schedule.begin()->first;
+      cond.wait_until(l, when);
+    }
+    ldout(cct,20) << "timer_thread awake" << dendl;
+  }
+  ldout(cct,10) << "timer_thread exiting" << dendl;
+}
+
+template <class Mutex>
+Context* CommonSafeTimer<Mutex>::add_event_after(double seconds, Context *callback)
+{
+  return add_event_after(ceph::make_timespan(seconds), callback);  // delegate to the timespan overload
+}
+
+template <class Mutex>
+Context* CommonSafeTimer<Mutex>::add_event_after(ceph::timespan duration, Context *callback)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+  // turn the relative delay into an absolute point on the timer's clock
+  const auto deadline = clock_t::now() + duration;
+  return add_event_at(deadline, callback);
+}
+
+template <class Mutex>
+Context* CommonSafeTimer<Mutex>::add_event_at(CommonSafeTimer<Mutex>::clock_t::time_point when, Context *callback)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+  ldout(cct,10) << __func__ << " " << when << " -> " << callback << dendl;
+  if (stopping) {
+    ldout(cct,5) << __func__ << " already shutdown, event not added" << dendl;
+    delete callback;
+    return nullptr;
+  }
+
+  // Index the event twice: by deadline for the timer thread, and by
+  // callback pointer so that cancel_event() can find it again.
+  const auto slot = schedule.emplace(when, callback);
+  const bool inserted = events.emplace(callback, slot).second;
+
+  /* If you hit this, you tried to insert the same Context* twice. */
+  ceph_assert(inserted);
+
+  /* If the event we have just inserted comes before everything else, we need to
+   * adjust our timeout. */
+  if (slot == schedule.begin())
+    cond.notify_all();
+  return callback;
+}
+
+template <class Mutex>
+Context* CommonSafeTimer<Mutex>::add_event_at(ceph::real_clock::time_point when, Context *callback)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+  // convert from real_clock to mono_clock: take the distance between `when`
+  // and the current real time, round it up to the timer's resolution, and
+  // apply it to the current monotonic time
+  const auto mono_base = ceph::mono_clock::now();
+  const auto remaining = when - ceph::real_clock::now();
+  return add_event_at(
+    mono_base + std::chrono::ceil<clock_t::duration>(remaining), callback);
+}
+
+template <class Mutex>
+bool CommonSafeTimer<Mutex>::cancel_event(Context *callback)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+  // only the pointer value is looked up; callback is not dereferenced here
+  auto p = events.find(callback);
+  if (p == events.end()) {
+    ldout(cct,10) << "cancel_event " << callback << " not found" << dendl;
+    return false;
+  }
+
+  ldout(cct,10) << "cancel_event " << p->second->first << " -> " << callback << dendl;
+  delete p->first;
+  // the event is destroyed above; now drop it from both indexes
+  schedule.erase(p->second);
+  events.erase(p);
+  return true;
+}
+
+template <class Mutex>
+void CommonSafeTimer<Mutex>::cancel_all_events()
+{
+  ldout(cct,10) << "cancel_all_events" << dendl;
+  ceph_assert(ceph_mutex_is_locked(lock));
+  // destroy and unregister one event at a time until none are left
+  while (!events.empty()) {
+    auto p = events.begin();
+    ldout(cct,10) << " cancelled " << p->second->first << " -> " << p->first << dendl;
+    delete p->first;
+    schedule.erase(p->second);
+    events.erase(p);
+  }
+}
+
+template <class Mutex>
+void CommonSafeTimer<Mutex>::dump(const char *caller) const
+{
+  // Log every scheduled (deadline -> callback) pair at debug level 10.
+  // A null `caller` is printed as an empty string.
+  const char *tag = caller ? caller : "";
+  ldout(cct,10) << "dump " << tag << dendl;
+
+  for (const auto& entry : schedule) {
+    ldout(cct,10) << " " << entry.first << "->" << entry.second << dendl;
+  }
+}
+
+template class CommonSafeTimer<ceph::mutex>;
+template class CommonSafeTimer<ceph::fair_mutex>;
diff --git a/src/common/Timer.h b/src/common/Timer.h
new file mode 100644
index 000000000..fb70bad15
--- /dev/null
+++ b/src/common/Timer.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_TIMER_H
+#define CEPH_TIMER_H
+
+#include <map>
+#include "include/common_fwd.h"
+#include "ceph_time.h"
+#include "ceph_mutex.h"
+#include "fair_mutex.h"
+#include <condition_variable>
+
+class Context;
+
+template <class Mutex> class CommonSafeTimerThread;
+
+template <class Mutex>
+class CommonSafeTimer
+{
+  CephContext *cct;
+  Mutex& lock;
+  std::condition_variable_any cond;
+  bool safe_callbacks;
+
+  friend class CommonSafeTimerThread<Mutex>;
+  class CommonSafeTimerThread<Mutex> *thread;
+
+  void timer_thread();
+  void _shutdown();
+
+  using clock_t = ceph::mono_clock;
+  using scheduled_map_t = std::multimap<clock_t::time_point, Context*>;
+  scheduled_map_t schedule;  // events ordered by deadline
+  using event_lookup_map_t = std::map<Context*, scheduled_map_t::iterator>;
+  event_lookup_map_t events;  // callback -> its entry in schedule
+  bool stopping;
+
+  void dump(const char *caller = 0) const;
+
+public:
+  // This class isn't supposed to be copied
+  CommonSafeTimer(const CommonSafeTimer&) = delete;
+  CommonSafeTimer& operator=(const CommonSafeTimer&) = delete;
+
+  /* Safe callbacks determines whether callbacks are called with the lock
+   * held.
+   *
+   * safe_callbacks = true (default option) guarantees that a cancelled
+   * event's callback will never be called.
+   *
+   * Under some circumstances, holding the lock can cause lock cycles.
+   * If you are able to relax requirements on cancelled callbacks, then
+   * setting safe_callbacks = false eliminates the lock cycle issue.
+   * */
+  CommonSafeTimer(CephContext *cct, Mutex &l, bool safe_callbacks=true);
+  virtual ~CommonSafeTimer();
+
+  /* init() creates and starts the timer thread.
+   *
+   * shutdown() cancels all events and stops the timer thread. If a thread is
+   * running it must be called with the lock LOCKED (this is asserted); the
+   * lock is released while the thread is joined and taken again afterwards.
+   * Without a running thread shutdown() only logs. */
+  void init();
+  void shutdown();
+
+  /* Schedule an event in the future
+   * Call with the lock LOCKED; after shutdown, callback is deleted and nullptr returned */
+  Context* add_event_after(ceph::timespan duration, Context *callback);
+  Context* add_event_after(double seconds, Context *callback);
+  Context* add_event_at(clock_t::time_point when, Context *callback);
+  Context* add_event_at(ceph::real_clock::time_point when, Context *callback);
+  /* Cancel an event.
+   * Call with the lock LOCKED
+   *
+   * Returns true if the callback was cancelled; the callback is then deleted.
+   * Returns false if the callback is not (or no longer) scheduled.
+   */
+  bool cancel_event(Context *callback);
+
+  /* Cancel all events.
+   * Call with the lock LOCKED
+   *
+   * When this function returns, all events have been cancelled, and there are no
+   * more in progress.
+   */
+  void cancel_all_events();
+
+};
+
+extern template class CommonSafeTimer<ceph::mutex>;
+extern template class CommonSafeTimer<ceph::fair_mutex>;
+using SafeTimer = class CommonSafeTimer<ceph::mutex>;
+
+#endif
diff --git a/src/common/TracepointProvider.cc b/src/common/TracepointProvider.cc
new file mode 100644
index 000000000..38529f3df
--- /dev/null
+++ b/src/common/TracepointProvider.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/TracepointProvider.h"
+#include "common/config.h"
+
+// Registers this provider as a config observer on cct and immediately
+// evaluates the current value of config_key through verify_config().
+//
+// m_config_keys is the null-terminated key list handed back by
+// get_tracked_conf_keys(); it tracks exactly one key.
+TracepointProvider::TracepointProvider(CephContext *cct, const char *library,
+                                       const char *config_key)
+  : m_cct(cct), m_library(library), m_config_keys{config_key, nullptr}
+{
+  m_cct->_conf.add_observer(this);
+  verify_config(m_cct->_conf);
+}
+
+// Stops observing config changes, then drops our dlopen() reference if the
+// library was ever loaded.
+TracepointProvider::~TracepointProvider() {
+  m_cct->_conf.remove_observer(this);
+  if (m_handle != nullptr) {
+    dlclose(m_handle);
+  }
+}
+
+// Config-observer callback: re-evaluates the tracked option, but only when
+// it is among the keys reported as changed.
+void TracepointProvider::handle_conf_change(
+    const ConfigProxy& conf, const std::set<std::string> &changed) {
+  const bool tracked_key_changed =
+    changed.find(m_config_keys[0]) != changed.end();
+  if (!tracked_key_changed) {
+    return;
+  }
+  verify_config(conf);
+}
+
+// Loads the provider library once the tracked option reads "true".
+//
+// Does nothing if the library is already loaded, if the option cannot be
+// read, or if its value is anything other than exactly "true".  A failed
+// dlopen() trips ceph_assert.
+void TracepointProvider::verify_config(const ConfigProxy& conf) {
+  std::lock_guard locker(m_lock);
+  if (m_handle != nullptr) {
+    // already loaded; nothing below ever unloads it
+    return;
+  }
+
+  char value[10];
+  char *value_ptr = value;
+  const bool lookup_ok =
+    conf.get_val(m_config_keys[0], &value_ptr, sizeof(value)) == 0;
+  // comparing 5 bytes includes the terminating NUL, so this is an exact match
+  if (!lookup_ok || strncmp(value, "true", 5) != 0) {
+    return;
+  }
+
+  m_handle = dlopen(m_library.c_str(), RTLD_NOW | RTLD_NODELETE);
+  ceph_assert(m_handle);
+}
+
diff --git a/src/common/TracepointProvider.h b/src/common/TracepointProvider.h
new file mode 100644
index 000000000..fe447677c
--- /dev/null
+++ b/src/common/TracepointProvider.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TRACEPOINT_PROVIDER_H
+#define CEPH_TRACEPOINT_PROVIDER_H
+
+#include "common/ceph_context.h"
+#include "common/config_obs.h"
+#include "common/ceph_mutex.h"
+#include "include/dlfcn_compat.h"
+
+// Config observer that dlopen()s a tracepoint provider library the first
+// time its controlling config option reads "true" (see verify_config()).
+// The handle is released with dlclose() in the destructor.
+class TracepointProvider : public md_config_obs_t {
+public:
+  // Names the shared library to load and the config option that enables it.
+  struct Traits {
+    const char *library;
+    const char *config_key;
+
+    Traits(const char *library, const char *config_key)
+      : library(library), config_key(config_key) {
+    }
+  };
+
+  // Owns one heap-allocated TracepointProvider for its whole lifetime.
+  class Singleton {
+  public:
+    Singleton(CephContext *cct, const char *library, const char *config_key)
+      : tracepoint_provider(new TracepointProvider(cct, library, config_key)) {
+    }
+    ~Singleton() {
+      delete tracepoint_provider;
+    }
+
+    // True once the provider library has been loaded.
+    inline bool is_enabled() const {
+      return tracepoint_provider->m_handle != nullptr;
+    }
+  private:
+    TracepointProvider *tracepoint_provider;
+  };
+
+  // Singleton whose library/config key come from a compile-time Traits
+  // object, giving every provider a distinct type.
+  template <const Traits &traits>
+  class TypedSingleton : public Singleton {
+  public:
+    explicit TypedSingleton(CephContext *cct)
+      : Singleton(cct, traits.library, traits.config_key) {
+    }
+  };
+
+  TracepointProvider(CephContext *cct, const char *library,
+                     const char *config_key);
+  ~TracepointProvider() override;
+
+  // Neither copyable nor movable.  The assignment operators are declared
+  // with the conventional reference return type even though they are
+  // deleted.
+  TracepointProvider(const TracepointProvider&) = delete;
+  TracepointProvider& operator =(const TracepointProvider&) = delete;
+  TracepointProvider(TracepointProvider&&) = delete;
+  TracepointProvider& operator =(TracepointProvider&&) = delete;
+
+  // Looks up or creates the per-CephContext TypedSingleton for `traits`,
+  // keyed by the library name.  Compiles to a no-op unless WITH_LTTNG is
+  // defined.
+  template <const Traits &traits>
+  static void initialize(CephContext *cct) {
+#ifdef WITH_LTTNG
+     cct->lookup_or_create_singleton_object<TypedSingleton<traits>>(
+       traits.library, false, cct);
+#endif
+  }
+
+protected:
+  // Null-terminated list holding the single tracked config key.
+  const char** get_tracked_conf_keys() const override {
+    return m_config_keys;
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+			  const std::set <std::string> &changed) override;
+
+private:
+  CephContext *m_cct;
+  std::string m_library;
+  mutable const char* m_config_keys[2];
+
+  ceph::mutex m_lock = ceph::make_mutex("TracepointProvider::m_lock");
+  void* m_handle = nullptr;   // dlopen() handle; nullptr until loaded
+
+  void verify_config(const ConfigProxy& conf);
+};
+
+#endif // CEPH_TRACEPOINT_PROVIDER_H
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
new file mode 100644
index 000000000..32a1ab472
--- /dev/null
+++ b/src/common/TrackedOp.cc
@@ -0,0 +1,513 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ * Copyright 2013 Inktank
+ */
+
+#include "TrackedOp.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_optracker
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+using std::list;
+using std::make_pair;
+using std::ostream;
+using std::pair;
+using std::set;
+using std::string;
+using std::stringstream;
+
+using ceph::Formatter;
+
+// Writes the op-tracker tag that dout_prefix puts in front of every debug
+// line from this file, and hands the stream back for chaining.
+static ostream& _prefix(std::ostream* _dout)
+{
+  std::ostream& out = *_dout;
+  out << "-- op tracker -- ";
+  return out;
+}
+
+// Asks the service thread to exit: drops every op still waiting in the
+// external queue and raises the flag that entry() checks on each pass.
+void OpHistoryServiceThread::break_thread() {
+  std::lock_guard<ceph::spinlock> guard(queue_spinlock);
+  _external_queue.clear();
+  _break_thread = true;
+}
+
+// Service thread body: repeatedly takes everything queued by insert_op()
+// and feeds it to OpHistory::_insert_delayed() outside the spinlock.
+//
+// While the queue stays empty the thread sleeps with a growing interval
+// (starting at 1000us, quadrupled while below 128000us); any work resets it
+// to 1000us.  Returns once break_thread() has set _break_thread.
+void* OpHistoryServiceThread::entry() {
+  int sleep_time = 1000;
+  list<pair<utime_t, TrackedOpRef>> internal_queue;
+  while (1) {
+    queue_spinlock.lock();
+    if (_break_thread) {
+      queue_spinlock.unlock();
+      break;
+    }
+    // grab the whole backlog in O(1) so the spinlock is held only briefly
+    internal_queue.swap(_external_queue);
+    queue_spinlock.unlock();
+    if (internal_queue.empty()) {
+      usleep(sleep_time);
+      if (sleep_time < 128000) {
+        sleep_time <<= 2;
+      }
+    } else {
+      sleep_time = 1000;
+    }
+
+    while (!internal_queue.empty()) {
+      // bind by reference: copying the pair would cost an extra atomic
+      // ref-count round trip per op
+      const auto& queued = internal_queue.front();
+      _ophistory->_insert_delayed(queued.first, queued.second);
+      internal_queue.pop_front();
+    }
+  }
+  return nullptr;
+}
+
+
+// Stops the history service thread and discards all recorded history.
+void OpHistory::on_shutdown()
+{
+  // stop and join the service thread first; it is the caller of
+  // _insert_delayed() seen in this file
+  opsvc.break_thread();
+  opsvc.join();
+  std::lock_guard history_lock(ops_history_lock);
+  arrived.clear();
+  duration.clear();
+  slow_op.clear();
+  // from now on insert() and _insert_delayed() return without recording
+  shutdown = true;
+}
+
+// Records a finished op in the history indexes (by duration and by arrival
+// time, plus the slow-op set when it reached the slow threshold), then
+// trims the history.  Ignored after on_shutdown().
+void OpHistory::_insert_delayed(const utime_t& now, TrackedOpRef op)
+{
+  std::lock_guard history_lock(ops_history_lock);
+  if (shutdown) {
+    return;
+  }
+
+  const double elapsed = op->get_duration();
+  duration.emplace(elapsed, op);
+  arrived.emplace(op->get_initiated(), op);
+
+  const bool is_slow = elapsed >= history_slow_op_threshold.load();
+  if (is_slow) {
+    slow_op.emplace(op->get_initiated(), op);
+    logger->inc(l_osd_slow_op_count);
+  }
+
+  cleanup(now);
+}
+
+// Trims the history in three passes: by age (history_duration), by count
+// (history_size, shortest durations go first) and, for the slow-op set, by
+// count (history_slow_op_size, oldest go first).
+void OpHistory::cleanup(utime_t now)
+{
+  // 1) age out ops that arrived more than history_duration ago
+  while (!arrived.empty()) {
+    const auto oldest = arrived.begin();
+    const bool expired =
+      now - oldest->first > (double)(history_duration.load());
+    if (!expired) {
+      break;
+    }
+    duration.erase(make_pair(oldest->second->get_duration(),
+                             oldest->second));
+    arrived.erase(oldest);
+  }
+
+  // 2) enforce the size cap; duration is ordered, so begin() is the
+  //    shortest op
+  while (duration.size() > history_size.load()) {
+    const auto shortest = duration.begin();
+    arrived.erase(make_pair(shortest->second->get_initiated(),
+                            shortest->second));
+    duration.erase(shortest);
+  }
+
+  // 3) enforce the slow-op cap, dropping the earliest-initiated first
+  while (slow_op.size() > history_slow_op_size.load()) {
+    const auto earliest = slow_op.begin();
+    slow_op.erase(make_pair(earliest->second->get_initiated(),
+                            earliest->second));
+  }
+}
+
+// Dumps the retained history into f as an "op_history" object: the size and
+// duration limits followed by an "ops" array.  Ops are listed longest-first
+// when by_duration is set, otherwise in arrival order.  Ops for which
+// filter_out(filters) returns false are left out.
+void OpHistory::dump_ops(utime_t now, Formatter *f, set<string> filters, bool by_duration)
+{
+  std::lock_guard history_lock(ops_history_lock);
+  cleanup(now);
+
+  // emits one op, unless the filters reject it
+  const auto emit_op = [&](const TrackedOpRef& op) {
+    if (op->filter_out(filters)) {
+      f->open_object_section("op");
+      op->dump(now, f, OpTracker::default_dumper);
+      f->close_section();
+    }
+  };
+
+  f->open_object_section("op_history");
+  f->dump_int("size", history_size.load());
+  f->dump_int("duration", history_duration.load());
+
+  f->open_array_section("ops");
+  if (by_duration) {
+    for (auto it = duration.rbegin(); it != duration.rend(); ++it) {
+      emit_op(it->second);
+    }
+  } else {
+    for (const auto& entry : arrived) {
+      emit_op(entry.second);
+    }
+  }
+  f->close_section();  // ops
+
+  f->close_section();  // op_history
+}
+
+// One shard of OpTracker's in-flight bookkeeping: an intrusive list of
+// TrackedOps together with the mutex that guards it.
+struct ShardedTrackingData {
+  ceph::mutex ops_in_flight_lock_sharded;
+  TrackedOp::tracked_op_list_t ops_in_flight_sharded;
+  // lock_name is the name given to this shard's mutex
+  explicit ShardedTrackingData(string lock_name)
+    : ops_in_flight_lock_sharded(ceph::make_mutex(lock_name)) {}
+};
+
+// Creates num_shards in-flight shards, each with its own named mutex.
+// complaint_time and log_threshold start at 0 until
+// set_complaint_and_threshold() is called.
+OpTracker::OpTracker(CephContext *cct_, bool tracking, uint32_t num_shards):
+  seq(0),
+  history(cct_),
+  num_optracker_shards(num_shards),
+  complaint_time(0), log_threshold(0),
+  tracking_enabled(tracking),
+  cct(cct_) {
+    for (uint32_t i = 0; i < num_optracker_shards; i++) {
+      // 34 = strlen("OpTracker::ShardedLock") (22) + ':' + up to 10 digits
+      // of a uint32_t + terminating NUL
+      char lock_name[34] = {0};
+      snprintf(lock_name, sizeof(lock_name), "%s:%" PRIu32, "OpTracker::ShardedLock", i);
+      ShardedTrackingData* one_shard = new ShardedTrackingData(lock_name);
+      sharded_in_flight_list.push_back(one_shard);
+    }
+}
+
+// Tears down the shards back to front: unlinks any ops still on a shard's
+// in-flight list, then frees the shard.
+OpTracker::~OpTracker() {
+  while (!sharded_in_flight_list.empty()) {
+    ShardedTrackingData* sdata = sharded_in_flight_list.back();
+    ceph_assert(NULL != sdata);
+    while (!sdata->ops_in_flight_sharded.empty()) {
+      {
+        // pop_back() only unlinks the op from the intrusive list
+        std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+        sdata->ops_in_flight_sharded.pop_back();
+      }
+    }
+    ceph_assert((sharded_in_flight_list.back())->ops_in_flight_sharded.empty());
+    delete sharded_in_flight_list.back();
+    sharded_in_flight_list.pop_back();
+  }
+}
+
+// Dumps the op history into f (see OpHistory::dump_ops).
+// Returns false, without touching f, when tracking is disabled.
+bool OpTracker::dump_historic_ops(Formatter *f, bool by_duration, set<string> filters)
+{
+  if (tracking_enabled) {
+    std::shared_lock l{lock};
+    history.dump_ops(ceph_clock_now(), f, filters, by_duration);
+    return true;
+  }
+  return false;
+}
+
+// Dumps the retained slow ops into f as an "OpHistory slow ops" object: the
+// retention limits followed by an "Ops" array.  Ops for which
+// filter_out(filters) returns false are left out.
+void OpHistory::dump_slow_ops(utime_t now, Formatter *f, set<string> filters)
+{
+  std::lock_guard history_lock(ops_history_lock);
+  cleanup(now);
+
+  f->open_object_section("OpHistory slow ops");
+  f->dump_int("num to keep", history_slow_op_size.load());
+  f->dump_int("threshold to keep", history_slow_op_threshold.load());
+
+  f->open_array_section("Ops");
+  for (const auto& entry : slow_op) {
+    const TrackedOpRef& op = entry.second;
+    if (op->filter_out(filters)) {
+      f->open_object_section("Op");
+      op->dump(now, f, OpTracker::default_dumper);
+      f->close_section();
+    }
+  }
+  f->close_section();  // Ops
+
+  f->close_section();  // OpHistory slow ops
+}
+
+// Dumps the slow-op history into f (see OpHistory::dump_slow_ops).
+// Returns false, without touching f, when tracking is disabled.
+bool OpTracker::dump_historic_slow_ops(Formatter *f, set<string> filters)
+{
+  if (tracking_enabled) {
+    std::shared_lock l{lock};
+    history.dump_slow_ops(ceph_clock_now(), f, filters);
+    return true;
+  }
+  return false;
+}
+
+// Dumps the ops currently in flight into f as an "ops_in_flight" object.
+//
+// print_only_blocked: only report ops older than complaint_time; the total
+//                     is then emitted as "num_blocked_ops" next to
+//                     "complaint_time", otherwise as "num_ops".
+// filters:            ops for which filter_out(filters) returns false are
+//                     skipped and not counted.
+// count_only:         emit only the counter, not the per-op "ops" array.
+// lambda:             dumper invoked for each op's "type_data" section.
+//
+// Returns false, without touching f, when tracking is disabled.
+bool OpTracker::dump_ops_in_flight(Formatter *f, bool print_only_blocked, set<string> filters, bool count_only, dumper lambda)
+{
+  if (!tracking_enabled)
+    return false;
+
+  std::shared_lock l{lock};
+  f->open_object_section("ops_in_flight"); // overall dump
+  uint64_t total_ops_in_flight = 0;
+
+  if (!count_only) {
+    f->open_array_section("ops"); // list of TrackedOps
+  }
+
+  utime_t now = ceph_clock_now();
+  for (uint32_t i = 0; i < num_optracker_shards; i++) {
+    ShardedTrackingData* sdata = sharded_in_flight_list[i];
+    ceph_assert(NULL != sdata); 
+    std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+    for (auto& op : sdata->ops_in_flight_sharded) {
+      // stop scanning this shard at the first op that is not old enough;
+      // ops are appended on registration, so presumably the rest of the
+      // shard is younger still -- confirm initiation times are monotonic
+      if (print_only_blocked && (now - op.get_initiated() <= complaint_time))
+        break;
+      if (!op.filter_out(filters))
+        continue;
+      
+      if (!count_only) {
+        f->open_object_section("op");
+        op.dump(now, f, lambda);
+        f->close_section(); // this TrackedOp
+      }
+
+      total_ops_in_flight++;
+    }
+  }
+
+  if (!count_only) {
+    f->close_section(); // list of TrackedOps
+  }
+
+  if (print_only_blocked) {
+    f->dump_float("complaint_time", complaint_time);
+    f->dump_int("num_blocked_ops", total_ops_in_flight);
+  } else {
+    f->dump_int("num_ops", total_ops_in_flight);
+  }
+  f->close_section(); // overall dump
+  return true;
+}
+
+// Assigns the op the next sequence number and appends it to the in-flight
+// list of shard (seq % num_optracker_shards).
+// Returns false, leaving the op untouched, when tracking is disabled.
+bool OpTracker::register_inflight_op(TrackedOp *i)
+{
+  if (!tracking_enabled)
+    return false;
+
+  std::shared_lock l{lock};
+  uint64_t current_seq = ++seq;
+  uint32_t shard_index = current_seq % num_optracker_shards;
+  ShardedTrackingData* sdata = sharded_in_flight_list[shard_index];
+  ceph_assert(NULL != sdata);
+  {
+    std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+    sdata->ops_in_flight_sharded.push_back(*i);
+    // unregister_inflight_op() recomputes the shard from this value
+    i->seq = current_seq;
+  }
+  return true;
+}
+
+// Unlinks the op from the in-flight list of the shard it was registered on;
+// the shard is recomputed from the op's sequence number.
+void OpTracker::unregister_inflight_op(TrackedOp* const i)
+{
+  // caller checks;
+  ceph_assert(i->state);
+
+  ShardedTrackingData* const shard =
+    sharded_in_flight_list[i->seq % num_optracker_shards];
+  ceph_assert(NULL != shard);
+
+  std::lock_guard locker(shard->ops_in_flight_lock_sharded);
+  auto& in_flight = shard->ops_in_flight_sharded;
+  in_flight.erase(in_flight.iterator_to(*i));
+}
+
+// Hands a finished op over to the history, stamped with the current time.
+// Ownership of the reference moves into the history.
+void OpTracker::record_history_op(TrackedOpRef&& i)
+{
+  std::shared_lock l{lock};
+  history.insert(ceph_clock_now(), std::move(i));
+}
+
+// Snapshots all in-flight ops and, if the oldest one is at least
+// complaint_time old, feeds them to `visit` with no lock held.
+//
+// oldest_secs: written with the age of the oldest in-flight op whenever at
+//              least one op is in flight.
+// visit:       called per op; returning false stops the walk.
+//
+// Returns false when tracking is disabled, when nothing is in flight, or
+// when the oldest op is younger than complaint_time; true otherwise.
+bool OpTracker::visit_ops_in_flight(utime_t* oldest_secs,
+				    std::function<bool(TrackedOp&)>&& visit)
+{
+  if (!tracking_enabled)
+    return false;
+
+  const utime_t now = ceph_clock_now();
+  utime_t oldest_op = now;
+  // single representation of all inflight operations reunified
+  // from OpTracker's shards. TrackedOpRef extends the lifetime
+  // to carry the ops outside of the critical section, and thus
+  // allows to call the visitor without any lock being held.
+  // This simplifies the contract on API at the price of plenty
+  // additional moves and atomic ref-counting. This seems OK as
+  // `visit_ops_in_flight()` is definitely not intended for any
+  // hot path.
+  std::vector<TrackedOpRef> ops_in_flight;
+
+  std::shared_lock l{lock};
+  for (const auto sdata : sharded_in_flight_list) {
+    ceph_assert(sdata);
+    std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+    if (!sdata->ops_in_flight_sharded.empty()) {
+      // only the front of each shard is examined when looking for the
+      // oldest op
+      utime_t oldest_op_tmp =
+	sdata->ops_in_flight_sharded.front().get_initiated();
+      if (oldest_op_tmp < oldest_op) {
+        oldest_op = oldest_op_tmp;
+      }
+    }
+    std::transform(std::begin(sdata->ops_in_flight_sharded),
+                   std::end(sdata->ops_in_flight_sharded),
+                   std::back_inserter(ops_in_flight),
+                   [] (TrackedOp& op) { return TrackedOpRef(&op); });
+  }
+  if (ops_in_flight.empty())
+    return false;
+  *oldest_secs = now - oldest_op;
+  dout(10) << "ops_in_flight.size: " << ops_in_flight.size()
+           << "; oldest is " << *oldest_secs
+           << " seconds old" << dendl;
+
+  if (*oldest_secs < complaint_time)
+    return false;
+
+  l.unlock();
+  for (auto& op : ops_in_flight) {
+    // Neither `lock` nor `ops_in_flight_lock_sharded` should be held when
+    // calling the visitor. Otherwise `OSD::get_health_metrics()` can
+    // dead-lock due to the `~TrackedOp()` calling `record_history_op()`
+    // or `unregister_inflight_op()`.
+    if (!visit(*op))
+      break;
+  }
+  return true;
+}
+
+// Walks the in-flight ops, counting those older than complaint_time and
+// calling on_warn for the ones that are due a warning.
+//
+// oldest_secs:    forwarded to visit_ops_in_flight(), which writes the age
+//                 of the oldest in-flight op.
+// num_slow_ops:   optional out-param, number of slow ops seen.
+// num_warned_ops: optional out-param, number of ops passed to on_warn.
+// on_warn:        called for each slow op whose warning is due; at most
+//                 log_threshold ops are warned about per call.
+//
+// Ops with warn_interval_multiplier == 0 (see TrackedOp::mark_nowarn) are
+// neither counted nor warned about.  Either out-pointer may be null.
+// Returns whatever visit_ops_in_flight() returns.
+bool OpTracker::with_slow_ops_in_flight(utime_t* oldest_secs,
+					int* num_slow_ops,
+					int* num_warned_ops,
+					std::function<void(TrackedOp&)>&& on_warn)
+{
+  const utime_t now = ceph_clock_now();
+  auto too_old = now;
+  too_old -= complaint_time;
+  int slow = 0;
+  int warned = 0;
+  auto check = [&](TrackedOp& op) {
+    if (op.get_initiated() >= too_old) {
+      // no more slow ops in flight
+      return false;
+    }
+    if (!op.warn_interval_multiplier)
+      return true;
+    slow++;
+    if (warned >= log_threshold) {
+      // enough samples of slow ops
+      return true;
+    }
+    auto time_to_complain = (op.get_initiated() +
+			     complaint_time * op.warn_interval_multiplier);
+    if (time_to_complain >= now) {
+      // complain later if the op is still in flight
+      return true;
+    }
+    // will warn, increase counter
+    warned++;
+    on_warn(op);
+    return true;
+  };
+  if (!visit_ops_in_flight(oldest_secs, check)) {
+    return false;
+  }
+  // each out-param is optional on its own; never dereference a null one
+  if (num_slow_ops) {
+    *num_slow_ops = slow;
+  }
+  if (num_warned_ops) {
+    *num_warned_ops = warned;
+  }
+  return true;
+}
+
+// Builds human-readable warnings for slow in-flight ops.
+//
+// summary:      written with a one-line summary when slow ops were found.
+// warnings:     receives one "slow request ..." line per op warned about.
+// num_slow_ops: optional out-param, number of slow ops found.
+//
+// Every op that is warned about has its warn_interval_multiplier doubled,
+// so the next warning for it is due later.
+// Returns true only when at least one slow op was found.
+bool OpTracker::check_ops_in_flight(std::string* summary,
+				    std::vector<string> &warnings,
+				    int *num_slow_ops)
+{
+  const utime_t now = ceph_clock_now();
+  int warned = 0;
+  utime_t oldest_secs;
+  auto warn_on_slow_op = [&](TrackedOp& op) {
+    stringstream ss;
+    utime_t age = now - op.get_initiated();
+    ss << "slow request " << age << " seconds old, received at "
+       << op.get_initiated() << ": " << op.get_desc()
+       << " currently "
+       << op.state_string();
+    warnings.push_back(ss.str());
+    // only those that have been shown will backoff
+    op.warn_interval_multiplier *= 2;
+  };
+  int slow = 0;
+  if (with_slow_ops_in_flight(&oldest_secs, &slow, &warned, warn_on_slow_op) &&
+      slow > 0) {
+    stringstream ss;
+    ss << slow << " slow requests, "
+       << warned << " included below; oldest blocked for > "
+       << oldest_secs << " secs";
+    *summary = ss.str();
+    if (num_slow_ops) {
+      *num_slow_ops = slow;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Resets h and adds the age, in milliseconds, of every op currently in
+// flight.
+void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
+{
+  h->clear();
+  const utime_t now = ceph_clock_now();
+
+  for (uint32_t shard_no = 0; shard_no != num_optracker_shards; ++shard_no) {
+    ShardedTrackingData* const shard = sharded_in_flight_list[shard_no];
+    ceph_assert(NULL != shard);
+    std::lock_guard locker(shard->ops_in_flight_lock_sharded);
+
+    for (auto& op : shard->ops_in_flight_sharded) {
+      const utime_t age = now - op.get_initiated();
+      const uint32_t age_ms = (long)(age * 1000.0);
+      h->add(age_ms);
+    }
+  }
+}
+
+
+#undef dout_context
+#define dout_context tracker->cct
+
+// Appends (stamp, event) to the op's event list, logs it at debug level 6
+// and then calls the _event_marked() hook.
+// Does nothing while the op is untracked (state == STATE_UNTRACKED).
+void TrackedOp::mark_event(std::string_view event, utime_t stamp)
+{
+  if (!state)
+    return;
+
+  {
+    // `lock` guards the events vector; it is released before logging
+    // because get_desc() may take it again
+    std::lock_guard l(lock);
+    events.emplace_back(stamp, event);
+  }
+  dout(6) << " seq: " << seq
+	  << ", time: " << stamp
+	  << ", event: " << event
+	  << ", op: " << get_desc()
+	  << dendl;
+  _event_marked();
+}
+
+// Dumps the common op fields (description, initiation time, age, duration)
+// into f, then lets `lambda` fill the "type_data" object section.
+void TrackedOp::dump(utime_t now, Formatter *f, OpTracker::dumper lambda) const
+{
+  // Ignore if still in the constructor
+  if (!state)
+    return;
+  f->dump_string("description", get_desc());
+  f->dump_stream("initiated_at") << get_initiated();
+  f->dump_float("age", now - get_initiated());
+  f->dump_float("duration", get_duration());
+  {
+    f->open_object_section("type_data");
+    lambda(*this, f);
+    f->close_section();
+  }
+}
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
new file mode 100644
index 000000000..477f6c959
--- /dev/null
+++ b/src/common/TrackedOp.h
@@ -0,0 +1,435 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 New Dream Network/Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef TRACKEDREQUEST_H_
+#define TRACKEDREQUEST_H_
+
+#include <atomic>
+#include "common/StackStringStream.h"
+#include "common/ceph_mutex.h"
+#include "common/histogram.h"
+#include "common/Thread.h"
+#include "common/Clock.h"
+#include "include/spinlock.h"
+#include "msg/Message.h"
+
+#define OPTRACKER_PREALLOC_EVENTS 20
+
+class TrackedOp;
+class OpHistory;
+
+typedef boost::intrusive_ptr<TrackedOp> TrackedOpRef;
+
+// Background thread that moves finished ops into an OpHistory.
+// Producers queue ops with insert_op(); entry() drains the queue and calls
+// OpHistory::_insert_delayed() for each one.
+class OpHistoryServiceThread : public Thread
+{
+private:
+  // ops queued by insert_op(), waiting to be picked up by entry()
+  std::list<std::pair<utime_t, TrackedOpRef>> _external_queue;
+  OpHistory* _ophistory;
+  // guards _external_queue and _break_thread
+  mutable ceph::spinlock queue_spinlock;
+  bool _break_thread;
+public:
+  explicit OpHistoryServiceThread(OpHistory* parent)
+    : _ophistory(parent),
+      _break_thread(false) { }
+
+  void break_thread();
+  // Queues an op, stamped with `now`, for the service thread.
+  void insert_op(const utime_t& now, TrackedOpRef op) {
+    // scoped guard: emplace_back() allocates, and a throw must not leave
+    // the spinlock held
+    std::lock_guard<ceph::spinlock> guard(queue_spinlock);
+    // `op` is our own by-value copy, so hand it over without another
+    // atomic ref-count round trip
+    _external_queue.emplace_back(now, std::move(op));
+  }
+
+  void *entry() override;
+};
+
+// Perf counter indices for the "osd-slow-ops" logger built in OpHistory's
+// constructor; the first/last entries are the bounds passed to
+// PerfCountersBuilder.
+enum {
+  l_osd_slow_op_first = 1000,
+  l_osd_slow_op_count,
+  l_osd_slow_op_last,
+};
+
+// Bounded history of finished ops, indexed by arrival time and by duration,
+// plus a separate set of slow ops.  Insertions are handed to a service
+// thread (opsvc), which calls _insert_delayed().
+class OpHistory {
+  CephContext* cct = nullptr;
+  std::set<std::pair<utime_t, TrackedOpRef> > arrived;    // by initiation time
+  std::set<std::pair<double, TrackedOpRef> > duration;    // by duration
+  std::set<std::pair<utime_t, TrackedOpRef> > slow_op;    // slow ops, by initiation time
+  // guards the three sets above
+  ceph::mutex ops_history_lock = ceph::make_mutex("OpHistory::ops_history_lock");
+  void cleanup(utime_t now);
+  std::atomic_size_t history_size{0};
+  std::atomic_uint32_t history_duration{0};
+  std::atomic_size_t history_slow_op_size{0};
+  std::atomic_uint32_t history_slow_op_threshold{0};
+  std::atomic_bool shutdown{false};
+  OpHistoryServiceThread opsvc;
+  friend class OpHistoryServiceThread;
+  std::unique_ptr<PerfCounters> logger;
+
+public:
+  // Registers the "osd-slow-ops" perf counters and starts the service
+  // thread.
+  OpHistory(CephContext *c) : cct(c), opsvc(this) {
+    PerfCountersBuilder b(cct, "osd-slow-ops",
+                         l_osd_slow_op_first, l_osd_slow_op_last);
+    b.add_u64_counter(l_osd_slow_op_count, "slow_ops_count",
+                      "Number of operations taking over ten second");
+
+    logger.reset(b.create_perf_counters());
+    cct->get_perfcounters_collection()->add(logger.get());
+
+    opsvc.create("OpHistorySvc");
+  }
+  // Expects on_shutdown() to have emptied the history already.
+  ~OpHistory() {
+    ceph_assert(arrived.empty());
+    ceph_assert(duration.empty());
+    ceph_assert(slow_op.empty());
+    if(logger) {
+      cct->get_perfcounters_collection()->remove(logger.get());
+      logger.reset();
+    }
+  }
+  // Queues an op for the service thread; dropped after on_shutdown().
+  void insert(const utime_t& now, TrackedOpRef op)
+  {
+    if (shutdown)
+      return;
+
+    // `op` is already our own copy; move it on instead of copying again
+    opsvc.insert_op(now, std::move(op));
+  }
+
+  void _insert_delayed(const utime_t& now, TrackedOpRef op);
+  void dump_ops(utime_t now, ceph::Formatter *f, std::set<std::string> filters = {""}, bool by_duration=false);
+  void dump_slow_ops(utime_t now, ceph::Formatter *f, std::set<std::string> filters = {""});
+  void on_shutdown();
+  // Limits applied by cleanup(): maximum number of ops kept and maximum
+  // time since initiation.
+  void set_size_and_duration(size_t new_size, uint32_t new_duration) {
+    history_size = new_size;
+    history_duration = new_duration;
+  }
+  // Maximum number of slow ops kept, and the duration at or above which an
+  // op counts as slow.
+  void set_slow_op_size_and_threshold(size_t new_size, uint32_t new_threshold) {
+    history_slow_op_size = new_size;
+    history_slow_op_threshold = new_threshold;
+  }
+};
+
+struct ShardedTrackingData;
+// Tracks ops while they are in flight (sharded intrusive lists) and keeps a
+// history of finished ones (OpHistory).
+class OpTracker {
+  friend class OpHistory;
+  std::atomic<int64_t> seq = { 0 };   // source of TrackedOp::seq values
+  std::vector<ShardedTrackingData*> sharded_in_flight_list;
+  OpHistory history;
+  uint32_t num_optracker_shards;
+  float complaint_time;   // age beyond which an in-flight op counts as slow
+  int log_threshold;      // max number of ops warned about per check
+  std::atomic<bool> tracking_enabled;
+  ceph::shared_mutex lock = ceph::make_shared_mutex("OpTracker::lock");
+
+public:
+  /// callback that dumps the type-specific part of an op
+  using dumper = std::function<void(const TrackedOp&, Formatter*)>;
+
+  CephContext *cct;
+  OpTracker(CephContext *cct_, bool tracking, uint32_t num_shards);
+      
+  void set_complaint_and_threshold(float time, int threshold) {
+    complaint_time = time;
+    log_threshold = threshold;
+  }
+  void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) {
+    history.set_size_and_duration(new_size, new_duration);
+  }
+  void set_history_slow_op_size_and_threshold(uint32_t new_size, uint32_t new_threshold) {
+    history.set_slow_op_size_and_threshold(new_size, new_threshold);
+  }
+  bool is_tracking() const {
+    return tracking_enabled;
+  }
+  void set_tracking(bool enable) {
+    tracking_enabled = enable;
+  }
+  /// dumper that forwards to TrackedOp::_dump()
+  static void default_dumper(const TrackedOp& op, Formatter* f);
+  bool dump_ops_in_flight(ceph::Formatter *f, bool print_only_blocked = false, std::set<std::string> filters = {""}, bool count_only = false, dumper lambda = default_dumper);
+  bool dump_historic_ops(ceph::Formatter *f, bool by_duration = false, std::set<std::string> filters = {""});
+  bool dump_historic_slow_ops(ceph::Formatter *f, std::set<std::string> filters = {""});
+  bool register_inflight_op(TrackedOp *i);
+  void unregister_inflight_op(TrackedOp *i);
+  void record_history_op(TrackedOpRef&& i);
+
+  void get_age_ms_histogram(pow2_hist_t *h);
+
+  /**
+   * walk through ops in flight
+   *
+   * @param[out] oldest_secs the amount of time since the oldest op was initiated
+   * @param visit a function consuming tracked ops; it returns false
+   *              when it does not want to be fed with more ops
+   * @return True if there are any Ops to warn on, false otherwise
+   */
+  bool visit_ops_in_flight(utime_t* oldest_secs,
+			   std::function<bool(TrackedOp&)>&& visit);
+  /**
+   * walk through slow ops in flight
+   *
+   * @param[out] oldest_secs the amount of time since the oldest op was initiated
+   * @param[out] num_slow_ops total number of slow ops
+   * @param[out] num_warned_ops total number of warned ops
+   * @param on_warn a function called for each slow op that is warned about
+   * @return True if there are any Ops to warn on, false otherwise
+   */
+  bool with_slow_ops_in_flight(utime_t* oldest_secs,
+			       int* num_slow_ops,
+			       int* num_warned_ops,
+			       std::function<void(TrackedOp&)>&& on_warn);
+  /**
+   * Look for Ops which are too old, and insert warning
+   * strings for each Op that is too old.
+   *
+   * @param[out] summary a std::string summarizing slow Ops.
+   * @param[out] warning_strings A std::vector<std::string> reference which is filled
+   * with a warning std::string for each old Op.
+   * @param[out] slow total number of slow ops (optional)
+   * @return True if there are any Ops to warn on, false otherwise.
+   */
+  bool check_ops_in_flight(std::string* summary,
+			   std::vector<std::string> &warning_strings,
+			   int* slow = nullptr);
+
+  void on_shutdown() {
+    history.on_shutdown();
+  }
+  ~OpTracker();
+
+  /**
+   * Create a tracked request of type T from params and start tracking it.
+   *
+   * When tracking is enabled, the op is also marked with the "throttled",
+   * "header_read", "all_read" and "dispatched" events, using the matching
+   * stamps read from params.
+   *
+   * @return a T::Ref owning the new request
+   */
+  template <typename T, typename U>
+  typename T::Ref create_request(U params)
+  {
+    typename T::Ref retval(new T(params, this));
+    retval->tracking_start();
+    if (is_tracking()) {
+      retval->mark_event("throttled", params->get_throttle_stamp());
+      retval->mark_event("header_read", params->get_recv_stamp());
+      retval->mark_event("all_read", params->get_recv_complete_stamp());
+      retval->mark_event("dispatched", params->get_dispatch_stamp());
+    }
+
+    return retval;
+  }
+};
+
+// Base class for an operation followed by an OpTracker.  It is intrusively
+// ref-counted (see get()/put()), records a list of timestamped events, and
+// moves through STATE_UNTRACKED -> STATE_LIVE -> STATE_HISTORY.
+class TrackedOp : public boost::intrusive::list_base_hook<> {
+private:
+  friend class OpHistory;
+  friend class OpTracker;
+
+  // hook linking this op into a shard's in-flight list
+  boost::intrusive::list_member_hook<> tracker_item;
+
+public:
+  typedef boost::intrusive::list<
+  TrackedOp,
+  boost::intrusive::member_hook<
+    TrackedOp,
+    boost::intrusive::list_member_hook<>,
+    &TrackedOp::tracker_item> > tracked_op_list_t;
+
+  // for use when clearing lists.  e.g.,
+  //   ls.clear_and_dispose(TrackedOp::Putter());
+  struct Putter {
+    void operator()(TrackedOp *op) {
+      op->put();
+    }
+  };
+
+protected:
+  OpTracker *tracker;          ///< the tracker we are associated with
+  std::atomic_int nref = {0};  ///< ref count
+
+  utime_t initiated_at;
+
+  /// one timestamped entry of the op's event log
+  struct Event {
+    utime_t stamp;
+    std::string str;
+
+    Event(utime_t t, std::string_view s) : stamp(t), str(s) {}
+
+    int compare(const char *s) const {
+      return str.compare(s);
+    }
+
+    const char *c_str() const {
+      return str.c_str();
+    }
+
+    void dump(ceph::Formatter *f) const {
+      f->dump_stream("time") << stamp;
+      f->dump_string("event", str);
+    }
+  };
+
+  std::vector<Event> events;    ///< list of events and their times
+  mutable ceph::mutex lock = ceph::make_mutex("TrackedOp::lock"); ///< to protect the events list
+  uint64_t seq = 0;        ///< a unique value set by the OpTracker
+
+  uint32_t warn_interval_multiplier = 1; ///< limits output of a given op warning
+
+  enum {
+    STATE_UNTRACKED = 0,
+    STATE_LIVE,
+    STATE_HISTORY
+  };
+  std::atomic<int> state = {STATE_UNTRACKED};
+
+  TrackedOp(OpTracker *_tracker, const utime_t& initiated) :
+    tracker(_tracker),
+    initiated_at(initiated)
+  {
+    events.reserve(OPTRACKER_PREALLOC_EVENTS);
+  }
+
+  /// output any type-specific data you want to get when dump() is called
+  virtual void _dump(ceph::Formatter *f) const {}
+  /// if you want something else to happen when events are marked, implement
+  virtual void _event_marked() {}
+  /// return a unique descriptor of the Op; eg the message it's attached to
+  virtual void _dump_op_descriptor(std::ostream& stream) const = 0;
+  /// called when the last non-OpTracker reference is dropped
+  virtual void _unregistered() {}
+
+  /// return false to leave this op out of a dump; the default keeps every op
+  virtual bool filter_out(const std::set<std::string>& filters) { return true; }
+
+public:
+  ZTracer::Trace osd_trace;
+  ZTracer::Trace pg_trace;
+  ZTracer::Trace store_trace;
+  ZTracer::Trace journal_trace;
+
+  virtual ~TrackedOp() {}
+
+  void get() {
+    ++nref;
+  }
+  /// Drop one reference.  Dropping the last one depends on the state:
+  /// an untracked or historic op is deleted; a live op is marked "done",
+  /// unregistered from the tracker and then either deleted (tracking
+  /// disabled) or handed to the history, which takes over this last
+  /// reference.
+  void put() {
+  again:
+    auto nref_snap = nref.load();
+    if (nref_snap == 1) {
+      switch (state.load()) {
+      case STATE_UNTRACKED:
+	_unregistered();
+	delete this;
+	break;
+
+      case STATE_LIVE:
+	mark_event("done");
+	tracker->unregister_inflight_op(this);
+	_unregistered();
+	if (!tracker->is_tracking()) {
+	  delete this;
+	} else {
+	  state = TrackedOp::STATE_HISTORY;
+	  // add_ref = false: the reference count stays at 1 and is now owned
+	  // by the history
+	  tracker->record_history_op(
+	    TrackedOpRef(this, /* add_ref = */ false));
+	}
+	break;
+
+      case STATE_HISTORY:
+	delete this;
+	break;
+
+      default:
+	ceph_abort();
+      }
+    } else if (!nref.compare_exchange_weak(nref_snap, nref_snap - 1)) {
+      // lost a race with another get()/put(), or a spurious failure: retry
+      goto again;
+    }
+  }
+
+  /// Return the op's description.  It is generated with
+  /// _dump_op_descriptor() on first use, cached, and regenerated after
+  /// reset_desc().
+  std::string get_desc() const {
+    std::string ret;
+    {
+      std::lock_guard l(desc_lock);
+      ret = desc;
+    }
+    if (ret.size() == 0 || want_new_desc.load()) {
+      CachedStackStringStream css;
+      std::scoped_lock l(lock, desc_lock);
+      // re-check under the locks: another thread may have filled it in
+      if (desc.size() && !want_new_desc.load()) {
+        return desc;
+      }
+      _dump_op_descriptor(*css);
+      desc = css->strv();
+      want_new_desc = false;
+      return desc;
+    } else {
+      return ret;
+    }
+  }
+
+private:
+  mutable ceph::mutex desc_lock = ceph::make_mutex("OpTracker::desc_lock");
+  mutable std::string desc;   ///< protected by desc_lock
+  mutable std::atomic<bool> want_new_desc = {false};
+
+public:
+  /// force the next get_desc() to regenerate the description
+  void reset_desc() {
+    want_new_desc = true;
+  }
+
+  void dump_type(Formatter* f) const {
+    return _dump(f);
+  }
+
+  const utime_t& get_initiated() const {
+    return initiated_at;
+  }
+
+  /// Time from initiation to the "done" event if that is the latest event,
+  /// otherwise time from initiation until now.
+  double get_duration() const {
+    std::lock_guard l(lock);
+    if (!events.empty() && events.rbegin()->compare("done") == 0)
+      return events.rbegin()->stamp - get_initiated();
+    else
+      return ceph_clock_now() - get_initiated();
+  }
+
+  void mark_event(std::string_view event, utime_t stamp=ceph_clock_now());
+
+  /// exclude this op from slow-op counting and warnings
+  void mark_nowarn() {
+    warn_interval_multiplier = 0;
+  }
+
+  std::string state_string() const {
+    std::lock_guard l(lock);
+    return _get_state_string();
+  }
+
+  void dump(utime_t now, ceph::Formatter *f, OpTracker::dumper lambda) const;
+
+  /// Register with the tracker; on success record the "initiated" event and
+  /// switch to STATE_LIVE.  The op stays untracked when tracking is
+  /// disabled.
+  void tracking_start() {
+    if (tracker->register_inflight_op(this)) {
+      events.emplace_back(initiated_at, "initiated");
+      state = STATE_LIVE;
+    }
+  }
+
+  // ref counting via intrusive_ptr, with special behavior on final
+  // put for historical op tracking
+  friend void intrusive_ptr_add_ref(TrackedOp *o) {
+    o->get();
+  }
+  friend void intrusive_ptr_release(TrackedOp *o) {
+    o->put();
+  }
+
+protected:
+  /// the text of the latest event, or an empty string if there is none
+  virtual std::string _get_state_string() const {
+    return events.empty() ? std::string() : std::string(events.rbegin()->str);
+  }
+};
+
+// Default dumper: emits the op's type-specific data through its _dump()
+// hook.  Defined after TrackedOp because it needs the complete type.
+inline void OpTracker::default_dumper(const TrackedOp& op, Formatter* f) {
+  op._dump(f);
+}
+
+#endif
diff --git a/src/common/WeightedPriorityQueue.h b/src/common/WeightedPriorityQueue.h
new file mode 100644
index 000000000..cf34709b9
--- /dev/null
+++ b/src/common/WeightedPriorityQueue.h
@@ -0,0 +1,353 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef WP_QUEUE_H
+#define WP_QUEUE_H
+
+#include "OpQueue.h"
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/rbtree.hpp>
+#include <boost/intrusive/avl_set.hpp>
+
+#include "include/ceph_assert.h"
+
+namespace bi = boost::intrusive;
+
+/**
+ * Heterogeneous "less than" between a bare key of type S and a node of
+ * type T that exposes a `key` member. Both argument orders are provided
+ * so it can be handed to intrusive-tree lookups that search by key.
+ */
+template <typename T, typename S>
+class MapKey
+{
+  public:
+  /// true when the bare key orders before the node's key
+  bool operator()(const S needle, const T &node) const
+  {
+    return needle < node.key;
+  }
+  /// true when the node's key orders before the bare key
+  bool operator()(const T &node, const S needle) const
+  {
+    return node.key < needle;
+  }
+};
+
+/**
+ * Disposer functor: frees the pointed-to object with `delete`. Used as
+ * the disposer argument of the intrusive containers' *_and_dispose calls.
+ */
+template <typename T>
+class DelItem
+{
+  public:
+  void operator()(T* victim)
+  {
+    delete victim;
+  }
+};
+
+/**
+ * Op queue with two tiers. The "strict" tier is always drained first, in
+ * descending priority order. The "normal" tier picks the priority to
+ * serve at random, weighted by priority value, and then flips a second
+ * coin that favours cheap items over expensive ones.
+ *
+ * Layout of each tier (Queue): an rbtree of SubQueue keyed by priority;
+ * each SubQueue is an rbtree of Klass keyed by K and served round-robin
+ * through its `next` iterator; each Klass is a list of (cost, item)
+ * pairs. Every node is heap-allocated and released through DelItem.
+ */
+template <typename T, typename K>
+class WeightedPriorityQueue :  public OpQueue <T, K>
+{
+  private:
+    /// One queued item together with its cost.
+    class ListPair : public bi::list_base_hook<>
+    {
+      public:
+        unsigned cost;
+        T item;
+        ListPair(unsigned c, T&& i) :
+          cost(c),
+          item(std::move(i))
+        {}
+    };
+    /// All queued items that share one class key, in dequeue order.
+    class Klass : public bi::set_base_hook<>
+    {
+      typedef bi::list<ListPair> ListPairs;
+      typedef typename ListPairs::iterator Lit;
+      public:
+        K key;		// klass
+        ListPairs lp;
+        Klass(K& k) :
+          key(k) {
+        }
+        ~Klass() {
+          lp.clear_and_dispose(DelItem<ListPair>());
+        }
+      friend bool operator< (const Klass &a, const Klass &b)
+        { return a.key < b.key; }
+      friend bool operator> (const Klass &a, const Klass &b)
+        { return a.key > b.key; }
+      friend bool operator== (const Klass &a, const Klass &b)
+        { return a.key == b.key; }
+      /// Append the item, or prepend it when @p front is set.
+      void insert(unsigned cost, T&& item, bool front) {
+        if (front) {
+          lp.push_front(*new ListPair(cost, std::move(item)));
+        } else {
+          lp.push_back(*new ListPair(cost, std::move(item)));
+        }
+      }
+      //Get the cost of the next item to dequeue
+      unsigned get_cost() const {
+        ceph_assert(!empty());
+        return lp.begin()->cost;
+      }
+      /// Remove and return the item at the head of the list.
+      T pop() {
+        ceph_assert(!lp.empty());
+        T ret = std::move(lp.begin()->item);
+        lp.erase_and_dispose(lp.begin(), DelItem<ListPair>());
+        return ret;
+      }
+      bool empty() const {
+        return lp.empty();
+      }
+      unsigned get_size() const {
+        return lp.size();
+      }
+      /// Remove every item. Walks back-to-front and pushes each item onto
+      /// the front of @p out (when non-null), so @p out receives the items
+      /// in their original order. Must not be called on an empty Klass:
+      /// the loop starts by decrementing lp.end().
+      void filter_class(std::list<T>* out) {
+        for (Lit i = --lp.end();; --i) {
+          if (out) {
+            out->push_front(std::move(i->item));
+          }
+          // erase returns the element after i, which is end() here because
+          // i is always the current last element
+          i = lp.erase_and_dispose(i, DelItem<ListPair>());
+          if (i == lp.begin()) {
+            break;
+          }
+        }
+      }
+    };
+    /// All classes queued at one priority, served round-robin via `next`.
+    class SubQueue : public bi::set_base_hook<>
+    {
+      typedef bi::rbtree<Klass> Klasses;
+      typedef typename Klasses::iterator Kit;
+      /// Wrap `next` back to the first class once it runs off the end.
+      void check_end() {
+        if (next == klasses.end()) {
+          next = klasses.begin();
+        }
+      }
+      public:
+        unsigned key;	// priority
+        Klasses klasses;
+        Kit next;
+        SubQueue(unsigned& p) :
+          key(p),
+          next(klasses.begin()) {
+        }
+        ~SubQueue() {
+          klasses.clear_and_dispose(DelItem<Klass>());
+        }
+      friend bool operator< (const SubQueue &a, const SubQueue &b)
+        { return a.key < b.key; }
+      friend bool operator> (const SubQueue &a, const SubQueue &b)
+        { return a.key > b.key; }
+      friend bool operator== (const SubQueue &a, const SubQueue &b)
+        { return a.key == b.key; }
+      bool empty() const {
+        return klasses.empty();
+      }
+      /// Queue the item under class @p cl, creating the Klass on first use.
+      void insert(K cl, unsigned cost, T&& item, bool front = false) {
+        typename Klasses::insert_commit_data insert_data;
+        std::pair<Kit, bool> ret =
+          klasses.insert_unique_check(cl, MapKey<Klass, K>(), insert_data);
+        if (ret.second) {
+          ret.first = klasses.insert_unique_commit(*new Klass(cl), insert_data);
+          check_end();
+        }
+        ret.first->insert(cost, std::move(item), front);
+      }
+      /// Cost of the item the next pop() would return.
+      unsigned get_cost() const {
+        ceph_assert(!empty());
+        return next->get_cost();
+      }
+      /// Pop from the class `next` points at, then advance `next`; a class
+      /// that became empty is erased.
+      T pop() {
+        T ret = next->pop();
+        if (next->empty()) {
+          next = klasses.erase_and_dispose(next, DelItem<Klass>());
+        } else {
+          ++next;
+        }
+        check_end();
+        return ret;
+      }
+      /// Drop class @p cl (if present), handing its items to @p out.
+      void filter_class(K& cl, std::list<T>* out) {
+        Kit i = klasses.find(cl, MapKey<Klass, K>());
+        if (i != klasses.end()) {
+          i->filter_class(out);
+          // decide before erasing: `i` must not be used once it has been
+          // erased and its node deleted
+          const bool was_next = (next == i);
+          Kit tmp = klasses.erase_and_dispose(i, DelItem<Klass>());
+          if (was_next) {
+            next = tmp;
+          }
+          check_end();
+        }
+      }
+      // this is intended for unit tests and should be never used on hot paths
+      unsigned get_size_slow() const {
+        unsigned count = 0;
+        for (const auto& klass : klasses) {
+          count += klass.get_size();
+        }
+        return count;
+      }
+      /// Dump the size of the class `next` points at ("num_keys") and the
+      /// cost of its head item. With no classes `next` equals end() and
+      /// must not be dereferenced, so only a zero "num_keys" is written.
+      void dump(ceph::Formatter *f) const {
+        if (empty()) {
+          f->dump_int("num_keys", 0);
+          return;
+        }
+        f->dump_int("num_keys", next->get_size());
+        f->dump_int("first_item_cost", next->get_cost());
+      }
+    };
+    /// One tier: subqueues keyed by priority plus the bookkeeping used by
+    /// the weighted pick (sum of live priorities, largest cost ever seen).
+    class Queue {
+      typedef bi::rbtree<SubQueue> SubQueues;
+      typedef typename SubQueues::iterator Sit;
+      SubQueues queues;
+      unsigned total_prio;	// sum of the keys of all live subqueues
+      unsigned max_cost;	// largest cost ever inserted; never lowered
+      public:
+        Queue() :
+          total_prio(0),
+          max_cost(0) {
+        }
+        ~Queue() {
+          queues.clear_and_dispose(DelItem<SubQueue>());
+        }
+        bool empty() const {
+          return queues.empty();
+        }
+        /// Queue the item at priority @p p / class @p cl, creating the
+        /// SubQueue (and adding @p p to total_prio) on first use.
+        void insert(unsigned p, K cl, unsigned cost, T&& item, bool front = false) {
+          typename SubQueues::insert_commit_data insert_data;
+          std::pair<typename SubQueues::iterator, bool> ret =
+            queues.insert_unique_check(p, MapKey<SubQueue, unsigned>(), insert_data);
+          if (ret.second) {
+            ret.first = queues.insert_unique_commit(*new SubQueue(p), insert_data);
+            total_prio += p;
+          }
+          ret.first->insert(cl, cost, std::move(item), front);
+          if (cost > max_cost) {
+            max_cost = cost;
+          }
+        }
+        /// Remove and return one item. With @p strict the highest priority
+        /// is always served; otherwise the priority is chosen at random as
+        /// described in the loop below. The queue must not be empty.
+        T pop(bool strict = false) {
+          // --end() on an empty tree is undefined; fail loudly instead
+          ceph_assert(!queues.empty());
+          Sit i = --queues.end();
+          if (strict) {
+            T ret = i->pop();
+            if (i->empty()) {
+              // insert() added this key to total_prio; take it back out so
+              // the sum keeps matching the live subqueues, as the weighted
+              // path and filter_class() already do
+              total_prio -= i->key;
+              queues.erase_and_dispose(i, DelItem<SubQueue>());
+            }
+            return ret;
+          }
+          if (queues.size() > 1) {
+            while (true) {
+              // Pick a new priority out of the total priority.
+              unsigned prio = rand() % total_prio + 1;
+              unsigned tp = total_prio - i->key;
+              // Find the priority corresponding to the picked number.
+              // Subtract high priorities to low priorities until the picked number
+              // is more than the total and try to dequeue that priority.
+              // Reverse the direction from previous implementation because there is a higher
+              // chance of dequeuing a high priority op so spend less time spinning.
+              while (prio <= tp) {
+                --i;
+                tp -= i->key;
+              }
+              // Flip a coin to see if this priority gets to run based on cost.
+              // The next op's cost is multiplied by .9 and subtracted from the
+              // max cost seen. Ops with lower costs will have a larger value
+              // and allow them to be selected easier than ops with high costs.
+              // The product is formed in a wider type so that a very large
+              // cost cannot wrap around before the division.
+              const unsigned long long scaled_cost =
+                (static_cast<unsigned long long>(i->get_cost()) * 9) / 10;
+              if (max_cost == 0 || rand() % max_cost <=
+                  (max_cost - scaled_cost)) {
+                break;
+              }
+              i = --queues.end();
+            }
+          }
+          T ret = i->pop();
+          if (i->empty()) {
+            total_prio -= i->key;
+            queues.erase_and_dispose(i, DelItem<SubQueue>());
+          }
+          return ret;
+        }
+        /// Drop class @p cl from every priority, handing its items to
+        /// @p out; subqueues left empty are erased.
+        void filter_class(K& cl, std::list<T>* out) {
+          for (Sit i = queues.begin(); i != queues.end();) {
+            i->filter_class(cl, out);
+            if (i->empty()) {
+              total_prio -= i->key;
+              i = queues.erase_and_dispose(i, DelItem<SubQueue>());
+            } else {
+              ++i;
+            }
+          }
+        }
+        // this is intended for unit tests and should be never used on hot paths
+        unsigned get_size_slow() const {
+          unsigned count = 0;
+          for (const auto& queue : queues) {
+            count += queue.get_size_slow();
+          }
+          return count;
+        }
+        /// For each subqueue, emit total_priority, max_cost and a
+        /// "subqueue" object holding its priority and SubQueue::dump().
+        void dump(ceph::Formatter *f) const {
+          for (typename SubQueues::const_iterator i = queues.begin();
+                i != queues.end(); ++i) {
+            f->dump_int("total_priority", total_prio);
+            f->dump_int("max_cost", max_cost);
+            f->open_object_section("subqueue");
+            f->dump_int("priority", i->key);
+            i->dump(f);
+            f->close_section();
+          }
+        }
+    };
+
+    Queue strict;
+    Queue normal;
+  public:
+    /// Both arguments are accepted but not used by this implementation.
+    /// Seeds the C library random generator from the current time.
+    WeightedPriorityQueue(unsigned max_per, unsigned min_c) :
+      strict(),
+      normal()
+      {
+        std::srand(time(0));
+      }
+    /// Remove every item of class @p cl from both tiers; removed items are
+    /// handed to @p removed when it is non-null.
+    void remove_by_class(K cl, std::list<T>* removed = 0) final {
+      strict.filter_class(cl, removed);
+      normal.filter_class(cl, removed);
+    }
+    bool empty() const final {
+      return strict.empty() && normal.empty();
+    }
+    /// Strict-tier items are queued with cost 0.
+    void enqueue_strict(K cl, unsigned p, T&& item) final {
+      strict.insert(p, cl, 0, std::move(item));
+    }
+    void enqueue_strict_front(K cl, unsigned p, T&& item) final {
+      strict.insert(p, cl, 0, std::move(item), true);
+    }
+    void enqueue(K cl, unsigned p, unsigned cost, T&& item) final {
+      normal.insert(p, cl, cost, std::move(item));
+    }
+    void enqueue_front(K cl, unsigned p, unsigned cost, T&& item) final {
+      normal.insert(p, cl, cost, std::move(item), true);
+    }
+    /// Return the next item: anything in the strict tier wins; otherwise
+    /// the normal tier's weighted pick is used. Asserts non-empty.
+    T dequeue() override {
+      ceph_assert(!empty());
+      if (!strict.empty()) {
+        return strict.pop(true);
+      }
+      return normal.pop();
+    }
+    /// Item count over both tiers; walks every class, so it is slow.
+    unsigned get_size_slow() {
+      return strict.get_size_slow() + normal.get_size_slow();
+    }
+    void dump(ceph::Formatter *f) const override {
+      f->open_array_section("high_queues");
+      strict.dump(f);
+      f->close_section();
+      f->open_array_section("queues");
+      normal.dump(f);
+      f->close_section();
+    }
+
+    void print(std::ostream &ostream) const final {
+      ostream << "WeightedPriorityQueue";
+    }
+};
+
+#endif
diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc
new file mode 100644
index 000000000..ea7ff3939
--- /dev/null
+++ b/src/common/WorkQueue.cc
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "WorkQueue.h"
+#include "include/compat.h"
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_tp
+#undef dout_prefix
+#define dout_prefix *_dout << name << " "
+
+// Build an idle pool (no threads are created here). When `option` is
+// given it is remembered in _thread_num_option and becomes the single
+// entry of the NULL-terminated _conf_keys array; otherwise _conf_keys
+// holds only the terminator.
+ThreadPool::ThreadPool(CephContext *cct_, std::string nm, std::string tn, int n, const char *option)
+  : cct(cct_), name(std::move(nm)), thread_name(std::move(tn)),
+    lockname(name + "::lock"),
+    _lock(ceph::make_mutex(lockname)),  // this should be safe due to declaration order
+    _stop(false),
+    _pause(0),
+    _draining(0),
+    _num_threads(n),
+    processing(0)
+{
+  if (option == nullptr) {
+    // nothing to track: terminator only
+    _conf_keys = new const char*[1];
+    _conf_keys[0] = NULL;
+    return;
+  }
+
+  // set up conf_keys; the pointer stored below refers into
+  // _thread_num_option, so it is taken only after the assignment
+  _thread_num_option = option;
+  _conf_keys = new const char*[2];
+  _conf_keys[0] = _thread_num_option.c_str();
+  _conf_keys[1] = NULL;
+}
+
+// Clear the heartbeat timeout on this handle's heartbeat entry.
+void ThreadPool::TPHandle::suspend_tp_timeout()
+{
+  auto *hbmap = cct->get_heartbeat_map();
+  hbmap->clear_timeout(hb);
+}
+
+// Re-arm the heartbeat timeout using the grace and suicide_grace values
+// captured when the handle was constructed.
+void ThreadPool::TPHandle::reset_tp_timeout()
+{
+  auto *hbmap = cct->get_heartbeat_map();
+  hbmap->reset_timeout(hb, grace, suicide_grace);
+}
+
+// Destroy the pool. Asserts that no worker threads remain in _threads,
+// then frees the _conf_keys array allocated by the constructor.
+ThreadPool::~ThreadPool()
+{
+  ceph_assert(_threads.empty());
+  delete[] _conf_keys;
+}
+
+// Config-observer callback. When the tracked thread-count option is among
+// the changed keys, its new value is read and, if non-negative, stored in
+// _num_threads; missing threads are started right away and all workers
+// are woken so that surplus ones can notice the lower target.
+void ThreadPool::handle_conf_change(const ConfigProxy& conf,
+				    const std::set <std::string> &changed)
+{
+  if (!changed.count(_thread_num_option)) {
+    return;
+  }
+
+  // with a length of -1 get_val() hands back a buffer that is released
+  // with free() below
+  char *buf = nullptr;
+  int r = conf.get_val(_thread_num_option.c_str(), &buf, -1);
+  ceph_assert(r >= 0);
+  int v = atoi(buf);
+  free(buf);
+  if (v >= 0) {
+    // scoped guard: start_threads() allocates and launches threads, and
+    // _lock must not stay held if that throws
+    std::lock_guard l(_lock);
+    _num_threads = v;
+    start_threads();
+    _cond.notify_all();
+  }
+}
+
+// Body of every pool thread. _lock is held for the whole function except
+// while an item is inside _void_process(). The loop ends when _stop is
+// observed at the top of an iteration, or when the pool holds more
+// threads than _num_threads and this thread retires itself.
+void ThreadPool::worker(WorkThread *wt)
+{
+  std::unique_lock ul(_lock);
+  ldout(cct,10) << "worker start" << dendl;
+
+  // register with the heartbeat map under "<pool name> thread <id>"
+  std::stringstream ss;
+  ss << name << " thread " << (void *)pthread_self();
+  auto hb = cct->get_heartbeat_map()->add_worker(ss.str(), pthread_self());
+
+  while (!_stop) {
+
+    // manage dynamic thread pool
+    join_old_threads();
+    if (_threads.size() > _num_threads) {
+      ldout(cct,1) << " worker shutting down; too many threads (" << _threads.size() << " > " << _num_threads << ")" << dendl;
+      // move ourselves to _old_threads; someone else joins and deletes us
+      _threads.erase(wt);
+      _old_threads.push_back(wt);
+      break;
+    }
+
+    if (work_queues.empty()) {
+      ldout(cct, 10) << "worker no work queues" << dendl;
+    } else if (!_pause) {
+      WorkQueue_* wq;
+      // poll the queues round-robin, at most two full passes
+      int tries = 2 * work_queues.size();
+      bool did = false;
+      while (tries--) {
+	next_work_queue %= work_queues.size();
+	wq = work_queues[next_work_queue++];
+	
+	void *item = wq->_void_dequeue();
+	if (item) {
+	  processing++;
+	  ldout(cct,12) << "worker wq " << wq->name << " start processing " << item
+			<< " (" << processing << " active)" << dendl;
+	  // run the item without the pool lock, under the queue's timeouts
+	  ul.unlock();
+	  TPHandle tp_handle(cct, hb, wq->timeout_interval, wq->suicide_interval);
+	  tp_handle.reset_tp_timeout();
+	  wq->_void_process(item, tp_handle);
+	  ul.lock();
+	  // finish step runs with the pool lock held again
+	  wq->_void_process_finish(item);
+	  processing--;
+	  ldout(cct,15) << "worker wq " << wq->name << " done processing " << item
+			<< " (" << processing << " active)" << dendl;
+	  // wake pause()/drain() callers blocked on _wait_cond
+	  if (_pause || _draining)
+	    _wait_cond.notify_all();
+	  did = true;
+	  break;
+	}
+      }
+      // got work: loop again immediately instead of sleeping
+      if (did)
+	continue;
+    }
+
+    // nothing to do: switch the heartbeat to the idle timeout (suicide
+    // grace of 0) and sleep on _cond for at most
+    // threadpool_empty_queue_max_wait seconds
+    ldout(cct,20) << "worker waiting" << dendl;
+    cct->get_heartbeat_map()->reset_timeout(
+      hb,
+      ceph::make_timespan(cct->_conf->threadpool_default_timeout),
+      ceph::make_timespan(0));
+    auto wait = std::chrono::seconds(
+      cct->_conf->threadpool_empty_queue_max_wait);
+    _cond.wait_for(ul, wait);
+  }
+  ldout(cct,1) << "worker finish" << dendl;
+
+  cct->get_heartbeat_map()->remove_worker(hb);
+}
+
+// Create worker threads until _threads holds _num_threads entries.
+// The caller must already hold _lock (asserted).
+void ThreadPool::start_threads()
+{
+  ceph_assert(ceph_mutex_is_locked(_lock));
+  for (; _threads.size() < _num_threads; ) {
+    WorkThread *fresh = new WorkThread(this);
+    ldout(cct, 10) << "start_threads creating and starting " << fresh << dendl;
+    // record the thread before launching it
+    _threads.insert(fresh);
+
+    fresh->create(thread_name.c_str());
+  }
+}
+
+// Join and delete every thread parked on _old_threads, front to back.
+// The caller must already hold _lock (asserted).
+void ThreadPool::join_old_threads()
+{
+  ceph_assert(ceph_mutex_is_locked(_lock));
+  while (!_old_threads.empty()) {
+    WorkThread *retired = _old_threads.front();
+    ldout(cct, 10) << "join_old_threads joining and deleting " << retired << dendl;
+    retired->join();
+    delete retired;
+    _old_threads.pop_front();
+  }
+}
+
+// Start the pool: register as a config observer when a thread-count
+// option is tracked, then launch the worker threads under _lock.
+void ThreadPool::start()
+{
+  ldout(cct,10) << "start" << dendl;
+
+  if (!_thread_num_option.empty()) {
+    ldout(cct, 10) << " registering config observer on " << _thread_num_option << dendl;
+    cct->_conf.add_observer(this);
+  }
+
+  {
+    std::lock_guard locker(_lock);
+    start_threads();
+  }
+  ldout(cct,15) << "started" << dendl;
+}
+
+// Stop the pool: unregister the config observer (if one was registered),
+// tell every worker to exit, join and delete all threads, clear every
+// registered work queue and finally reset _stop so the pool can be
+// started again.
+// NOTE(review): `clear_after` is never read in this body; the queues are
+// cleared unconditionally -- confirm whether that is intended.
+void ThreadPool::stop(bool clear_after)
+{
+  ldout(cct,10) << "stop" << dendl;
+
+  if (_thread_num_option.length()) {
+    ldout(cct, 10) << " unregistering config observer on " << _thread_num_option << dendl;
+    cct->_conf.remove_observer(this);
+  }
+
+  // raise the stop flag and wake all sleeping workers
+  _lock.lock();
+  _stop = true;
+  _cond.notify_all();
+  join_old_threads();
+  _lock.unlock();
+  // join with _lock released: an exiting worker still holds _lock until
+  // its worker() call returns
+  for (auto p = _threads.begin(); p != _threads.end(); ++p) {
+    (*p)->join();
+    delete *p;
+  }
+  _threads.clear();
+  _lock.lock();
+  for (unsigned i=0; i<work_queues.size(); i++)
+    work_queues[i]->_clear();
+  _stop = false;
+  _lock.unlock();
+  ldout(cct,15) << "stopped" << dendl;
+}
+
+// Increment the pause count and block until no item is being processed.
+// Workers signal _wait_cond after each item while a pause is pending.
+void ThreadPool::pause()
+{
+  std::unique_lock locker(_lock);
+  ldout(cct,10) << "pause" << dendl;
+  ++_pause;
+  while (processing != 0) {
+    _wait_cond.wait(locker);
+  }
+  ldout(cct,15) << "paused" << dendl;
+}
+
+// Increment the pause count without waiting for in-flight items.
+void ThreadPool::pause_new()
+{
+  ldout(cct,10) << "pause_new" << dendl;
+  std::lock_guard locker(_lock);
+  ++_pause;
+}
+
+// Undo one pause()/pause_new(): decrement the pause count (asserted to be
+// positive) and wake all workers waiting on _cond.
+void ThreadPool::unpause()
+{
+  ldout(cct,10) << "unpause" << dendl;
+  std::lock_guard locker(_lock);
+  ceph_assert(_pause > 0);
+  --_pause;
+  _cond.notify_all();
+}
+
+// Block until nothing is being processed by the pool and, when a queue
+// is supplied, until that queue reports itself empty. _draining is
+// raised for the duration so workers signal _wait_cond after each item.
+void ThreadPool::drain(WorkQueue_* wq)
+{
+  std::unique_lock locker(_lock);
+  ldout(cct,10) << "drain" << dendl;
+  ++_draining;
+  auto still_busy = [&] {
+    return processing || (wq != NULL && !wq->_empty());
+  };
+  while (still_busy()) {
+    _wait_cond.wait(locker);
+  }
+  --_draining;
+}
+
+// Build an idle sharded pool: records the name, thread name and thread
+// count, creates the pool mutex named "<name>::lock" and starts with no
+// work queue attached (wq is NULL). No threads are created here.
+ShardedThreadPool::ShardedThreadPool(CephContext *pcct_, std::string nm, std::string tn,
+				     uint32_t pnum_threads):
+  cct(pcct_),
+  name(std::move(nm)),
+  thread_name(std::move(tn)),
+  lockname(name + "::lock"),
+  shardedpool_lock(ceph::make_mutex(lockname)),
+  num_threads(pnum_threads),
+  num_paused(0),
+  num_drained(0),
+  wq(NULL) {}
+
+// Body of every sharded pool thread. Until stop_threads is set, each
+// iteration honours a pending pause, then a pending drain, and otherwise
+// calls wq->_process() for this thread's index. A work queue must have
+// been attached (asserted).
+void ShardedThreadPool::shardedthreadpool_worker(uint32_t thread_index)
+{
+  ceph_assert(wq != NULL);
+  ldout(cct,10) << "worker start" << dendl;
+
+  // register with the heartbeat map under "<pool name> thread <id>"
+  std::stringstream ss;
+  ss << name << " thread " << (void *)pthread_self();
+  auto hb = cct->get_heartbeat_map()->add_worker(ss.str(), pthread_self());
+
+  while (!stop_threads) {
+    if (pause_threads) {
+      std::unique_lock ul(shardedpool_lock);
+      // report ourselves paused so pause() can count us
+      ++num_paused;
+      wait_cond.notify_all();
+      // park here, refreshing the heartbeat on every wake-up, until
+      // pause_threads is cleared
+      while (pause_threads) {
+       cct->get_heartbeat_map()->reset_timeout(
+	        hb,
+	        wq->timeout_interval,
+		wq->suicide_interval);
+       shardedpool_cond.wait_for(
+	 ul,
+	 std::chrono::seconds(cct->_conf->threadpool_empty_queue_max_wait));
+      }
+      --num_paused;
+    }
+    if (drain_threads) {
+      std::unique_lock ul(shardedpool_lock);
+      // only count as drained once this thread's shard is empty;
+      // otherwise fall through and keep processing
+      if (wq->is_shard_empty(thread_index)) {
+        ++num_drained;
+        wait_cond.notify_all();
+        while (drain_threads) {
+	  cct->get_heartbeat_map()->reset_timeout(
+	    hb,
+	    wq->timeout_interval,
+	    wq->suicide_interval);
+          shardedpool_cond.wait_for(
+	    ul,
+	    std::chrono::seconds(cct->_conf->threadpool_empty_queue_max_wait));
+        }
+        --num_drained;
+      }
+    }
+
+    // normal path: re-arm the heartbeat and let the queue do one round
+    cct->get_heartbeat_map()->reset_timeout(
+      hb,
+      wq->timeout_interval,
+      wq->suicide_interval);
+    wq->_process(thread_index, hb);
+
+  }
+
+  ldout(cct,10) << "sharded worker finish" << dendl;
+
+  cct->get_heartbeat_map()->remove_worker(hb);
+
+}
+
+// Create threads until threads_shardedpool holds num_threads entries.
+// Thread indices are handed out starting from 0 on every call. The
+// caller must already hold shardedpool_lock (asserted).
+void ShardedThreadPool::start_threads()
+{
+  ceph_assert(ceph_mutex_is_locked(shardedpool_lock));
+  for (int32_t thread_index = 0;
+       threads_shardedpool.size() < num_threads;
+       thread_index++) {
+    WorkThreadSharded *fresh = new WorkThreadSharded(this, thread_index);
+    ldout(cct, 10) << "start_threads creating and starting " << fresh << dendl;
+    threads_shardedpool.push_back(fresh);
+    fresh->create(thread_name.c_str());
+  }
+}
+
+// Launch the pool's threads while holding shardedpool_lock.
+void ShardedThreadPool::start()
+{
+  ldout(cct,10) << "start" << dendl;
+
+  {
+    std::lock_guard locker(shardedpool_lock);
+    start_threads();
+  }
+  ldout(cct,15) << "started" << dendl;
+}
+
+// Raise stop_threads, ask the work queue to release any threads waiting
+// inside it, then join and delete every thread of the pool.
+void ShardedThreadPool::stop()
+{
+  ldout(cct,10) << "stop" << dendl;
+  stop_threads = true;
+  ceph_assert(wq != NULL);
+  wq->return_waiting_threads();
+  for (auto *thread : threads_shardedpool) {
+    thread->join();
+    delete thread;
+  }
+  threads_shardedpool.clear();
+  ldout(cct,15) << "stopped" << dendl;
+}
+
+// Raise pause_threads and block until every thread of the pool has
+// counted itself in num_paused.
+void ShardedThreadPool::pause()
+{
+  std::unique_lock locker(shardedpool_lock);
+  ldout(cct,10) << "pause" << dendl;
+  pause_threads = true;
+  ceph_assert(wq != NULL);
+  // pull threads out of the work queue so they can see the flag
+  wq->return_waiting_threads();
+  while (num_paused != num_threads) {
+    wait_cond.wait(locker);
+  }
+  ldout(cct,10) << "paused" << dendl;
+}
+
+// Raise pause_threads and release threads waiting in the work queue, but
+// do not wait for them to report themselves paused.
+void ShardedThreadPool::pause_new()
+{
+  ldout(cct,10) << "pause_new" << dendl;
+  {
+    std::lock_guard locker(shardedpool_lock);
+    pause_threads = true;
+    ceph_assert(wq != NULL);
+    wq->return_waiting_threads();
+  }
+  ldout(cct,10) << "paused_new" << dendl;
+}
+
+// Clear pause_threads, let the work queue hold waiting threads again and
+// wake every thread parked on shardedpool_cond.
+void ShardedThreadPool::unpause()
+{
+  ldout(cct,10) << "unpause" << dendl;
+  {
+    std::lock_guard locker(shardedpool_lock);
+    pause_threads = false;
+    wq->stop_return_waiting_threads();
+    shardedpool_cond.notify_all();
+  }
+  ldout(cct,10) << "unpaused" << dendl;
+}
+
+// Raise drain_threads and block until every thread has counted itself in
+// num_drained (each does so once its own shard is empty); then clear the
+// flag and wake the parked threads so they resume normal processing.
+void ShardedThreadPool::drain()
+{
+  std::unique_lock locker(shardedpool_lock);
+  ldout(cct,10) << "drain" << dendl;
+  drain_threads = true;
+  ceph_assert(wq != NULL);
+  wq->return_waiting_threads();
+  while (num_drained != num_threads) {
+    wait_cond.wait(locker);
+  }
+  // drained: release the workers again
+  drain_threads = false;
+  wq->stop_return_waiting_threads();
+  shardedpool_cond.notify_all();
+  ldout(cct,10) << "drained" << dendl;
+}
+
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
new file mode 100644
index 000000000..bb9e1b66b
--- /dev/null
+++ b/src/common/WorkQueue.h
@@ -0,0 +1,680 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_WORKQUEUE_H
+#define CEPH_WORKQUEUE_H
+
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+// for ObjectStore.h
+// Empty placeholder used in this preprocessor branch: it only supplies
+// the nested name ThreadPool::TPHandle.
+struct ThreadPool {
+  struct TPHandle {};
+};
+
+#else
+
+#include <atomic>
+#include <list>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "common/ceph_mutex.h"
+#include "include/unordered_map.h"
+#include "common/config_obs.h"
+#include "common/HeartbeatMap.h"
+#include "common/Thread.h"
+#include "include/common_fwd.h"
+#include "include/Context.h"
+#include "common/HBHandle.h"
+
+
+/// Pool of threads that share work submitted to multiple work queues.
+class ThreadPool : public md_config_obs_t {
+protected:
+  CephContext *cct;
+  std::string name;
+  std::string thread_name;
+  std::string lockname;
+  ceph::mutex _lock;
+  ceph::condition_variable _cond;
+  bool _stop;
+  int _pause;
+  int _draining;
+  ceph::condition_variable _wait_cond;
+
+public:
+  /// HBHandle implementation bound to one heartbeat entry. It stores the
+  /// context, the heartbeat handle and the two grace periods that
+  /// reset_tp_timeout() passes on to the heartbeat map.
+  class TPHandle : public HBHandle {
+    friend class ThreadPool;
+    CephContext *cct;
+    ceph::heartbeat_handle_d *hb;
+    ceph::timespan grace;
+    ceph::timespan suicide_grace;
+  public:
+    TPHandle(CephContext *cct_,
+             ceph::heartbeat_handle_d *hb_,
+             ceph::timespan grace_,
+             ceph::timespan suicide_grace_)
+      : cct(cct_),
+        hb(hb_),
+        grace(grace_),
+        suicide_grace(suicide_grace_)
+    {}
+    void reset_tp_timeout() override final;
+    void suspend_tp_timeout() override final;
+  };
+protected:
+
+  /// Type-erased work queue interface driven by the worker threads.
+  /// Items travel as void*; the templated queues below convert them back.
+  struct WorkQueue_ {
+    std::string name;
+    ceph::timespan timeout_interval;
+    ceph::timespan suicide_interval;
+
+    WorkQueue_(std::string n, ceph::timespan ti, ceph::timespan sti)
+      : name(std::move(n)),
+        timeout_interval(ti),
+        suicide_interval(sti)
+    {}
+    virtual ~WorkQueue_() = default;
+
+    /// Drop every queued work item.
+    virtual void _clear() = 0;
+    /// Report whether the queue has nothing to do.
+    virtual bool _empty() = 0;
+    /// Hand out the next work item, or a null pointer when there is none.
+    virtual void *_void_dequeue() = 0;
+    /** @brief Process the work item.
+     * Worker threads call this without the pool lock, so several calls
+     * may run in parallel and implementations must be thread-safe. */
+    virtual void _void_process(void *item, TPHandle &handle) = 0;
+    /** @brief Synchronously finish processing a work item.
+     * Called after _void_process with the thread pool lock held, so at
+     * most one copy runs at a time for a given thread pool. Suitable for
+     * finalization that is not thread-safe. */
+    virtual void _void_process_finish(void *) = 0;
+
+    /// Replace timeout_interval with @p ti converted by make_timespan().
+    void set_timeout(time_t ti) {
+      timeout_interval = ceph::make_timespan(ti);
+    }
+    /// Replace suicide_interval with @p sti converted by make_timespan().
+    void set_suicide_timeout(time_t sti) {
+      suicide_interval = ceph::make_timespan(sti);
+    }
+  };
+
+  // track thread pool size changes
+  unsigned _num_threads;
+  std::string _thread_num_option;
+  const char **_conf_keys;
+
+  /// Config-observer hook: returns the _conf_keys array (the tracked
+  /// option names) without copying it.
+  const char **get_tracked_conf_keys() const override {
+    return _conf_keys;
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+			  const std::set <std::string> &changed) override;
+
+public:
+  /** @brief Templated by-value work queue.
+   * Skeleton for a queue whose items are submitted and processed by value,
+   * which suits primitive values or very small objects. Construction
+   * registers the queue with the thread pool and destruction removes it.
+   *
+   * Because the worker interface passes void*, dequeued values are parked
+   * in `to_process` / `to_finish` (guarded by the queue's own `_lock`)
+   * and the pointer handed to the worker is only a non-null marker. */
+  template<typename T, typename U = T>
+  class WorkQueueVal : public WorkQueue_ {
+    ceph::mutex _lock = ceph::make_mutex("WorkQueueVal::_lock");
+    ThreadPool *pool;
+    std::list<U> to_process;
+    std::list<U> to_finish;
+    virtual void _enqueue(T) = 0;
+    virtual void _enqueue_front(T) = 0;
+    bool _empty() override = 0;
+    virtual U _dequeue() = 0;
+    virtual void _process_finish(U) {}
+
+    /// Move one value from the subclass queue to `to_process`.
+    void *_void_dequeue() override {
+      std::lock_guard guard(_lock);
+      if (_empty()) {
+        return nullptr;
+      }
+      U dequeued = _dequeue();
+      to_process.push_back(dequeued);
+      return reinterpret_cast<void*>(1); // Not used
+    }
+    /// Take the oldest parked value, process it without `_lock` held,
+    /// then park it on `to_finish`.
+    void _void_process(void *, TPHandle &handle) override {
+      std::unique_lock guard(_lock);
+      ceph_assert(!to_process.empty());
+      U u = to_process.front();
+      to_process.pop_front();
+      guard.unlock();
+
+      _process(u, handle);
+
+      guard.lock();
+      to_finish.push_back(u);
+    }
+
+    /// Take the oldest value from `to_finish` and pass it to
+    /// _process_finish() with `_lock` released.
+    void _void_process_finish(void *) override {
+      std::unique_lock guard(_lock);
+      ceph_assert(!to_finish.empty());
+      U u = to_finish.front();
+      to_finish.pop_front();
+      guard.unlock();
+
+      _process_finish(u);
+    }
+
+    void _clear() override {}
+
+  public:
+    WorkQueueVal(std::string n,
+                 ceph::timespan ti,
+                 ceph::timespan sti,
+                 ThreadPool *p)
+      : WorkQueue_(std::move(n), ti, sti), pool(p) {
+      pool->add_work_queue(this);
+    }
+    ~WorkQueueVal() override {
+      pool->remove_work_queue(this);
+    }
+    /// Enqueue under the pool lock and wake one worker.
+    void queue(T item) {
+      std::lock_guard pool_guard(pool->_lock);
+      _enqueue(item);
+      pool->_cond.notify_one();
+    }
+    /// Same as queue(), but through _enqueue_front().
+    void queue_front(T item) {
+      std::lock_guard pool_guard(pool->_lock);
+      _enqueue_front(item);
+      pool->_cond.notify_one();
+    }
+    void drain() {
+      pool->drain(this);
+    }
+  protected:
+    /// Take / release the thread pool lock.
+    void lock() {
+      pool->lock();
+    }
+    void unlock() {
+      pool->unlock();
+    }
+    virtual void _process(U u, TPHandle &) = 0;
+  };
+
+  /** @brief Template by-pointer work queue.
+   * Skeleton for a queue whose items of a given type are submitted as
+   * pointers, which suits large items or items owning dynamic memory.
+   * Construction registers the queue with the thread pool and destruction
+   * removes it. */
+  template<class T>
+  class WorkQueue : public WorkQueue_ {
+    ThreadPool *pool;
+
+    /// Add a work item to the queue.
+    virtual bool _enqueue(T *) = 0;
+    /// Dequeue a previously submitted work item.
+    virtual void _dequeue(T *) = 0;
+    /// Dequeue a work item and return the original submitted pointer.
+    virtual T *_dequeue() = 0;
+    virtual void _process_finish(T *) {}
+
+    // WorkQueue_ interface: convert between void* and T*
+    void *_void_dequeue() override {
+      T *next_item = _dequeue();
+      return (void *)next_item;
+    }
+    void _void_process(void *p, TPHandle &handle) override {
+      T *typed = static_cast<T *>(p);
+      _process(typed, handle);
+    }
+    void _void_process_finish(void *p) override {
+      T *typed = static_cast<T *>(p);
+      _process_finish(typed);
+    }
+
+  protected:
+    /// Process a work item. Called from the worker threads.
+    virtual void _process(T *t, TPHandle &) = 0;
+
+  public:
+    WorkQueue(std::string n,
+              ceph::timespan ti, ceph::timespan sti,
+              ThreadPool* p)
+      : WorkQueue_(std::move(n), ti, sti), pool(p) {
+      pool->add_work_queue(this);
+    }
+    ~WorkQueue() override {
+      pool->remove_work_queue(this);
+    }
+
+    /// Enqueue under the pool lock and wake one worker; returns whatever
+    /// _enqueue() returned.
+    bool queue(T *item) {
+      std::lock_guard pool_guard(pool->_lock);
+      const bool accepted = _enqueue(item);
+      pool->_cond.notify_one();
+      return accepted;
+    }
+    /// Remove a specific item, under the pool lock.
+    void dequeue(T *item) {
+      std::lock_guard pool_guard(pool->_lock);
+      _dequeue(item);
+    }
+    /// Clear the queue, under the pool lock.
+    void clear() {
+      std::lock_guard pool_guard(pool->_lock);
+      _clear();
+    }
+
+    /// take the thread pool lock
+    void lock() {
+      pool->lock();
+    }
+    /// release the thread pool lock
+    void unlock() {
+      pool->unlock();
+    }
+    /// wake up the thread pool (without lock held)
+    void wake() {
+      pool->wake();
+    }
+    /// wake up the thread pool (with lock already held)
+    void _wake() {
+      pool->_wake();
+    }
+    void _wait() {
+      pool->_wait();
+    }
+    void drain() {
+      pool->drain(this);
+    }
+
+  };
+
+  template<typename T>
+  class PointerWQ : public WorkQueue_ {
+  public:
+    ~PointerWQ() override {
+      m_pool->remove_work_queue(this);
+      ceph_assert(m_processing == 0);
+    }
+    void drain() {
+      {
+        // if this queue is empty and not processing, don't wait for other
+        // queues to finish processing
+        std::lock_guard l(m_pool->_lock);
+        if (m_processing == 0 && m_items.empty()) {
+          return;
+        }
+      }
+      m_pool->drain(this);
+    }
+    void queue(T *item) {
+      std::lock_guard l(m_pool->_lock);
+      m_items.push_back(item);
+      m_pool->_cond.notify_one();
+    }
+    bool empty() {
+      std::lock_guard l(m_pool->_lock);
+      return _empty();
+    }
+  protected:
+    PointerWQ(std::string n,
+	      ceph::timespan ti, ceph::timespan sti,
+	      ThreadPool* p)
+      : WorkQueue_(std::move(n), ti, sti), m_pool(p), m_processing(0) {
+    }
+    void register_work_queue() {
+      m_pool->add_work_queue(this);
+    }
+    void _clear() override {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      m_items.clear();
+    }
+    bool _empty() override {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      return m_items.empty();
+    }
+    void *_void_dequeue() override {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      if (m_items.empty()) {
+        return NULL;
+      }
+
+      ++m_processing;
+      T *item = m_items.front();
+      m_items.pop_front();
+      return item;
+    }
+    void _void_process(void *item, ThreadPool::TPHandle &handle) override {
+      process(reinterpret_cast<T *>(item));
+    }
+    void _void_process_finish(void *item) override {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      ceph_assert(m_processing > 0);
+      --m_processing;
+    }
+
+    virtual void process(T *item) = 0;
+    void process_finish() {
+      std::lock_guard locker(m_pool->_lock);
+      _void_process_finish(nullptr);
+    }
+
+    T *front() {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      if (m_items.empty()) {
+        return NULL;
+      }
+      return m_items.front();
+    }
+    /// Put an in-flight @p item back at the head of the queue: under the pool
+    /// lock, the in-flight count is decremented and the item is re-inserted
+    /// first in line.
+    void requeue_front(T *item) {
+      std::lock_guard guard(m_pool->_lock);
+      _void_process_finish(nullptr);
+      m_items.emplace_front(item);
+    }
+    /// Put an in-flight @p item back at the tail of the queue: under the pool
+    /// lock, the in-flight count is decremented and the item is re-inserted
+    /// last in line.
+    void requeue_back(T *item) {
+      std::lock_guard guard(m_pool->_lock);
+      _void_process_finish(nullptr);
+      m_items.emplace_back(item);
+    }
+    /// Take the pool lock and notify one waiter on the pool's condition
+    /// variable.
+    void signal() {
+      std::lock_guard guard(m_pool->_lock);
+      m_pool->_cond.notify_one();
+    }
+    /// @return a reference to the owning pool's mutex, the same lock the
+    /// underscore-prefixed methods of this queue assert is held.
+    ceph::mutex &get_pool_lock() {
+      ceph::mutex &pool_lock = m_pool->_lock;
+      return pool_lock;
+    }
+  private:
+    /// pool this queue belongs to; its _lock and _cond guard the fields below
+    ThreadPool *m_pool;
+    /// queued item pointers; _void_dequeue() takes from the front
+    std::list<T *> m_items;
+    /// items handed out by _void_dequeue() and not yet reported finished
+    uint32_t m_processing;
+  };
+protected:
+  /// queues attached with add_work_queue() and detached with
+  /// remove_work_queue(); both modify it under _lock
+  std::vector<WorkQueue_*> work_queues;
+  /// NOTE(review): presumably the index of the next queue a worker should
+  /// poll; it is not used in this part of the header -- confirm in worker()
+  int next_work_queue = 0;
+ 
+
+  // threads
+  /// Thread object whose entry point runs ThreadPool::worker() on the pool
+  /// it was created for.
+  struct WorkThread : public Thread {
+    ThreadPool *pool;
+    // cppcheck-suppress noExplicitConstructor
+    WorkThread(ThreadPool *p) : pool(p) {}
+    void *entry() override {
+      ThreadPool *const owner = pool;
+      owner->worker(this);
+      return nullptr;
+    }
+  };
+  
+  /// NOTE(review): presumably the worker threads that are currently part of
+  /// the pool -- maintained outside this view, confirm in start_threads()
+  std::set<WorkThread*> _threads;
+  std::list<WorkThread*> _old_threads;  ///< need to be joined
+  /// NOTE(review): presumably a count of items being processed across all
+  /// queues -- it is only declared here, confirm in worker()/drain()
+  int processing;
+
+  // Defined out of line; their bodies are not part of this header.
+  void start_threads();
+  void join_old_threads();
+  /// Body of every WorkThread: WorkThread::entry() calls this with itself.
+  virtual void worker(WorkThread *wt);
+
+public:
+  /// Defined out of line.
+  /// NOTE(review): from the names, @p nm looks like the pool name, @p tn the
+  /// thread name, @p n the thread count and @p option an optional config key
+  /// -- confirm against the definition.
+  ThreadPool(CephContext *cct_, std::string nm, std::string tn, int n, const char *option = NULL);
+  ~ThreadPool() override;
+
+  /// Read the pool's thread count (_num_threads) under the pool lock.
+  int get_num_threads() {
+    std::lock_guard guard(_lock);
+    const int count = _num_threads;
+    return count;
+  }
+  
+  /// Attach @p wq to this pool by appending it to work_queues under the pool
+  /// lock.
+  void add_work_queue(WorkQueue_* wq) {
+    std::lock_guard guard(_lock);
+    work_queues.emplace_back(wq);
+  }
+  /// remove a work queue from this thread pool
+  ///
+  /// Removes the first entry equal to @p wq from work_queues, keeping the
+  /// relative order of the remaining queues.  @p wq must have been added with
+  /// add_work_queue(); passing an unknown queue trips an assertion instead of
+  /// scanning past the end of the vector.
+  void remove_work_queue(WorkQueue_* wq) {
+    std::lock_guard l(_lock);
+    auto it = work_queues.begin();
+    // bounded search: never step past the last registered queue
+    while (it != work_queues.end() && *it != wq)
+      ++it;
+    ceph_assert(it != work_queues.end());
+    work_queues.erase(it);
+  }
+
+  /// Acquire the pool mutex; pair every call with unlock().
+  void lock() { _lock.lock(); }
+  /// Release the pool mutex previously acquired with lock().
+  void unlock() { _lock.unlock(); }
+
+  /// wait for a kick on this thread pool
+  ///
+  /// Blocks on @p c using the pool lock.  The caller must already hold the
+  /// pool lock (see lock()) and still holds it when this returns.
+  void wait(ceph::condition_variable &c) {
+    // adopt_lock: wrap the mutex the caller already owns so the condition
+    // variable can release and re-acquire it while waiting
+    std::unique_lock l(_lock, std::adopt_lock);
+    c.wait(l);
+    // The mutex belongs to the caller.  Detach it from the guard, otherwise
+    // the guard's destructor would unlock it and the caller's own unlock()
+    // would then be a double unlock.
+    l.release();
+  }
+
+  /// Notify every waiter on the pool condition variable.  Underscore variant:
+  /// the caller already holds the pool lock.
+  void _wake() { _cond.notify_all(); }
+  /// Notify every waiter on the pool condition variable.  Takes the pool lock
+  /// itself, so the caller must not hold it.
+  void wake() {
+    std::lock_guard guard(_lock);
+    _cond.notify_all();
+  }
+  /// Block on the pool's own condition variable.  The caller must already
+  /// hold the pool lock and still holds it when this returns.
+  void _wait() {
+    // adopt_lock: wrap the mutex the caller already owns for the wait
+    std::unique_lock l(_lock, std::adopt_lock);
+    _cond.wait(l);
+    // Hand the mutex back to the caller instead of letting the guard's
+    // destructor unlock it behind the caller's back.
+    l.release();
+  }
+
+  /// start the thread pool's threads
+  void start();
+  /// stop the thread pool's threads
+  /// NOTE(review): @p clear_after presumably asks for the queues to be
+  /// cleared once the threads have stopped -- confirm in the definition
+  void stop(bool clear_after=true);
+  /// pause thread pool (if it is not already paused)
+  void pause();
+  /// pause initiation of new work
+  void pause_new();
+  /// resume work in thread pool.  must match each pause() call 1:1 to resume.
+  void unpause();
+  /** @brief Wait until work completes.
+   * If the parameter is NULL, blocks until all threads are idle.
+   * If it is not NULL, blocks until the given work queue does not have
+   * any items left to process. */
+  void drain(WorkQueue_* wq = 0);
+};
+
+/// Work queue of GenContext callbacks.  Items are kept in a plain FIFO list
+/// and each one is completed with the worker's TPHandle when processed.
+class GenContextWQ :
+  public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
+  /// pending callbacks, oldest at the front
+  std::list<GenContext<ThreadPool::TPHandle&>*> _queue;
+public:
+  /// @p ti is passed to the base as the first interval and ten times @p ti as
+  /// the second one.
+  GenContextWQ(const std::string &name, ceph::timespan ti, ThreadPool *tp)
+    : ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*>(
+        name, ti, ti*10, tp) {}
+
+  /// Add @p c behind everything already queued.
+  void _enqueue(GenContext<ThreadPool::TPHandle&> *c) override {
+    _queue.emplace_back(c);
+  }
+  /// Add @p c ahead of everything already queued.
+  void _enqueue_front(GenContext<ThreadPool::TPHandle&> *c) override {
+    _queue.emplace_front(c);
+  }
+  /// @return true when no callback is pending
+  bool _empty() override {
+    const bool nothing_pending = _queue.empty();
+    return nothing_pending;
+  }
+  /// Remove and return the oldest callback; the queue must not be empty.
+  GenContext<ThreadPool::TPHandle&> *_dequeue() override {
+    ceph_assert(!_queue.empty());
+    auto *head = _queue.front();
+    _queue.pop_front();
+    return head;
+  }
+  /// Run the callback by completing it with the worker's handle.
+  void _process(GenContext<ThreadPool::TPHandle&> *c,
+                ThreadPool::TPHandle &tp) override {
+    c->complete(tp);
+  }
+};
+
+/// Context adapter: when finished, it queues the wrapped GenContext on the
+/// given GenContextWQ.  The completion code is ignored.
+class C_QueueInWQ : public Context {
+  GenContextWQ *wq;                       ///< destination queue
+  GenContext<ThreadPool::TPHandle&> *c;   ///< callback to enqueue
+public:
+  C_QueueInWQ(GenContextWQ *wq, GenContext<ThreadPool::TPHandle &> *c)
+    : wq(wq),
+      c(c) {}
+  void finish(int) override {
+    GenContextWQ *const target = wq;
+    target->queue(c);
+  }
+};
+
+/// Work queue that asynchronously completes contexts (executes callbacks).
+/// A non-zero completion code given to queue() is remembered per context and
+/// handed to Context::complete() when the context is processed.
+/// @see Finisher
+class ContextWQ : public ThreadPool::PointerWQ<Context> {
+public:
+  /// Creates the queue with a zero second interval and registers it with
+  /// @p tp right away.
+  ContextWQ(const std::string &name, ceph::timespan ti, ThreadPool *tp)
+    : ThreadPool::PointerWQ<Context>(name, ti, ceph::timespan::zero(), tp) {
+    this->register_work_queue();
+  }
+
+  /// Queue @p ctx for completion with code @p result.  Only non-zero codes
+  /// are stored; a context without a stored code completes with 0.
+  void queue(Context *ctx, int result = 0) {
+    if (result != 0) {
+      std::lock_guard results_locker(m_lock);
+      m_context_results[ctx] = result;
+    }
+    ThreadPool::PointerWQ<Context>::queue(ctx);
+  }
+protected:
+  /// Drop all queued contexts and then all remembered completion codes.
+  void _clear() override {
+    ThreadPool::PointerWQ<Context>::_clear();
+
+    std::lock_guard results_locker(m_lock);
+    m_context_results.clear();
+  }
+
+  /// Complete @p ctx with its remembered code (consumed here), or with 0
+  /// when none was stored.  The context is completed outside m_lock.
+  void process(Context *ctx) override {
+    int result = 0;
+    {
+      std::lock_guard results_locker(m_lock);
+      if (auto found = m_context_results.find(ctx);
+          found != m_context_results.end()) {
+        result = found->second;
+        m_context_results.erase(found);
+      }
+    }
+    ctx->complete(result);
+  }
+private:
+  /// guards m_context_results
+  ceph::mutex m_lock = ceph::make_mutex("ContextWQ::m_lock");
+  /// non-zero completion codes of contexts that are still queued
+  ceph::unordered_map<Context*, int> m_context_results;
+};
+
+/// Thread pool that drives a single sharded work queue: every worker thread
+/// is created with its own index and runs shardedthreadpool_worker() with
+/// that index.  Only the interface is visible here; start/stop/pause/drain
+/// and the worker loop are defined out of line.
+class ShardedThreadPool {
+
+  CephContext *cct;
+  std::string name;
+  std::string thread_name;
+  std::string lockname;
+  // NOTE(review): presumably the mutex and condition variables behind
+  // pause()/drain() -- their use is not visible in this header
+  ceph::mutex shardedpool_lock;
+  ceph::condition_variable shardedpool_cond;
+  ceph::condition_variable wait_cond;
+  uint32_t num_threads;
+
+  // request flags, all initially false
+  std::atomic<bool> stop_threads = { false };
+  std::atomic<bool> pause_threads = { false };
+  std::atomic<bool> drain_threads = { false };
+
+  // no in-class initializer: expected to be set by the out-of-line
+  // constructor -- TODO confirm
+  uint32_t num_paused;
+  uint32_t num_drained;
+
+public:
+
+  /// Type-erased interface the pool uses to talk to its work queue.
+  class BaseShardedWQ {
+  
+  public:
+    ceph::timespan timeout_interval, suicide_interval;
+    BaseShardedWQ(ceph::timespan ti, ceph::timespan sti)
+      :timeout_interval(ti), suicide_interval(sti) {}
+    virtual ~BaseShardedWQ() {}
+
+    /// NOTE(review): presumably called by the worker with its own thread
+    /// index and heartbeat handle -- confirm in shardedthreadpool_worker()
+    virtual void _process(uint32_t thread_index, ceph::heartbeat_handle_d *hb ) = 0;
+    virtual void return_waiting_threads() = 0;
+    virtual void stop_return_waiting_threads() = 0;
+    virtual bool is_shard_empty(uint32_t thread_index) = 0;
+  };
+
+  /// Typed front end of BaseShardedWQ.  Items are passed by rvalue reference
+  /// and moved into the _enqueue()/_enqueue_front() hooks that the concrete
+  /// queue implements.
+  template <typename T>
+  class ShardedWQ: public BaseShardedWQ {
+  
+    ShardedThreadPool* sharded_pool;
+
+  protected:
+    virtual void _enqueue(T&&) = 0;
+    virtual void _enqueue_front(T&&) = 0;
+
+
+  public:
+    /// Remembers @p tp and installs this queue as the pool's queue via
+    /// set_wq(), replacing whatever pointer the pool held before.
+    ShardedWQ(ceph::timespan ti,
+	      ceph::timespan sti, ShardedThreadPool* tp)
+      : BaseShardedWQ(ti, sti), sharded_pool(tp) {
+      tp->set_wq(this);
+    }
+    ~ShardedWQ() override {}
+
+    /// Move @p item into _enqueue().
+    void queue(T&& item) {
+      _enqueue(std::move(item));
+    }
+    /// Move @p item into _enqueue_front().
+    void queue_front(T&& item) {
+      _enqueue_front(std::move(item));
+    }
+    /// Forward to the owning pool's drain().
+    void drain() {
+      sharded_pool->drain();
+    }
+    
+  };
+
+private:
+
+  /// the one queue served by this pool; set through set_wq()
+  BaseShardedWQ* wq;
+  // threads
+  /// Worker thread that remembers its index and runs the pool's worker loop
+  /// with it.
+  struct WorkThreadSharded : public Thread {
+    ShardedThreadPool *pool;
+    uint32_t thread_index;
+    WorkThreadSharded(ShardedThreadPool *p, uint32_t pthread_index): pool(p),
+      thread_index(pthread_index) {}
+    void *entry() override {
+      pool->shardedthreadpool_worker(thread_index);
+      return 0;
+    }
+  };
+
+  std::vector<WorkThreadSharded*> threads_shardedpool;
+  void start_threads();
+  void shardedthreadpool_worker(uint32_t thread_index);
+  /// Called from the ShardedWQ constructor; simply stores the pointer.
+  void set_wq(BaseShardedWQ* swq) {
+    wq = swq;
+  }
+
+
+
+public:
+
+  ShardedThreadPool(CephContext *cct_, std::string nm, std::string tn, uint32_t pnum_threads);
+
+  // empty destructor: nothing is released here (the trailing ';' is a
+  // harmless empty declaration)
+  ~ShardedThreadPool(){};
+
+  /// start the thread pool's threads
+  void start();
+  /// stop the thread pool's threads
+  void stop();
+  /// pause thread pool (if it is not already paused)
+  void pause();
+  /// pause initiation of new work
+  void pause_new();
+  /// resume work in thread pool.  must match each pause() call 1:1 to resume.
+  void unpause();
+  /// wait for all work to complete
+  void drain();
+
+};
+
+#endif
+
+#endif
diff --git a/src/common/addr_parsing.c b/src/common/addr_parsing.c
new file mode 100644
index 000000000..0e183c667
--- /dev/null
+++ b/src/common/addr_parsing.c
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(__FreeBSD__) || defined(_AIX)
+#include <sys/socket.h>
+#include <netinet/in.h>
+#endif
+#include <netdb.h>
+
+#define BUF_SIZE 128
+#define ROUND_UP_128(x) (-(-(x) & -128))
+
+/*
+ * Copy the NUL-terminated string src into the heap buffer *pstr at byte
+ * offset pos, growing the buffer first when it is too small.
+ *
+ * pstr  in/out: heap buffer; replaced by the reallocated block when grown
+ * plen  in/out: capacity of *pstr in bytes; updated whenever the buffer grows
+ * pos   offset at which src is written (normally the current string length)
+ * src   string to copy, including its terminating NUL
+ *
+ * Returns the offset of the terminating NUL after the copy, i.e. the pos to
+ * pass to the next call.  Prints a message and exits the process when memory
+ * cannot be allocated.
+ */
+int safe_cat(char **pstr, int *plen, int pos, const char *src)
+{
+  size_t len2 = strlen(src);
+  size_t new_size = (size_t)pos + len2 + 1;
+  if ((size_t)*plen < new_size) {
+    /* grow in 128-byte steps so that short appends rarely reallocate */
+    size_t round_up = ROUND_UP_128(new_size);
+    void* p = realloc(*pstr, round_up);
+    if (!p) {
+      printf("Out of memory\n");
+      exit(1);
+    }
+    *pstr = (char *)p;
+    /* record the new capacity; without this every later call that needs
+     * more than the initial size would reallocate again */
+    *plen = (int)round_up;
+  }
+  memcpy(*pstr + pos, src, len2 + 1);
+  return pos + len2;
+}
+
+/*
+ * Resolve every address in orig_str and return them as one comma-separated
+ * list of numeric addresses.
+ *
+ * orig_str holds tokens separated by ',', ';' or ' '.  A token is one of
+ * "host", "host:port", "a.b.c.d:port", "[ipv6addr]" or "[ipv6addr]:port".
+ * Each token is looked up with getaddrinfo() and every result is appended in
+ * numeric form.  IPv6 results are always wrapped in brackets; results of a
+ * token that was written in brackets keep them.  The port is appended only
+ * when the token carried one.
+ *
+ * Returns a malloc()ed string the caller must free() (empty when orig_str
+ * contains no tokens), or NULL when memory runs out or a lookup fails; a
+ * failed lookup is also reported on stdout.
+ */
+char *resolve_addrs(const char *orig_str)
+{
+  int len = BUF_SIZE;
+  char *new_str = (char *)malloc(len);
+
+  if (!new_str) {
+    return NULL;
+  }
+  /* an input without any token must still yield a valid, empty string */
+  new_str[0] = '\0';
+
+  char *saveptr = NULL;
+  char *buf = strdup(orig_str);
+  if (!buf) {
+    free(new_str);
+    return NULL;
+  }
+  const char *delim = ",; ";
+
+  char *tok = strtok_r(buf, delim, &saveptr);
+
+  int pos = 0;
+
+  while (tok) {
+    struct addrinfo hint;
+    struct addrinfo *res = NULL, *ores;
+    char *firstcolon, *lastcolon, *bracecolon;
+    int r;
+    int brackets = 0;
+
+    firstcolon = strchr(tok, ':');
+    lastcolon = strrchr(tok, ':');
+    bracecolon = strstr(tok, "]:");
+
+    char *port_str = 0;
+    if (firstcolon && firstcolon == lastcolon) {
+      /* host:port or a.b.c.d:port */
+      *firstcolon = 0;
+      port_str = firstcolon + 1;
+    } else if (bracecolon) {
+      /* [ipv6addr]:port */
+      port_str = bracecolon + 1;
+      *port_str = 0;
+      port_str++;
+    }
+    /* a trailing ':' with nothing after it means "no port" */
+    if (port_str && !*port_str)
+      port_str = NULL;
+
+    /* strip the brackets for the lookup, but remember the user wrote them */
+    if (*tok == '[' &&
+	tok[strlen(tok)-1] == ']') {
+      tok[strlen(tok)-1] = 0;
+      tok++;
+      brackets = 1;
+    }
+
+    //printf("name '%s' port '%s'\n", tok, port_str);
+
+    // FIPS zeroization audit 20191115: this memset is fine.
+    memset(&hint, 0, sizeof(hint));
+    hint.ai_family = AF_UNSPEC;
+    hint.ai_socktype = SOCK_STREAM;
+    hint.ai_protocol = IPPROTO_TCP;
+
+    r = getaddrinfo(tok, port_str, &hint, &res);
+    /* any non-zero value is a failure: the EAI_* codes are negative on some
+     * systems and positive on others */
+    if (r != 0) {
+      printf("server name not found: %s (%s)\n", tok,
+	     gai_strerror(r));
+      free(new_str);
+      free(buf);
+      return 0;
+    }
+
+    /* build resolved addr list */
+    ores = res;
+    while (res) {
+      /* large enough for a numeric IPv6 address with a scope suffix */
+      char host[128], port[40];
+      /* always surround ipv6 addrs with brackets; other results only when
+       * the token itself was bracketed */
+      int use_brackets = brackets || res->ai_family == AF_INET6;
+
+      r = getnameinfo(res->ai_addr, res->ai_addrlen,
+		      host, sizeof(host),
+		      port, sizeof(port),
+		      NI_NUMERICSERV | NI_NUMERICHOST);
+      if (r != 0) {
+	/* host/port are not valid on failure, so do not append them */
+	printf("server name not found: %s (%s)\n", tok,
+	       gai_strerror(r));
+	freeaddrinfo(ores);
+	free(new_str);
+	free(buf);
+	return 0;
+      }
+      /*printf(" host %s port %s flags %d family %d socktype %d proto %d sanonname %s\n",
+	host, port,
+	res->ai_flags, res->ai_family, res->ai_socktype, res->ai_protocol,
+	res->ai_canonname);*/
+      if (use_brackets)
+	pos = safe_cat(&new_str, &len, pos, "[");
+      pos = safe_cat(&new_str, &len, pos, host);
+      if (use_brackets)
+	pos = safe_cat(&new_str, &len, pos, "]");
+      if (port_str) {
+	pos = safe_cat(&new_str, &len, pos, ":");
+	pos = safe_cat(&new_str, &len, pos, port);
+      }
+      res = res->ai_next;
+      if (res)
+	pos = safe_cat(&new_str, &len, pos, ",");
+    }
+    freeaddrinfo(ores);
+
+    tok = strtok_r(NULL, delim, &saveptr);
+    if (tok)
+      pos = safe_cat(&new_str, &len, pos, ",");
+
+  }
+
+  //printf("new_str is '%s'\n", new_str);
+  free(buf);
+  return new_str;
+}
diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc
new file mode 100644
index 000000000..8a7e0c721
--- /dev/null
+++ b/src/common/admin_socket.cc
@@ -0,0 +1,790 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include <poll.h>
+#include <sys/un.h>
+
+#include "common/admin_socket.h"
+#include "common/admin_socket_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/Thread.h"
+#include "common/version.h"
+#include "common/ceph_mutex.h"
+
+#ifndef WITH_SEASTAR
+#include "common/Cond.h"
+#endif
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_asok
+#undef dout_prefix
+#define dout_prefix *_dout << "asok(" << (void*)m_cct << ") "
+
+using namespace std::literals;
+
+using std::ostringstream;
+using std::string;
+using std::stringstream;
+
+using namespace TOPNSPC::common;
+
+using ceph::bufferlist;
+using ceph::cref_t;
+using ceph::Formatter;
+
+
+/*
+ * UNIX domain sockets created by an application persist even after that
+ * application closes, unless they're explicitly unlinked. This is because the
+ * directory containing the socket keeps a reference to the socket.
+ *
+ * This code makes things a little nicer by unlinking those dead sockets when
+ * the application exits normally.
+ */
+
+/// Call @p f with @p args, repeating the call for as long as it reports
+/// failure (a negative result) with errno set to EINTR.
+/// @return the result of the last call
+template<typename F, typename... Args>
+inline int retry_sys_call(F f, Args... args) {
+  for (;;) {
+    const int r = f(args...);
+    if (r >= 0 || errno != EINTR) {
+      return r;
+    }
+  }
+}
+
+
+/// guards cleanup_files and cleanup_atexit
+static std::mutex cleanup_lock;
+/// paths that are unlinked by remove_cleanup_file()/remove_all_cleanup_files()
+static std::vector<std::string> cleanup_files;
+/// true once remove_all_cleanup_files() has been registered with atexit()
+static bool cleanup_atexit = false;
+
+/// Unlink @p file and drop it from the cleanup list.  Paths that are not on
+/// the list are left alone.
+static void remove_cleanup_file(std::string_view file) {
+  std::unique_lock guard(cleanup_lock);
+
+  const auto pos =
+    std::find(cleanup_files.cbegin(), cleanup_files.cend(), file);
+  if (pos == cleanup_files.cend()) {
+    return;
+  }
+  retry_sys_call(::unlink, pos->c_str());
+  cleanup_files.erase(pos);
+}
+
+/// Unlink every path on the cleanup list and empty the list.  This is the
+/// function add_cleanup_file() registers with atexit().
+void remove_all_cleanup_files() {
+  std::unique_lock guard(cleanup_lock);
+  for (auto it = cleanup_files.cbegin(); it != cleanup_files.cend(); ++it) {
+    retry_sys_call(::unlink, it->c_str());
+  }
+  cleanup_files.clear();
+}
+
+/// Remember @p file for removal at process exit.  The first call also
+/// registers remove_all_cleanup_files() with atexit().
+static void add_cleanup_file(std::string file) {
+  std::unique_lock guard(cleanup_lock);
+  cleanup_files.emplace_back(std::move(file));
+  if (cleanup_atexit) {
+    return;
+  }
+  atexit(remove_all_cleanup_files);
+  cleanup_atexit = true;
+}
+
+/// Only records the context; nothing is opened until init() is called.
+AdminSocket::AdminSocket(CephContext *cct)
+  : m_cct(cct) {
+}
+
+/// Tears the socket down through shutdown(), which returns immediately when
+/// the wakeup pipe was never created.
+AdminSocket::~AdminSocket() {
+  shutdown();
+}
+
+/*
+ * This thread listens on the UNIX domain socket for incoming connections.
+ * It only handles one connection at a time at the moment. All I/O is nonblocking,
+ * so that we can implement sensible timeouts. [TODO: make all I/O nonblocking]
+ *
+ * This thread also listens to m_wakeup_rd_fd. If there is any data sent to this
+ * pipe, the thread wakes up.  If m_shutdown is set, the thread terminates
+ * itself gracefully, allowing the AdminSocketConfigObs class to join() it.
+ */
+
+/// Create the descriptor pair used to wake the listener thread.
+///
+/// @param pipe_rd[out] read end, set only on success
+/// @param pipe_wr[out] write end, set only on success
+/// @return an empty string on success, otherwise a description of the error
+std::string AdminSocket::create_wakeup_pipe(int *pipe_rd, int *pipe_wr)
+{
+  int pipefd[2];
+  // Exactly one of the two "if" lines below is compiled; both open the same
+  // error-handling block.
+  #ifdef _WIN32
+  if (win_socketpair(pipefd) < 0) {
+  #else
+  if (pipe_cloexec(pipefd, O_NONBLOCK) < 0) {
+  #endif
+    int e = ceph_sock_errno();
+    ostringstream oss;
+    oss << "AdminSocket::create_wakeup_pipe error: " << cpp_strerror(e);
+    return oss.str();
+  }
+  
+  *pipe_rd = pipefd[0];
+  *pipe_wr = pipefd[1];
+  return "";
+}
+
+/// Wake the listener thread one last time, wait for it to exit and close
+/// both ends of the wakeup pipe.
+///
+/// @return an empty string on success, otherwise a description of the error.
+///         When the wakeup byte cannot be sent, the thread is not joined and
+///         the read end is left open.
+std::string AdminSocket::destroy_wakeup_pipe()
+{
+  // Send a byte to the wakeup pipe that the thread is listening to
+  char buf[1] = { 0x0 };
+  int ret = safe_send(m_wakeup_wr_fd, buf, sizeof(buf));
+
+  // Close write end
+  retry_sys_call(::compat_closesocket, m_wakeup_wr_fd);
+  m_wakeup_wr_fd = -1;
+
+  if (ret != 0) {
+    // The thread was not woken up, so joining it here could block forever.
+    ostringstream oss;
+    oss << "AdminSocket::destroy_wakeup_pipe error: failed to write "
+      "to thread shutdown pipe: error " << ret;
+    return oss.str();
+  }
+
+  th.join();
+
+  // Close read end. Doing this before join() blocks the listener and prevents
+  // joining.
+  retry_sys_call(::compat_closesocket, m_wakeup_rd_fd);
+  m_wakeup_rd_fd = -1;
+
+  return "";
+}
+
+/// Create a UNIX domain stream socket, bind it to @p sock_path and start
+/// listening on it.
+///
+/// When the path is already in use, the existing socket is pinged: a live
+/// peer makes the call fail with EEXIST, a dead one is treated as a stale
+/// file that is unlinked before the bind is retried once.
+///
+/// @param sock_path path of the socket file; must fit into sun_path
+/// @param fd[out]   listening descriptor, set only on success
+/// @return an empty string on success, otherwise a description of the error
+std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd)
+{
+  ldout(m_cct, 5) << "bind_and_listen " << sock_path << dendl;
+
+  struct sockaddr_un address;
+  if (sock_path.size() > sizeof(address.sun_path) - 1) {
+    ostringstream oss;
+    oss << "AdminSocket::bind_and_listen: "
+	<< "The UNIX domain socket path " << sock_path << " is too long! The "
+	<< "maximum length on this system is "
+	<< (sizeof(address.sun_path) - 1);
+    return oss.str();
+  }
+  int sock_fd = socket_cloexec(PF_UNIX, SOCK_STREAM, 0);
+  if (sock_fd < 0) {
+    int err = ceph_sock_errno();
+    ostringstream oss;
+    oss << "AdminSocket::bind_and_listen: "
+	<< "failed to create socket: " << cpp_strerror(err);
+    return oss.str();
+  }
+  // FIPS zeroization audit 20191115: this memset is fine.
+  memset(&address, 0, sizeof(struct sockaddr_un));
+  address.sun_family = AF_UNIX;
+  snprintf(address.sun_path, sizeof(address.sun_path),
+	   "%s", sock_path.c_str());
+  if (::bind(sock_fd, (struct sockaddr*)&address,
+	   sizeof(struct sockaddr_un)) != 0) {
+    int err = ceph_sock_errno();
+    if (err == EADDRINUSE) {
+      AdminSocketClient client(sock_path);
+      bool ok = false;
+      client.ping(&ok);
+      if (ok) {
+	ldout(m_cct, 20) << "socket " << sock_path << " is in use" << dendl;
+	err = EEXIST;
+      } else {
+	// nobody answers on the old socket: remove the file and bind again
+	ldout(m_cct, 20) << "unlink stale file " << sock_path << dendl;
+	retry_sys_call(::unlink, sock_path.c_str());
+	if (::bind(sock_fd, (struct sockaddr*)&address,
+		 sizeof(struct sockaddr_un)) == 0) {
+	  err = 0;
+	} else {
+	  err = ceph_sock_errno();
+	}
+      }
+    }
+    if (err != 0) {
+      ostringstream oss;
+      oss << "AdminSocket::bind_and_listen: "
+	  << "failed to bind the UNIX domain socket to '" << sock_path
+	  << "': " << cpp_strerror(err);
+      // sock_fd is a socket: close it the same way the rest of this file
+      // closes sockets
+      ::compat_closesocket(sock_fd);
+      return oss.str();
+    }
+  }
+  if (listen(sock_fd, 5) != 0) {
+    int err = ceph_sock_errno();
+    ostringstream oss;
+    oss << "AdminSocket::bind_and_listen: "
+	  << "failed to listen to socket: " << cpp_strerror(err);
+    ::compat_closesocket(sock_fd);
+    // the bind succeeded, so the socket file exists and must be removed
+    retry_sys_call(::unlink, sock_path.c_str());
+    return oss.str();
+  }
+  *fd = sock_fd;
+  return "";
+}
+
+/// Body of the admin socket thread.
+///
+/// Polls the listening socket and the wakeup pipe.  A readable listening
+/// socket is served by do_accept(); a readable wakeup pipe has one byte
+/// drained and triggers do_tell_queue().  The loop ends when m_shutdown is
+/// set or when poll() fails with anything other than EINTR.
+void AdminSocket::entry() noexcept
+{
+  ldout(m_cct, 5) << "entry start" << dendl;
+  while (true) {
+    struct pollfd fds[2];
+    // FIPS zeroization audit 20191115: this memset is fine.
+    memset(fds, 0, sizeof(fds));
+    fds[0].fd = m_sock_fd;
+    fds[0].events = POLLIN | POLLRDBAND;
+    fds[1].fd = m_wakeup_rd_fd;
+    fds[1].events = POLLIN | POLLRDBAND;
+
+    ldout(m_cct,20) << __func__ << " waiting" << dendl;
+    int ret = poll(fds, 2, -1);
+    if (ret < 0) {
+      int err = ceph_sock_errno();
+      if (err == EINTR) {
+        continue;
+      }
+      lderr(m_cct) << "AdminSocket: poll(2) error: '"
+                   << cpp_strerror(err) << dendl;
+      // leave through the common exit path so that the exit is logged
+      break;
+    }
+    ldout(m_cct,20) << __func__ << " awake" << dendl;
+
+    if (fds[0].revents & POLLIN) {
+      // a client connected: read its request and send the reply
+      do_accept();
+    }
+    if (fds[1].revents & POLLIN) {
+      // read off one byte
+      char buf;
+      auto s = safe_recv(m_wakeup_rd_fd, &buf, 1);
+      // safe_recv() reports failure as a negative error code (do_accept()
+      // relies on the same convention), so test for any negative value
+      if (s < 0) {
+        ldout(m_cct, 5) << "AdminSocket: (ignoring) read(2) error: '"
+                        << cpp_strerror(static_cast<int>(s)) << dendl;
+      }
+      do_tell_queue();
+    }
+    if (m_shutdown) {
+      // Parent wants us to shut down
+      break;
+    }
+  }
+  ldout(m_cct, 5) << "entry exit" << dendl;
+}
+
+/// Change the owner of the socket file to @p uid / @p gid.  Does nothing
+/// when no socket is open; a failure is logged, not returned.
+void AdminSocket::chown(uid_t uid, gid_t gid)
+{
+  if (m_sock_fd < 0) {
+    return;
+  }
+  if (::chown(m_path.c_str(), uid, gid) < 0) {
+    const int r = -errno;
+    lderr(m_cct) << "AdminSocket: failed to chown socket: "
+                 << cpp_strerror(r) << dendl;
+  }
+}
+
+/// Change the permission bits of the socket file to @p mode.  Does nothing
+/// when no socket is open; a failure is logged, not returned.
+void AdminSocket::chmod(mode_t mode)
+{
+  if (m_sock_fd < 0) {
+    return;
+  }
+  if (::chmod(m_path.c_str(), mode) < 0) {
+    const int r = -errno;
+    lderr(m_cct) << "AdminSocket: failed to chmod socket: "
+                 << cpp_strerror(r) << dendl;
+  }
+}
+
+/// Serve one client connection on the admin socket.
+///
+/// Accepts the connection, reads one request byte by byte, executes it
+/// through execute_command() and writes back a 4-byte big-endian length
+/// followed by the payload.  Two request encodings are understood:
+///  - old protocol: four bytes starting with '\0'; the fourth byte selects
+///    one of a fixed set of commands
+///  - new protocol: a command string terminated by '\n' or '\0'
+/// Requests that do not fit into the 1024-byte buffer are rejected.  The
+/// connection is closed on every path.
+void AdminSocket::do_accept()
+{
+  struct sockaddr_un address;
+  socklen_t address_length = sizeof(address);
+  ldout(m_cct, 30) << "AdminSocket: calling accept" << dendl;
+  int connection_fd = accept_cloexec(m_sock_fd, (struct sockaddr*) &address,
+                                     &address_length);
+  if (connection_fd < 0) {
+    int err = ceph_sock_errno();
+    lderr(m_cct) << "AdminSocket: do_accept error: '"
+                 << cpp_strerror(err) << dendl;
+    return;
+  }
+  ldout(m_cct, 30) << "AdminSocket: finished accept" << dendl;
+
+  char cmd[1024];
+  unsigned pos = 0;
+  string c;
+  while (1) {
+    int ret = safe_recv(connection_fd, &cmd[pos], 1);
+    if (ret <= 0) {
+      // 0: peer closed before a full request arrived; <0: negative errno
+      if (ret < 0) {
+        lderr(m_cct) << "AdminSocket: error reading request code: "
+                     << cpp_strerror(ret) << dendl;
+      }
+      retry_sys_call(::compat_closesocket, connection_fd);
+      return;
+    }
+    if (cmd[0] == '\0') {
+      // old protocol: __be32
+      if (pos == 3) {
+        switch (cmd[3]) {
+        case 0:
+          c = "0";
+          break;
+        case 1:
+          c = "perfcounters_dump";
+          break;
+        case 2:
+          c = "perfcounters_schema";
+          break;
+        default:
+          c = "foo";
+          break;
+        }
+        //wrap command with new protocol
+        c = "{\"prefix\": \"" + c + "\"}";
+        break;
+      }
+    } else {
+      // new protocol: null or \n terminated string
+      if (cmd[pos] == '\n' || cmd[pos] == '\0') {
+        cmd[pos] = '\0';
+        c = cmd;
+        break;
+      }
+    }
+    if (++pos >= sizeof(cmd)) {
+      lderr(m_cct) << "AdminSocket: error reading request too long" << dendl;
+      retry_sys_call(::compat_closesocket, connection_fd);
+      return;
+    }
+  }
+
+  std::vector<std::string> cmdvec = { c };
+  bufferlist empty, out;
+  ostringstream err;
+  int rval = execute_command(cmdvec, empty /* inbl */, err, &out);
+
+  // Unfortunately, the asok wire protocol does not let us pass an error code,
+  // and many asok command implementations return helpful error strings.  So,
+  // let's prepend an error string to the output if there is an error code.
+  if (rval < 0) {
+    ostringstream ss;
+    ss << "ERROR: " << cpp_strerror(rval) << "\n";
+    ss << err.str() << "\n";
+    bufferlist o;
+    o.append(ss.str());
+    o.claim_append(out);
+    out.claim_append(o);
+  }
+  uint32_t len = htonl(out.length());
+  int ret = safe_send(connection_fd, &len, sizeof(len));
+  if (ret < 0) {
+    lderr(m_cct) << "AdminSocket: error writing response length "
+                 << cpp_strerror(ret) << dendl;
+  } else {
+    int r = out.send_fd(connection_fd);
+    if (r < 0) {
+      // report the payload error itself; ret only covers the length send
+      lderr(m_cct) << "AdminSocket: error writing response payload "
+                   << cpp_strerror(r) << dendl;
+    }
+  }
+  retry_sys_call(::compat_closesocket, connection_fd);
+}
+
+/// Execute every command that queue_tell_command() has queued so far.
+///
+/// Both queues are swapped out under tell_lock and then processed without
+/// the lock.  For each message the command is run through the asynchronous
+/// execute_command(); its completion builds the reply (MCommandReply for
+/// MCommand, MMonCommandAck for MMonCommand), copies the transaction id and
+/// output into it and sends it back on the message's connection.
+void AdminSocket::do_tell_queue()
+{
+  ldout(m_cct,10) << __func__ << dendl;
+  std::list<cref_t<MCommand>> q;
+  std::list<cref_t<MMonCommand>> lq;
+  {
+    std::lock_guard l(tell_lock);
+    q.swap(tell_queue);
+    lq.swap(tell_legacy_queue);
+  }
+  for (auto& m : q) {
+    execute_command(
+      m->cmd,
+      m->get_data(),
+      // m is captured by value so the message stays alive until the reply
+      // has been built
+      [m](int r, const std::string& err, bufferlist& outbl) {
+	auto reply = new MCommandReply(r, err);
+	reply->set_tid(m->get_tid());
+	reply->set_data(outbl);
+#ifdef WITH_SEASTAR
+        // TODO: crimson: handle asok command from alien thread
+#else
+	m->get_connection()->send_message(reply);
+#endif
+      });
+  }
+  for (auto& m : lq) {
+    execute_command(
+      m->cmd,
+      m->get_data(),
+      [m](int r, const std::string& err, bufferlist& outbl) {
+	auto reply = new MMonCommandAck(m->cmd, r, err, 0);
+	reply->set_tid(m->get_tid());
+	reply->set_data(outbl);
+#ifdef WITH_SEASTAR
+        // TODO: crimson: handle asok command from alien thread
+#else
+	m->get_connection()->send_message(reply);
+#endif
+      });
+  }
+}
+
+/// Blocking variant of execute_command(): runs the asynchronous overload and
+/// waits until its completion callback has been invoked.
+///
+/// @param cmd        command to run, passed on unchanged
+/// @param inbl       input data, passed on unchanged
+/// @param errss[out] receives the error string reported by the command
+/// @param outbl[out] receives the command output
+/// @return the result code reported by the command; -ENOSYS when built
+///         WITH_SEASTAR
+int AdminSocket::execute_command(
+  const std::vector<std::string>& cmd,
+  const bufferlist& inbl,
+  std::ostream& errss,
+  bufferlist *outbl)
+{
+#ifdef WITH_SEASTAR
+   // TODO: crimson: blocking execute_command() in alien thread
+  return -ENOSYS;
+#else
+  bool done = false;
+  int rval = 0;
+  ceph::mutex mylock = ceph::make_mutex("admin_socket::excute_command::mylock");
+  ceph::condition_variable mycond;
+  // NOTE(review): C_SafeCond is declared elsewhere; presumably finish(r)
+  // stores r in rval, sets done and signals mycond under mylock -- confirm
+  C_SafeCond fin(mylock, mycond, &done, &rval);
+  execute_command(
+    cmd,
+    inbl,
+    // the captured references stay valid because this function does not
+    // return before the wait below has seen done == true
+    [&errss, outbl, &fin](int r, const std::string& err, bufferlist& out) {
+      errss << err;
+      *outbl = std::move(out);
+      fin.finish(r);
+    });
+  {
+    std::unique_lock l{mylock};
+    mycond.wait(l, [&done] { return done;});
+  }
+  return rval;
+#endif
+}
+
+/// Asynchronous variant of execute_command(): parse the command, find the
+/// hook registered for it and hand the work to the hook's call_async().
+///
+/// @param cmdvec    JSON command; its "prefix" selects the hook and its
+///                  optional "format" selects the output formatter
+/// @param inbl      input data passed to the hook
+/// @param on_finish called exactly once on each path visible here with the
+///                  result code, an error string and the output.  Parse and
+///                  lookup failures are reported through it as -EINVAL.
+void AdminSocket::execute_command(
+  const std::vector<std::string>& cmdvec,
+  const bufferlist& inbl,
+  std::function<void(int,const std::string&,bufferlist&)> on_finish)
+{
+  cmdmap_t cmdmap;
+  string format;
+  stringstream errss;
+  bufferlist empty;
+  ldout(m_cct,10) << __func__ << " cmdvec='" << cmdvec << "'" << dendl;
+  if (!cmdmap_from_json(cmdvec, &cmdmap, errss)) {
+    ldout(m_cct, 0) << "AdminSocket: " << errss.str() << dendl;
+    return on_finish(-EINVAL, "invalid json", empty);
+  }
+  string prefix;
+  try {
+    cmd_getval(cmdmap, "format", format);
+    cmd_getval(cmdmap, "prefix", prefix);
+  } catch (const bad_cmd_get& e) {
+    return on_finish(-EINVAL, "invalid json, missing format and/or prefix",
+		     empty);
+  }
+
+  // raw owning pointer: every path below must delete f exactly once
+  auto f = Formatter::create(format, "json-pretty", "json-pretty");
+
+  // on success find_matched_hook() has set in_hook, which is cleared again
+  // at the end of this function
+  auto [retval, hook] = find_matched_hook(prefix, cmdmap);
+  switch (retval) {
+  case ENOENT:
+    lderr(m_cct) << "AdminSocket: request '" << cmdvec
+		 << "' not defined" << dendl;
+    delete f;
+    return on_finish(-EINVAL, "unknown command prefix "s + prefix, empty);
+  case EINVAL:
+    delete f;
+    return on_finish(-EINVAL, "invalid command json", empty);
+  default:
+    assert(retval == 0);
+  }
+
+  hook->call_async(
+    prefix, cmdmap, f, inbl,
+    // the completion owns f from here on and deletes it before reporting
+    [f, on_finish](int r, const std::string& err, bufferlist& out) {
+      // handle either existing output in bufferlist *or* via formatter
+      if (r >= 0 && out.length() == 0) {
+	f->flush(out);
+      }
+      delete f;
+      on_finish(r, err, out);
+    });
+
+  // in_hook is cleared as soon as call_async() returns, which lets a waiting
+  // unregister_commands() proceed.
+  // NOTE(review): if a hook completes after call_async() has returned, it is
+  // no longer covered by in_hook -- confirm that this is intended
+  std::unique_lock l(lock);
+  in_hook = false;
+  in_hook_cond.notify_all();
+}
+
+/// Look up the hook that should handle a command.
+///
+/// Among the commands registered under @p prefix, the first one whose
+/// description validates against @p cmdmap wins.
+///
+/// @return {0, hook} on success, with in_hook set so that
+///         unregister_commands() waits for the caller;
+///         {ENOENT, nullptr} when nothing is registered under @p prefix;
+///         {EINVAL, nullptr} when no registered description validates
+std::pair<int, AdminSocketHook*>
+AdminSocket::find_matched_hook(std::string& prefix,
+                               const cmdmap_t& cmdmap)
+{
+  // The lock is held only for the lookup and dropped on return, to avoid
+  // cycles in cases where the hook takes the same lock that was held during
+  // calls to register/unregister.
+  std::unique_lock l(lock);
+  const auto candidates = hooks.equal_range(prefix);
+  if (candidates.first == candidates.second) {
+    return {ENOENT, nullptr};
+  }
+  // make sure one of the registered commands with this prefix validates.
+  stringstream errss;
+  for (auto it = candidates.first; it != candidates.second; ++it) {
+    if (!validate_cmd(it->second.desc, cmdmap, errss)) {
+      continue;
+    }
+    in_hook = true;
+    return {0, it->second.hook};
+  }
+  return {EINVAL, nullptr};
+}
+
+/// Queue an MCommand for the admin socket thread and wake that thread up.
+/// The wakeup byte is sent while tell_lock is still held.
+void AdminSocket::queue_tell_command(cref_t<MCommand> m)
+{
+  ldout(m_cct,10) << __func__ << " " << *m << dendl;
+  std::lock_guard guard(tell_lock);
+  tell_queue.emplace_back(std::move(m));
+  wakeup();
+}
+/// Queue an MMonCommand on the legacy queue for the admin socket thread and
+/// wake that thread up.  The wakeup byte is sent while tell_lock is still
+/// held.
+void AdminSocket::queue_tell_command(cref_t<MMonCommand> m)
+{
+  ldout(m_cct,10) << __func__ << " " << *m << dendl;
+  std::lock_guard guard(tell_lock);
+  tell_legacy_queue.emplace_back(std::move(m));
+  wakeup();
+}
+
+/// Register @p hook as the handler of the command described by @p cmddesc.
+///
+/// Several commands may share a prefix as long as their descriptions differ.
+/// All existing entries with the same prefix are compared, so an exact
+/// duplicate is rejected no matter where it sits among them.
+///
+/// @param cmddesc command description; its prefix becomes the lookup key
+/// @param hook    handler to call for this command
+/// @param help    help text shown by the "help" command
+/// @return 0 on success, -EEXIST when the same description is already
+///         registered
+int AdminSocket::register_command(std::string_view cmddesc,
+				  AdminSocketHook *hook,
+				  std::string_view help)
+{
+  std::unique_lock l(lock);
+  string prefix = cmddesc_get_prefix(cmddesc);
+  auto [same_prefix_begin, same_prefix_end] = hooks.equal_range(prefix);
+  for (auto i = same_prefix_begin; i != same_prefix_end; ++i) {
+    if (i->second.desc == cmddesc) {
+      ldout(m_cct, 5) << "register_command " << prefix
+		      << " cmddesc " << cmddesc << " hook " << hook
+		      << " EEXIST" << dendl;
+      return -EEXIST;
+    }
+  }
+  ldout(m_cct, 5) << "register_command " << prefix << " hook " << hook
+		  << dendl;
+  // hint at the first entry with this prefix (or at the insertion point when
+  // there is none)
+  hooks.emplace_hint(same_prefix_begin,
+		     std::piecewise_construct,
+		     std::forward_as_tuple(prefix),
+		     std::forward_as_tuple(hook, cmddesc, help));
+  return 0;
+}
+
+/// Remove every command that was registered with @p hook.
+///
+/// Before an entry is erased, the call waits until no command is being
+/// dispatched (in_hook is false), in case the running command references
+/// the hook being removed.  The wait releases the lock while blocked.
+void AdminSocket::unregister_commands(const AdminSocketHook *hook)
+{
+  std::unique_lock l(lock);
+  for (auto i = hooks.begin(); i != hooks.end(); ) {
+    if (i->second.hook != hook) {
+      ++i;
+      continue;
+    }
+    ldout(m_cct, 5) << __func__ << " " << i->first << dendl;
+
+    // If we are currently processing a command, wait for it to
+    // complete in case it referenced the hook that we are
+    // unregistering.
+    in_hook_cond.wait(l, [this]() { return !in_hook; });
+    i = hooks.erase(i);
+  }
+}
+
+/// Hook behind the "0", "version" and "git_version" commands.
+///
+/// "0" appends the admin socket protocol version directly to the output.
+/// The other commands write a "version" object through the formatter;
+/// the object stays empty for a command name other than "version" or
+/// "git_version".  Always returns 0.
+class VersionHook : public AdminSocketHook {
+public:
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+	   const bufferlist&,
+	   Formatter *f,
+	   std::ostream& errss,
+	   bufferlist& out) override {
+    if (command == "0"sv) {
+      out.append(CEPH_ADMIN_SOCK_VERSION);
+    } else {
+      f->open_object_section("version");
+      if (command == "version") {
+	f->dump_string("version", ceph_version_to_str());
+	f->dump_string("release", ceph_release_to_str());
+	f->dump_string("release_type", ceph_release_type());
+      } else if (command == "git_version") {
+	f->dump_string("git_version", git_version_to_str());
+      }
+      f->close_section();
+    }
+    return 0;
+  }
+};
+
+/// Hook behind the "help" command: writes a "help" object that maps every
+/// registered command prefix with a non-empty help text to that text.
+/// Always returns 0.
+class HelpHook : public AdminSocketHook {
+  AdminSocket *m_as;   ///< socket whose registered hooks are listed
+public:
+  explicit HelpHook(AdminSocket *as) : m_as(as) {}
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+           const bufferlist&,
+           Formatter *f,
+           std::ostream& errss,
+           bufferlist& out) override {
+    f->open_object_section("help");
+    for (const auto& entry : m_as->hooks) {
+      const auto& info = entry.second;
+      if (info.help.length() == 0) {
+        continue;
+      }
+      f->dump_string(entry.first.c_str(), info.help);
+    }
+    f->close_section();
+    return 0;
+  }
+};
+
+/// Hook behind "get_command_descriptions": writes one section per registered
+/// command, named "cmd000", "cmd001", ... in iteration order, with the
+/// command's description and help text.  Always returns 0.
+class GetdescsHook : public AdminSocketHook {
+  AdminSocket *m_as;   ///< socket whose registered hooks are listed
+public:
+  explicit GetdescsHook(AdminSocket *as) : m_as(as) {}
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+           const bufferlist&,
+           Formatter *f,
+           std::ostream& errss,
+           bufferlist& out) override {
+    f->open_object_section("command_descriptions");
+    int cmdnum = 0;
+    for (const auto& entry : m_as->hooks) {
+      // only the registration data is needed; the prefix key is not used
+      const auto& info = entry.second;
+      ostringstream secname;
+      secname << "cmd" << std::setfill('0') << std::setw(3) << cmdnum;
+      dump_cmd_and_help_to_json(f,
+                                CEPH_FEATURES_ALL,
+                                secname.str().c_str(),
+                                info.desc,
+                                info.help);
+      ++cmdnum;
+    }
+    f->close_section(); // command_descriptions
+    return 0;
+  }
+};
+
+/// Open the admin socket at @p path and start the thread that serves it.
+///
+/// Creates the wakeup pipe, binds and listens on the socket, registers the
+/// built-in commands ("0", "version", "git_version", "help",
+/// "get_command_descriptions"), starts the "admin_socket" thread and
+/// schedules the socket file for removal at exit.
+///
+/// @return true on success; false when the pipe or the socket could not be
+///         set up (or, on Windows, when the OS is too old for UNIX sockets).
+///         Nothing is left open on failure.
+bool AdminSocket::init(const std::string& path)
+{
+  ldout(m_cct, 5) << "init " << path << dendl;
+
+  #ifdef _WIN32
+  OSVERSIONINFOEXW ver = {0};
+  ver.dwOSVersionInfoSize = sizeof(ver);
+  get_windows_version(&ver);
+
+  if (std::tie(ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber) <
+      std::make_tuple(10, 0, 17063)) {
+    ldout(m_cct, 5) << "Unix sockets require Windows 10.0.17063 or later. "
+                    << "The admin socket will not be available." << dendl;
+    return false;
+  }
+  #endif
+
+  /* Set up things for the new thread */
+  std::string err;
+  int pipe_rd = -1, pipe_wr = -1;
+  err = create_wakeup_pipe(&pipe_rd, &pipe_wr);
+  if (!err.empty()) {
+    lderr(m_cct) << "AdminSocketConfigObs::init: error: " << err << dendl;
+    return false;
+  }
+  int sock_fd;
+  err = bind_and_listen(path, &sock_fd);
+  if (!err.empty()) {
+    lderr(m_cct) << "AdminSocketConfigObs::init: failed: " << err << dendl;
+    // close the wakeup descriptors the same way destroy_wakeup_pipe() does
+    ::compat_closesocket(pipe_rd);
+    ::compat_closesocket(pipe_wr);
+    return false;
+  }
+
+  /* Create new thread */
+  m_sock_fd = sock_fd;
+  m_wakeup_rd_fd = pipe_rd;
+  m_wakeup_wr_fd = pipe_wr;
+  m_path = path;
+
+  version_hook = std::make_unique<VersionHook>();
+  register_command("0", version_hook.get(), "");
+  register_command("version", version_hook.get(), "get ceph version");
+  register_command("git_version", version_hook.get(),
+		   "get git sha1");
+  help_hook = std::make_unique<HelpHook>(this);
+  register_command("help", help_hook.get(),
+		   "list available commands");
+  getdescs_hook = std::make_unique<GetdescsHook>(this);
+  register_command("get_command_descriptions",
+		   getdescs_hook.get(), "list available commands");
+
+  th = make_named_thread("admin_socket", &AdminSocket::entry, this);
+  add_cleanup_file(m_path.c_str());
+  return true;
+}
+
+/// Stop the admin socket thread and release everything init() set up.
+///
+/// Returns immediately when the wakeup pipe does not exist.  Otherwise it
+/// sets m_shutdown, wakes and joins the thread through
+/// destroy_wakeup_pipe(), closes the listening socket, unregisters the
+/// built-in hooks and removes the socket file.
+void AdminSocket::shutdown()
+{
+  // Under normal operation this is unlikely to occur.  However for some unit
+  // tests, some object members are not initialized and so cannot be deleted
+  // without fault.
+  if (m_wakeup_wr_fd < 0)
+    return;
+
+  ldout(m_cct, 5) << "shutdown" << dendl;
+  m_shutdown = true;
+
+  auto err = destroy_wakeup_pipe();
+  if (!err.empty()) {
+    lderr(m_cct) << "AdminSocket::shutdown: error: " << err << dendl;
+  }
+
+  retry_sys_call(::compat_closesocket, m_sock_fd);
+  // forget the closed descriptor: chown()/chmod() use m_sock_fd >= 0 as
+  // their "socket is open" test
+  m_sock_fd = -1;
+
+  unregister_commands(version_hook.get());
+  version_hook.reset();
+
+  unregister_commands(help_hook.get());
+  help_hook.reset();
+
+  unregister_commands(getdescs_hook.get());
+  getdescs_hook.reset();
+
+  remove_cleanup_file(m_path);
+  m_path.clear();
+}
+
+/// Wake the admin socket thread by writing a single zero byte to the wakeup
+/// pipe.  A failed write is deliberately ignored.
+void AdminSocket::wakeup()
+{
+  const char wake_byte = 0x0;
+  int r = safe_send(m_wakeup_wr_fd, &wake_byte, sizeof(wake_byte));
+  (void)r;
+}
diff --git a/src/common/admin_socket.h b/src/common/admin_socket.h
new file mode 100644
index 000000000..3f364a5b7
--- /dev/null
+++ b/src/common/admin_socket.h
@@ -0,0 +1,221 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ADMIN_SOCKET_H
+#define CEPH_COMMON_ADMIN_SOCKET_H
+
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+#include "crimson/admin/admin_socket.h"
+#else
+
+#include <condition_variable>
+#include <mutex>
+#include <string>
+#include <string_view>
+#include <thread>
+
+#include "include/buffer.h"
+#include "include/common_fwd.h"
+#include "common/ref.h"
+#include "common/cmdparse.h"
+
+class MCommand;
+class MMonCommand;
+
+inline constexpr auto CEPH_ADMIN_SOCK_VERSION = std::string_view("2");
+
+class AdminSocketHook {
+public:
+  /**
+   * @brief Handler for admin socket commands, synchronous version
+   *
+   * Executes action associated with admin command and returns byte-stream output @c out.
+   * There is no restriction on output. Each handler defines output semantics.
+   * Typically output is textual representation of some ceph's internals.
+   * Handler should use provided formatter @c f if structured output is being produced.
+   *
+   * @param[in] command String matching constant part of cmddesc in @ref AdminSocket::register_command
+   * @param[in] cmdmap  Parameters extracted from argument part of cmddesc in @ref AdminSocket::register_command
+   * @param[in] inbl    Input content for handler
+   * @param[in] f       Formatter created according to requestor preference, used by `ceph --format`
+   * @param[out] errss  Error stream, should contain details in case of execution failure
+   * @param[out] out    Produced output
+   *
+   * @retval 0 Success, errss is ignored and does not contribute to output
+   * @retval <0 Error code, errss is prepended to @c out
+   *
+   * @note If @c out is empty, then admin socket will try to flush @c f to out.
+   */
+  virtual int call(
+    std::string_view command,
+    const cmdmap_t& cmdmap,
+    const ceph::buffer::list& inbl,
+    ceph::Formatter *f,
+    std::ostream& errss,
+    ceph::buffer::list& out) = 0;
+
+  /**
+   * @brief Handler for admin socket commands, asynchronous version
+   *
+   * Executes action associated with admin command and prepares byte-stream response.
+   * When processing is done @c on_finish must be called.
+   * The default implementation runs @ref call synchronously and then invokes @c on_finish.
+   * There is no restriction on output. Each handler defines own output semantics.
+   * Typically output is textual representation of some ceph's internals.
+   * Input @c inbl can be passed, see ceph --in-file.
+   * Handler should use provided formatter @c f if structured output is being produced.
+   * on_finish handler has following parameters:
+   * - result code of handler (same as @ref AdminSocketHook::call)
+   * - error message, text
+   * - output
+   *
+   * @param[in] command String matching constant part of cmddesc in @ref AdminSocket::register_command
+   * @param[in] cmdmap  Parameters extracted from argument part of cmddesc in @ref AdminSocket::register_command
+   * @param[in] f       Formatter created according to requestor preference, used by `ceph --format`
+   * @param[in] inbl    Input content for handler
+   * @param[in] on_finish Function to call when processing is done
+   *
+   * @note If the output given to @c on_finish is empty, then admin socket will try to flush @c f to it.
+   */
+  virtual void call_async(
+    std::string_view command,
+    const cmdmap_t& cmdmap,
+    ceph::Formatter *f,
+    const ceph::buffer::list& inbl,
+    std::function<void(int,const std::string&,ceph::buffer::list&)> on_finish) {
+    // by default, call the synchronous handler and then finish
+    ceph::buffer::list out;
+    std::ostringstream errss;
+    int r = call(command, cmdmap, inbl, f, errss, out);
+    on_finish(r, errss.str(), out);  // error text is whatever call() wrote to errss
+  }
+  virtual ~AdminSocketHook() {}
+};
+
+class AdminSocket
+{
+public:
+  AdminSocket(CephContext *cct);
+  ~AdminSocket();
+
+  AdminSocket(const AdminSocket&) = delete;
+  AdminSocket& operator =(const AdminSocket&) = delete;
+  AdminSocket(AdminSocket&&) = delete;
+  AdminSocket& operator =(AdminSocket&&) = delete;
+
+  /**
+   * register an admin socket command
+   *
+   * The command is registered under a command string.  Incoming
+   * commands are split by space and matched against the longest
+   * registered command.  For example, if 'foo' and 'foo bar' are
+   * registered, and an incoming command is 'foo bar baz', it is
+   * matched with 'foo bar', while 'foo fud' will match 'foo'.
+   *
+   * The entire incoming command string is passed to the registered
+   * hook.
+   *
+   * @param cmddesc command syntax descriptor; its constant part is the
+   *                command string that incoming commands are matched on
+   * @param hook implementation
+   * @param help help text.  if empty, command will not be included in 'help' output.
+   *
+   * @return 0 for success, -EEXIST if command already registered.
+   */
+  int register_command(std::string_view cmddesc,
+		       AdminSocketHook *hook,
+		       std::string_view help);
+
+  /*
+   * unregister all commands belonging to hook.
+   */
+  void unregister_commands(const AdminSocketHook *hook);
+
+  bool init(const std::string& path);
+
+  void chown(uid_t uid, gid_t gid);
+  void chmod(mode_t mode);
+
+  /// execute (async): the result is delivered through on_fin
+  void execute_command(
+    const std::vector<std::string>& cmd,
+    const ceph::buffer::list& inbl,
+    std::function<void(int,const std::string&,ceph::buffer::list&)> on_fin);
+
+  /// execute (blocking): the result is returned, with output in errss/outbl
+  int execute_command(
+    const std::vector<std::string>& cmd,
+    const ceph::buffer::list& inbl,
+    std::ostream& errss,
+    ceph::buffer::list *outbl);
+
+  void queue_tell_command(ceph::cref_t<MCommand> m);
+  void queue_tell_command(ceph::cref_t<MMonCommand> m); // for compat
+
+private:
+
+  void shutdown();
+  void wakeup();
+
+  std::string create_wakeup_pipe(int *pipe_rd, int *pipe_wr);
+  std::string destroy_wakeup_pipe();
+  std::string bind_and_listen(const std::string &sock_path, int *fd);
+
+  std::thread th;  // runs entry(); started by init()
+  void entry() noexcept;
+  void do_accept();
+  void do_tell_queue();
+
+  CephContext *m_cct;
+  std::string m_path;  // socket path given to init(); cleared by shutdown()
+  int m_sock_fd = -1;  // the three descriptors stay -1 until init() sets them
+  int m_wakeup_rd_fd = -1;
+  int m_wakeup_wr_fd = -1;
+  bool m_shutdown = false;  // set by shutdown()
+
+  bool in_hook = false;
+  std::condition_variable in_hook_cond;
+  std::mutex lock;  // protects `hooks`
+  std::unique_ptr<AdminSocketHook> version_hook;  // built-in hooks, created by init()
+  std::unique_ptr<AdminSocketHook> help_hook;
+  std::unique_ptr<AdminSocketHook> getdescs_hook;
+
+  std::mutex tell_lock;
+  std::list<ceph::cref_t<MCommand>> tell_queue;
+  std::list<ceph::cref_t<MMonCommand>> tell_legacy_queue;
+
+  struct hook_info {
+    AdminSocketHook* hook;
+    std::string desc;
+    std::string help;
+
+    hook_info(AdminSocketHook* hook, std::string_view desc,
+	      std::string_view help)
+      : hook(hook), desc(desc), help(help) {}
+  };
+
+  /// find the first hook which matches the given prefix and cmdmap
+  std::pair<int, AdminSocketHook*> find_matched_hook(
+    std::string& prefix,
+    const cmdmap_t& cmdmap);
+
+  std::multimap<std::string, hook_info, std::less<>> hooks;
+
+  friend class AdminSocketTest;
+  friend class HelpHook;
+  friend class GetdescsHook;
+};
+
+#endif
+#endif
diff --git a/src/common/admin_socket_client.cc b/src/common/admin_socket_client.cc
new file mode 100644
index 000000000..aeddc100f
--- /dev/null
+++ b/src/common/admin_socket_client.cc
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include "common/admin_socket.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/admin_socket_client.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+using std::ostringstream;
+
+// Per-process test socket path under $TMPDIR (default "/tmp"), built once and cached.
+const char* get_rand_socket_path()
+{
+  static char *g_socket_path = NULL;
+  if (g_socket_path == NULL) {
+    char buf[sizeof(((struct sockaddr_un*)0)->sun_path)];
+    const char *tdir = getenv("TMPDIR");
+    #ifdef _WIN32
+    if (tdir == NULL) {
+      tdir = getenv("TEMP");
+    }
+    #endif /* _WIN32 */
+    if (tdir == NULL) {
+      tdir = "/tmp";
+    }
+    // time_t is not guaranteed to be long; cast it to match "%ld".
+    snprintf(buf, sizeof(buf), "%s/perfcounters_test_socket.%ld.%ld",
+	     tdir, (long int)getpid(), (long int)time(NULL));
+    g_socket_path = (char*)strdup(buf);
+  }
+  return g_socket_path;
+}
+
+// Connects to the UNIX domain socket at `path` with 10s send/recv timeouts.
+// Returns "" and stores the descriptor in *fd, or returns an error message.
+static std::string asok_connect(const std::string &path, int *fd)
+{
+  struct sockaddr_un address;
+  // Reject an over-long path rather than let snprintf() below truncate it.
+  if (path.length() >= sizeof(address.sun_path)) {
+    return "socket path '" + path + "' is too long";
+  }
+
+  int socket_fd = socket_cloexec(PF_UNIX, SOCK_STREAM, 0);
+  if (socket_fd < 0) {
+    int err = ceph_sock_errno();
+    ostringstream oss;
+    oss << "socket(PF_UNIX, SOCK_STREAM, 0) failed: " << cpp_strerror(err);
+    return oss.str();
+  }
+
+  // FIPS zeroization audit 20191115: this memset is fine.
+  memset(&address, 0, sizeof(struct sockaddr_un));
+  address.sun_family = AF_UNIX;
+  snprintf(address.sun_path, sizeof(address.sun_path), "%s", path.c_str());
+
+  if (::connect(socket_fd, (struct sockaddr *) &address,
+	sizeof(struct sockaddr_un)) != 0) {
+    int err = ceph_sock_errno();
+    ostringstream oss;
+    oss << "connect(" << socket_fd << ") failed: " << cpp_strerror(err);
+    compat_closesocket(socket_fd);
+    return oss.str();
+  }
+
+  struct timeval timer;
+  timer.tv_sec = 10;
+  timer.tv_usec = 0;
+  const struct { int opt; const char *name; } timeouts[] = {
+    {SO_RCVTIMEO, "SO_RCVTIMEO"}, {SO_SNDTIMEO, "SO_SNDTIMEO"}};
+  for (const auto &t : timeouts) {
+    if (::setsockopt(socket_fd, SOL_SOCKET, t.opt, (SOCKOPT_VAL_TYPE)&timer, sizeof(timer))) {
+      int err = ceph_sock_errno();
+      ostringstream oss;
+      oss << "setsockopt(" << socket_fd << ", " << t.name << ") failed: "
+	  << cpp_strerror(err);
+      compat_closesocket(socket_fd);
+      return oss.str();
+    }
+  }
+  *fd = socket_fd;
+  return "";
+}
+
+static std::string asok_request(int socket_fd, std::string request)
+{
+  ssize_t res = safe_send(socket_fd, request.c_str(), request.length() + 1);
+  if (res < 0) {
+    int err = res;
+    ostringstream oss;
+    oss << "safe_write(" << socket_fd << ") failed to write request code: "
+	<< cpp_strerror(err);
+    return oss.str();
+  }
+  return "";
+}
+
+// Only records the socket path; connecting is left to do_request().
+AdminSocketClient::AdminSocketClient(const std::string &path)
+  : m_path(path)
+{
+}
+
+std::string AdminSocketClient::ping(bool *ok)
+{
+  std::string reply;  // answer to command "0"; one character when healthy
+  const std::string error = do_request("{\"prefix\":\"0\"}", &reply);
+  *ok = error.empty() && reply.size() == 1;
+  return error;
+}
+
+std::string AdminSocketClient::do_request(std::string request, std::string *result)
+{
+  int socket_fd = 0, res;
+  std::string buffer;
+  uint32_t message_size_raw, message_size;  // reply length: as received / host order
+  // One round trip: connect, send the request, read a 4-byte length, then the reply.
+  std::string err = asok_connect(m_path, &socket_fd);
+  if (!err.empty()) {
+    goto out;  // no descriptor was handed back, so there is nothing to close
+  }
+  err = asok_request(socket_fd, request);
+  if (!err.empty()) {
+    goto done;
+  }
+  res = safe_recv_exact(socket_fd, &message_size_raw,
+				sizeof(message_size_raw));
+  if (res < 0) {
+    int e = res;
+    ostringstream oss;
+    oss << "safe_recv(" << socket_fd << ") failed to read message size: "
+	<< cpp_strerror(e);
+    err = oss.str();
+    goto done;
+  }
+  message_size = ntohl(message_size_raw);  // length arrives in network byte order
+  buffer.resize(message_size, 0);
+  res = safe_recv_exact(socket_fd, &buffer[0], message_size);
+  if (res < 0) {
+    int e = res;
+    ostringstream oss;
+    oss << "safe_recv(" << socket_fd << ") failed: " << cpp_strerror(e);
+    err = oss.str();
+    goto done;
+  }
+  // Success: hand the reply to the caller; on any error *result is left untouched.
+  std::swap(*result, buffer);
+done:
+  compat_closesocket(socket_fd);
+ out:
+  return err;  // empty on success, otherwise a description of the failure
+}
diff --git a/src/common/admin_socket_client.h b/src/common/admin_socket_client.h
new file mode 100644
index 000000000..dcfab2b8b
--- /dev/null
+++ b/src/common/admin_socket_client.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ADMIN_SOCKET_CLIENT_H
+#define CEPH_COMMON_ADMIN_SOCKET_CLIENT_H
+
+#include <string>
+
+/* This is a simple client that talks to an AdminSocket using blocking I/O.
+ * We put a 10-second timeout on send and recv operations.
+ */
+class AdminSocketClient
+{
+public:
+  AdminSocketClient(const std::string &path);  // path of the admin socket to talk to
+  std::string do_request(std::string request, std::string *result);  // "" and *result set, or an error message
+  std::string ping(bool *ok);  // *ok is true when the version reply is a single character
+private:
+  std::string m_path;
+};
+
+const char* get_rand_socket_path();
+
+#endif
diff --git a/src/common/aix_errno.cc b/src/common/aix_errno.cc
new file mode 100644
index 000000000..07f6a1452
--- /dev/null
+++ b/src/common/aix_errno.cc
@@ -0,0 +1,231 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "include/types.h"
+
+
+// Converts a negative Linux errno value to the host's value; cases marked TODO map to -EPERM.
+__s32 ceph_to_hostos_errno(__s32 r)
+{
+  if (r < -34) {  // values from -34 upwards are returned unchanged
+    switch (r) {
+      case -35:
+        return -EDEADLK;
+      case -36:
+        return -ENAMETOOLONG;
+      case -37:
+        return -ENOLCK;
+      case -38:
+        return -ENOSYS;
+      case -39:
+        return -ENOTEMPTY;
+      case -40:
+        return -ELOOP;
+      case -42:
+        return -ENOMSG;
+      case -43:
+        return -EIDRM;
+      case -44:
+        return -ECHRNG;
+      case -45:
+        return -EL2NSYNC;
+      case -46:
+        return -EL3HLT;
+      case -47:
+        return -EL3RST;
+      case -48:
+        return -ELNRNG;
+      case -49:
+        return -EUNATCH;
+      case -51:
+        return -EL2HLT;
+      case -52:
+        return -EPERM; //TODO EBADE
+      case -53:
+        return -EPERM; //TODO EBADR
+      case -54:
+        return -EPERM; //TODO EXFULL
+      case -55:
+        return -EPERM; //TODO ENOANO
+      case -56:
+        return -EPERM; //TODO EBADRQC
+      case -57:
+        return -EPERM; //TODO EBADSLT
+      case -59:
+        return -EPERM; //TODO EBFONT
+      case -60:
+        return -ENOSTR;
+      case -61:
+        return -ENODATA;
+      case -62:
+        return -ETIME;
+      case -63:
+        return -ENOSR;
+      case -64:
+        return -EPERM; //TODO ENONET
+      case -65:
+        return -EPERM; //TODO ENOPKG
+      case -66:
+        return -EREMOTE;
+      case -67:
+        return -ENOLINK;
+      case -68:
+        return -EPERM; //TODO EADV
+      case -69:
+        return -EPERM; //TODO ESRMNT
+      case -70:
+        return -EPERM; //TODO ECOMM
+      case -71:
+        return -EPROTO;
+      case -72:
+        return -EMULTIHOP;
+      case -73:
+        return -EPERM; //TODO EDOTDOT
+      case -74:
+        return -EBADMSG;
+      case -75:
+        return -EOVERFLOW;
+      case -76:
+        return -EPERM; //TODO ENOTUNIQ
+      case -77:
+        return -EPERM; //TODO EBADFD
+      case -78:
+        return -EPERM; //TODO EREMCHG
+      case -79:
+        return -EPERM; //TODO ELIBACC
+      case -80:
+        return -EPERM; //TODO ELIBBAD
+      case -81:
+        return -EPERM; //TODO ELIBSCN
+      case -82:
+        return -EPERM; //TODO ELIBMAX
+      case -83:
+	return -EPERM; // TODO ELIBEXEC
+      case -84:
+        return -EILSEQ;
+      case -85:
+        return -ERESTART;
+      case -86:
+        return -EPERM; //ESTRPIPE;
+      case -87:
+        return -EUSERS;
+      case -88:
+        return -ENOTSOCK;
+      case -89:
+        return -EDESTADDRREQ;
+      case -90:
+        return -EMSGSIZE;
+      case -91:
+        return -EPROTOTYPE;
+      case -92:
+        return -ENOPROTOOPT;
+      case -93:
+        return -EPROTONOSUPPORT;
+      case -94:
+        return -ESOCKTNOSUPPORT;
+      case -95:
+        return -EOPNOTSUPP;
+      case -96:
+        return -EPFNOSUPPORT;
+      case -97:
+        return -EAFNOSUPPORT;
+      case -98:
+        return -EADDRINUSE;
+      case -99:
+        return -EADDRNOTAVAIL;
+      case -100:
+        return -ENETDOWN;
+      case -101:
+        return -ENETUNREACH;
+      case -102:
+        return -ENETRESET;
+      case -103:
+        return -ECONNABORTED;
+      case -104:
+        return -ECONNRESET;
+      case -105:
+        return -ENOBUFS;
+      case -106:
+        return -EISCONN;
+      case -107:
+        return -ENOTCONN;
+      case -108:
+        return -ESHUTDOWN;
+      case -109:
+        return -ETOOMANYREFS;
+      case -110:
+        return -ETIMEDOUT;
+      case -111:
+        return -ECONNREFUSED;
+      case -112:
+        return -EHOSTDOWN;
+      case -113:
+        return -EHOSTUNREACH;
+      case -114:
+        return -EALREADY;
+      case -115:
+        return -EINPROGRESS;
+      case -116:
+        return -ESTALE;
+      case -117:
+        return -EPERM; //TODO EUCLEAN
+      case -118:
+        return -EPERM; //TODO ENOTNAM
+      case -119:
+        return -EPERM; //TODO ENAVAIL
+      case -120:
+        return -EPERM; //TODO EISNAM
+      case -121:
+        return -EPERM; //TODO EREMOTEIO
+      case -122:
+        return -EDQUOT;
+      case -123:
+        return -EPERM; //TODO ENOMEDIUM
+      case -124:
+        return -EPERM; //TODO EMEDIUMTYPE - not used
+      case -125:
+        return -ECANCELED;
+      case -126:
+        return -EPERM; //TODO ENOKEY
+      case -127:
+        return -EPERM; //TODO EKEYEXPIRED
+      case -128:
+        return -EPERM; //TODO EKEYREVOKED
+      case -129:
+        return -EPERM; //TODO EKEYREJECTED
+      case -130:
+        return -EOWNERDEAD;
+      case -131:
+        return -ENOTRECOVERABLE;
+      case -132:
+        return -EPERM; //TODO ERFKILL
+      case -133:
+        return -EPERM; //TODO EHWPOISON
+      // any value not listed above falls through to the final return
+      default: {
+        break;
+      }
+    }
+  }
+  return r; // otherwise return original value
+}
+
+// converts Host OS errno values to linux/Ceph values
+// XXX Not worked out yet: every value is currently returned unchanged.
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  return r;
+}
+
diff --git a/src/common/allocate_unique.h b/src/common/allocate_unique.h
new file mode 100644
index 000000000..4c5cc1c6e
--- /dev/null
+++ b/src/common/allocate_unique.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+
+namespace ceph {
+
+/// Deleter for std::unique_ptr<T, Deleter> that destroys and frees the
+/// pointee through an allocator. Alloc's value_type must be T.
+template <typename Alloc>
+class deallocator {
+  using traits_type = std::allocator_traits<Alloc>;
+  using pointer = typename traits_type::pointer;
+  Alloc alloc;
+ public:
+  explicit deallocator(const Alloc& a) noexcept : alloc(a) {}
+  /// Destroy the object at p, then release its single-object storage.
+  void operator()(pointer p) {
+    traits_type::destroy(alloc, p);
+    traits_type::deallocate(alloc, p, 1);
+  }
+};
+
+/// deallocator alias that rebinds Alloc's value_type to T
+template <typename T, typename Alloc>
+using deallocator_t = deallocator<typename std::allocator_traits<Alloc>
+      ::template rebind_alloc<T>>;
+
+/// std::unique_ptr alias that rebinds Alloc if necessary, and avoids repetition
+/// of the template parameter T.
+template <typename T, typename Alloc>
+using allocated_unique_ptr = std::unique_ptr<T, deallocator_t<T, Alloc>>;
+
+
+/// Returns a std::unique_ptr whose memory is managed by the given allocator.
+template <typename T, typename Alloc, typename... Args>
+static auto allocate_unique(Alloc& alloc, Args&&... args)
+  -> allocated_unique_ptr<T, Alloc>
+{
+  static_assert(!std::is_array_v<T>, "allocate_unique() does not support T[]");
+  // Work with a copy of the allocator rebound to T, whatever Alloc's value_type is.
+  using allocator_type = typename std::allocator_traits<Alloc>
+      ::template rebind_alloc<T>;
+  using allocator_traits = std::allocator_traits<allocator_type>;
+  auto a = allocator_type{alloc};
+  auto p = allocator_traits::allocate(a, 1);  // storage for exactly one T
+  try {
+    allocator_traits::construct(a, p, std::forward<Args>(args)...);
+    return {p, deallocator<allocator_type>{a}};  // deleter carries the rebound allocator
+  } catch (...) {
+    allocator_traits::deallocate(a, p, 1);  // give the storage back, then propagate
+    throw;
+  }
+}
+
+} // namespace ceph
diff --git a/src/common/arch.h b/src/common/arch.h
new file mode 100644
index 000000000..09dbe7c90
--- /dev/null
+++ b/src/common/arch.h
@@ -0,0 +1,15 @@
+#ifndef CEPH_ARCH_H
+#define CEPH_ARCH_H
+
+// Name of the CPU architecture this translation unit was compiled for.
+static const char *get_arch() {
+#if defined(__x86_64__)
+  return "x86-64";
+#elif defined(__i386__)
+  return "i386";
+#else
+  return "unknown";
+#endif
+}
+
+#endif
diff --git a/src/common/armor.c b/src/common/armor.c
new file mode 100644
index 000000000..3508b82e3
--- /dev/null
+++ b/src/common/armor.c
@@ -0,0 +1,131 @@
+
+#if defined(__linux__)
+#include <linux/errno.h>
+#else
+#include <sys/errno.h>
+#endif
+
+/*
+ * base64 encode/decode.
+ */
+
+const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+	return *(pem_key + c);	/* look c up in the base64 alphabet */
+}
+
+static int decode_bits(char c)
+{
+	switch (c) {
+	case '+': case '-':	/* standard and URL-safe spellings of 62 */
+		return 62;
+	case '/': case '_':	/* standard and URL-safe spellings of 63 */
+		return 63;
+	case '=':
+		return 0; /* just non-negative, please */
+	}
+	if ('A' <= c && c <= 'Z')
+		return c - 'A';
+	if ('a' <= c && c <= 'z')
+		return 26 + (c - 'a');
+	return ('0' <= c && c <= '9') ? 52 + (c - '0') : -EINVAL;
+}
+
+static int set_str_val(char **pdst, const char *end, char c)
+{
+	/* Store c at *pdst and advance the cursor; fail with -ERANGE once
+	 * the cursor has reached end. */
+	if (*pdst >= end)
+		return -ERANGE;
+
+	**pdst = c;
+	(*pdst)++;
+	return 0;
+}
+
+int ceph_armor_line_break(char *dst, char * const dst_end, const char *src, const char *end, int line_width)
+{	/* base64-encode [src, end) into [dst, dst_end) */
+	int olen = 0;	/* chars written so far; this is the return value */
+	int line = 0;	/* encoded chars on the current output line */
+/* SET_DST appends one char to dst, or returns -ERANGE from the calling function when dst is full. */
+#define SET_DST(c) do { \
+	int __ret = set_str_val(&dst, dst_end, c); \
+	if (__ret < 0) \
+		return __ret; \
+} while (0);
+	/* Each pass turns up to 3 input bytes into 4 chars, '='-padding a short final group. */
+	while (src < end) {
+		unsigned char a;
+
+		a = *src++;
+		SET_DST(encode_bits(a >> 2));
+		if (src < end) {
+			unsigned char b;
+			b = *src++;
+			SET_DST(encode_bits(((a & 3) << 4) | (b >> 4)));
+			if (src < end) {
+				unsigned char c;
+				c = *src++;
+				SET_DST(encode_bits(((b & 15) << 2) |
+								(c >> 6)));
+				SET_DST(encode_bits(c & 63));
+			} else {
+				SET_DST(encode_bits((b & 15) << 2));
+				SET_DST('=');
+			}
+		} else {
+			SET_DST(encode_bits(((a & 3) << 4)));
+			SET_DST('=');
+			SET_DST('=');
+		}
+		olen += 4;
+		line += 4;
+		if (line_width && line == line_width) {	/* only an exact match breaks the line */
+			line = 0;
+			SET_DST('\n');
+			olen++;
+		}
+	}
+	return olen;
+}
+
+int ceph_armor(char *dst, char * const dst_end, const char *src, const char *end)
+{
+	return ceph_armor_line_break(dst, dst_end, src, end, 0);	/* line_width 0: no line breaks */
+}
+
+int ceph_unarmor(char *dst, char * const dst_end, const char *src, const char *end)
+{
+	int olen = 0;	/* decoded bytes written so far */
+	/* Decodes base64 from [src, end) into [dst, dst_end): byte count, -EINVAL or -ERANGE. */
+	while (src < end) {
+		int a, b, c, d;
+
+		if (src[0] == '\n') {	/* line breaks between groups are skipped */
+			src++;
+			continue;
+		}
+
+		if (src + 4 > end)	/* input must come in complete 4-char groups */
+			return -EINVAL;
+		a = decode_bits(src[0]);
+		b = decode_bits(src[1]);
+		c = decode_bits(src[2]);
+		d = decode_bits(src[3]);
+		if (a < 0 || b < 0 || c < 0 || d < 0)
+			return -EINVAL;
+		/* SET_DST (defined in ceph_armor_line_break) returns -ERANGE from here when dst is full. */
+		SET_DST((a << 2) | (b >> 4));
+		if (src[2] == '=')	/* "xx==": the group held a single byte; stop */
+			return olen + 1;
+		SET_DST(((b & 15) << 4) | (c >> 2));
+		if (src[3] == '=')	/* "xxx=": the group held two bytes; stop */
+			return olen + 2;
+		SET_DST(((c & 3) << 6) | d);
+		olen += 3;
+		src += 4;
+	}
+	return olen;
+}
diff --git a/src/common/armor.h b/src/common/armor.h
new file mode 100644
index 000000000..f0ee9f9b2
--- /dev/null
+++ b/src/common/armor.h
@@ -0,0 +1,20 @@
+#ifndef CEPH_ARMOR_H
+#define CEPH_ARMOR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int ceph_armor(char *dst, char * const dst_end,
+	       const char * src, const char *end);
+/* line_width: encoded chars per line (a multiple of 4); 0 means no breaks */
+int ceph_armor_line_break(char *dst, char * const dst_end,
+	       const char *src, const char *end,
+	       int line_width);
+int ceph_unarmor(char *dst, char * const dst_end,
+		 const char *src, const char * const end);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/assert.cc b/src/common/assert.cc
new file mode 100644
index 000000000..7fb4c2d72
--- /dev/null
+++ b/src/common/assert.cc
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/debug.h"
+
+using std::ostringstream;
+
+namespace ceph {
+  static CephContext *g_assert_context = NULL;
+
+  /* If you register an assert context, ceph_assert() will try to lock the dout
+   * stream of that context before starting an assert. This is nice because the
+   * output looks better. Your assert will not be interleaved with other dout
+   * statements.
+   *
+   * However, this is strictly optional and library code currently does not
+   * register an assert context. The extra complexity of supporting this
+   * wouldn't really be worth it.
+   */
+  void register_assert_context(CephContext *cct)
+  {
+    ceph_assert(!g_assert_context);  // a context may be registered only once
+    g_assert_context = cct;  // consulted by the failure handlers below
+  }
+
+  // Records the failed assertion in the g_assert_* globals, reports it, then aborts.
+  [[gnu::cold]] void __ceph_assert_fail(const char *assertion,
+					const char *file, int line,
+					const char *func)
+  {
+    g_assert_condition = assertion;
+    g_assert_file = file;
+    g_assert_line = line;
+    g_assert_func = func;
+    g_assert_thread = (unsigned long long)pthread_self();
+    ceph_pthread_getname(pthread_self(), g_assert_thread_name,
+		       sizeof(g_assert_thread_name));
+    // Format the message into g_assert_msg, stamped with the current time.
+    ostringstream tss;
+    tss << ceph_clock_now();
+    snprintf(g_assert_msg, sizeof(g_assert_msg),
+	     "%s: In function '%s' thread %llx time %s\n"
+	     "%s: %d: FAILED ceph_assert(%s)\n",
+	     file, func, (unsigned long long)pthread_self(), tss.str().c_str(),
+	     file, line, assertion);
+    dout_emergency(g_assert_msg);
+
+    // TODO: get rid of this memory allocation.
+    ostringstream oss;
+    oss << ClibBackTrace(1);
+    dout_emergency(oss.str());
+
+    if (g_assert_context) {
+      lderr(g_assert_context) << g_assert_msg << std::endl;
+      *_dout << oss.str() << dendl;
+
+      // dump recent only if the abort signal handler won't do it for us
+      if (!g_assert_context->_conf->fatal_signal_handlers) {
+	g_assert_context->_log->dump_recent();
+      }
+    }
+
+    abort();
+  }
+
+  [[gnu::cold]] void __ceph_assert_fail(const assert_data &ctx)
+  {  // unpack the bundled failure details and defer to the overload above
+    __ceph_assert_fail(ctx.assertion, ctx.file, ctx.line, ctx.function);
+  }
+
+  // Appends printf-style text to a fixed buffer; whatever does not fit is dropped.
+  class BufAppender {
+  public:
+    BufAppender(char* buf, int size) : bufptr(buf), remaining(size) {}
+
+    void printf(const char * format, ...) {
+      va_list ap;
+      va_start(ap, format);
+      this->vprintf(format, ap);
+      va_end(ap);
+    }
+
+    void vprintf(const char * format, va_list args) {
+      const int written = vsnprintf(bufptr, remaining, format, args);
+      if (written < 0)
+	return;			// formatting failed: keep the current position
+      if (written >= remaining) {
+	remaining = 0;		// truncated: later appends write nothing
+	return;
+      }
+      bufptr += written;
+      remaining -= written;
+    }
+  private:
+    char* bufptr;	// where the next append starts
+    int remaining;	// bytes still available at bufptr
+  };
+
+
+  // Like __ceph_assert_fail, with a printf-style detail message appended to the report.
+  [[gnu::cold]] void __ceph_assertf_fail(const char *assertion,
+					 const char *file, int line,
+					 const char *func, const char* msg,
+					 ...)
+  {
+    ostringstream tss;
+    tss << ceph_clock_now();
+    g_assert_condition = assertion;
+    g_assert_file = file;
+    g_assert_line = line;
+    g_assert_func = func;
+    g_assert_thread = (unsigned long long)pthread_self();
+    ceph_pthread_getname(pthread_self(), g_assert_thread_name,
+		       sizeof(g_assert_thread_name));
+
+    BufAppender ba(g_assert_msg, sizeof(g_assert_msg));
+    ClibBackTrace bt(1);  // held on the stack: no raw new, nothing leaked
+    ba.printf("%s: In function '%s' thread %llx time %s\n"
+	     "%s: %d: FAILED ceph_assert(%s)\n",
+	     file, func, (unsigned long long)pthread_self(), tss.str().c_str(),
+	     file, line, assertion);
+    ba.printf("Assertion details: ");
+    va_list args;
+    va_start(args, msg);
+    ba.vprintf(msg, args);
+    va_end(args);
+    ba.printf("\n");
+    dout_emergency(g_assert_msg);
+
+    // TODO: get rid of this memory allocation.
+    ostringstream oss;
+    oss << bt;
+    dout_emergency(oss.str());
+
+    if (g_assert_context) {
+      lderr(g_assert_context) << g_assert_msg << std::endl;
+      *_dout << oss.str() << dendl;
+
+      // dump recent only if the abort signal handler won't do it for us
+      if (!g_assert_context->_conf->fatal_signal_handlers) {
+	g_assert_context->_log->dump_recent();
+      }
+    }
+
+    abort();
+  }
+
+  // Reports an explicit abort with message `msg` (condition recorded as "abort"), then aborts.
+  [[gnu::cold]] void __ceph_abort(const char *file, int line,
+				  const char *func, const std::string& msg)
+  {
+    ostringstream tss;
+    tss << ceph_clock_now();
+    g_assert_condition = "abort";
+    g_assert_file = file;
+    g_assert_line = line;
+    g_assert_func = func;
+    g_assert_thread = (unsigned long long)pthread_self();
+    ceph_pthread_getname(pthread_self(), g_assert_thread_name,
+		       sizeof(g_assert_thread_name));
+
+    ClibBackTrace bt(1);  // held on the stack: no raw new, nothing leaked
+    snprintf(g_assert_msg, sizeof(g_assert_msg),
+             "%s: In function '%s' thread %llx time %s\n"
+	     "%s: %d: ceph_abort_msg(\"%s\")\n", file, func,
+	     (unsigned long long)pthread_self(),
+	     tss.str().c_str(), file, line,
+	     msg.c_str());
+    dout_emergency(g_assert_msg);
+
+    // TODO: get rid of this memory allocation.
+    ostringstream oss;
+    oss << bt;
+    dout_emergency(oss.str());
+
+    if (g_assert_context) {
+      lderr(g_assert_context) << g_assert_msg << std::endl;
+      *_dout << oss.str() << dendl;
+
+      // dump recent only if the abort signal handler won't do it for us
+      if (!g_assert_context->_conf->fatal_signal_handlers) {
+	g_assert_context->_log->dump_recent();
+      }
+    }
+
+    abort();
+  }
+
+  // Like __ceph_abort, with the detail message built from a printf-style format.
+  [[gnu::cold]] void __ceph_abortf(const char *file, int line,
+				   const char *func, const char* msg,
+				   ...)
+  {
+    ostringstream tss;
+    tss << ceph_clock_now();
+    g_assert_condition = "abort";
+    g_assert_file = file;
+    g_assert_line = line;
+    g_assert_func = func;
+    g_assert_thread = (unsigned long long)pthread_self();
+    ceph_pthread_getname(pthread_self(), g_assert_thread_name,
+		       sizeof(g_assert_thread_name));
+
+    BufAppender ba(g_assert_msg, sizeof(g_assert_msg));
+    ClibBackTrace bt(1);  // held on the stack: no raw new, nothing leaked
+    ba.printf("%s: In function '%s' thread %llx time %s\n"
+	      "%s: %d: abort()\n",
+	      file, func, (unsigned long long)pthread_self(), tss.str().c_str(),
+	      file, line);
+    ba.printf("Abort details: ");
+    va_list args;
+    va_start(args, msg);
+    ba.vprintf(msg, args);
+    va_end(args);
+    ba.printf("\n");
+    dout_emergency(g_assert_msg);
+
+    // TODO: get rid of this memory allocation.
+    ostringstream oss;
+    oss << bt;
+    dout_emergency(oss.str());
+
+    if (g_assert_context) {
+      lderr(g_assert_context) << g_assert_msg << std::endl;
+      *_dout << oss.str() << dendl;
+
+      // dump recent only if the abort signal handler won't do it for us
+      if (!g_assert_context->_conf->fatal_signal_handlers) {
+	g_assert_context->_log->dump_recent();
+      }
+    }
+
+    abort();
+  }
+
+  [[gnu::cold]] void __ceph_assert_warn(const char *assertion,
+					const char *file,
+					int line, const char *func)
+  {
+    // Non-fatal: emit a warning about the failed condition and return.
+    char warning[8096];
+    snprintf(warning, sizeof(warning), "WARNING: ceph_assert(%s) at: %s: %d: %s()\n",
+	     assertion, file, line, func);
+    dout_emergency(warning);
+  }
+}
diff --git a/src/common/async/bind_handler.h b/src/common/async/bind_handler.h
new file mode 100644
index 000000000..516d8a5e8
--- /dev/null
+++ b/src/common/async/bind_handler.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNC_BIND_HANDLER_H
+#define CEPH_ASYNC_BIND_HANDLER_H
+
+#include <tuple>
+#include <boost/asio.hpp>
+
+namespace ceph::async {
+
+/**
+ * A bound completion handler for use with boost::asio.
+ *
+ * A completion handler wrapper that allows a tuple of arguments to be forwarded
+ * to the original Handler. This is intended for use with boost::asio functions
+ * like defer(), dispatch() and post() which expect handlers which are callable
+ * with no arguments.
+ *
+ * The original Handler's associated allocator and executor are maintained.
+ *
+ * @see bind_handler
+ */
+template <typename Handler, typename Tuple>
+struct CompletionHandler {
+  Handler handler; ///< the wrapped completion handler
+  Tuple args;      ///< tuple of arguments applied to it on invocation
+
+  /// takes ownership of both the handler and the argument tuple
+  CompletionHandler(Handler&& h, Tuple&& bound_args)
+    : handler(std::move(h)), args(std::move(bound_args)) {}
+
+  // invoked as an lvalue: handler and args stay in place and may be reused
+  void operator()() & { std::apply(handler, args); }
+  void operator()() const & { std::apply(handler, args); }
+
+  // invoked as an rvalue: both are moved into the call, so move-only
+  // argument types work on this path
+  void operator()() && {
+    std::apply(std::move(handler), std::move(args));
+  }
+
+  // forward the wrapped handler's associated allocator
+  using allocator_type = boost::asio::associated_allocator_t<Handler>;
+  allocator_type get_allocator() const noexcept {
+    return boost::asio::get_associated_allocator(handler);
+  }
+};
+
+} // namespace ceph::async
+
+namespace boost::asio {
+
+// specialize boost::asio::associated_executor<> for CompletionHandler
+template <typename Handler, typename Tuple, typename Executor>
+struct associated_executor<ceph::async::CompletionHandler<Handler, Tuple>, Executor> {
+  // defer to the wrapped Handler's association, with Executor as the fallback
+  using type = boost::asio::associated_executor_t<Handler, Executor>;
+  static type get(const ceph::async::CompletionHandler<Handler, Tuple>& bound,
+                  const Executor& fallback = Executor()) noexcept {
+    return boost::asio::get_associated_executor(bound.handler, fallback);
+  }
+};
+
+} // namespace boost::asio
+
+namespace ceph::async {
+
+/**
+ * Returns a wrapped completion handler with bound arguments.
+ *
+ * Binds the given arguments to a handler, and returns a CompletionHandler that
+ * is callable with no arguments. This is similar to std::bind(), except that
+ * all arguments must be provided. Move-only argument types are supported as
+ * long as the CompletionHandler's 'operator() &&' overload is used, i.e.
+ * std::move(handler)().
+ *
+ * Example use:
+ *
+ *   // bind the arguments (5, "hello") to a callback lambda:
+ *   auto callback = [] (int a, std::string b) {};
+ *   auto handler = bind_handler(callback, 5, "hello");
+ *
+ *   // execute the bound handler on an io_context:
+ *   boost::asio::io_context context;
+ *   boost::asio::post(context, std::move(handler));
+ *   context.run();
+ *
+ * @see CompletionHandler
+ */
+template <typename Handler, typename ...Args>
+auto bind_handler(Handler&& h, Args&& ...args)
+{ // decay-copy h: CompletionHandler's constructor binds rvalues only
+  auto bound = std::forward<Handler>(h);
+  return CompletionHandler{std::move(bound), std::make_tuple(std::forward<Args>(args)...)};
+}
+
+} // namespace ceph::async
+
+#endif // CEPH_ASYNC_BIND_HANDLER_H
diff --git a/src/common/async/bind_like.h b/src/common/async/bind_like.h
new file mode 100644
index 000000000..c360eac0a
--- /dev/null
+++ b/src/common/async/bind_like.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <boost/asio/associated_allocator.hpp>
+#include <boost/asio/associated_executor.hpp>
+#include <boost/asio/bind_allocator.hpp>
+#include <boost/asio/bind_executor.hpp>
+
+namespace ceph::async {
+template<typename Executor, typename Allocator, typename Completion>
+auto bind_ea(const Executor& executor, const Allocator& allocator,
+	     Completion&& completion) {
+  // Wrap `completion` with `executor`, then wrap that with `allocator`. Both
+  // calls are qualified so ADL on Allocator cannot pick another overload.
+  return boost::asio::bind_allocator(allocator, boost::asio::bind_executor(
+      executor, std::forward<Completion>(completion)));
+}
+
+
+// Bind `completion` to the executor and allocator associated with `proto`
+template<typename Proto, typename Completion>
+auto bind_like(const Proto& proto, Completion&& completion) {
+  const auto ex = boost::asio::get_associated_executor(proto);
+  const auto alloc = boost::asio::get_associated_allocator(proto);
+  return bind_ea(ex, alloc, std::forward<Completion>(completion));
+}
+}
diff --git a/src/common/async/blocked_completion.h b/src/common/async/blocked_completion.h
new file mode 100644
index 000000000..23a1319bc
--- /dev/null
+++ b/src/common/async/blocked_completion.h
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ASYNC_BLOCKED_COMPLETION_H
+#define CEPH_COMMON_ASYNC_BLOCKED_COMPLETION_H
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <optional>
+#include <type_traits>
+
+#include <boost/asio/async_result.hpp>
+
+#include <boost/system/error_code.hpp>
+#include <boost/system/system_error.hpp>
+
+namespace ceph::async {
+
+namespace bs = boost::system;
+
+class use_blocked_t {
+  // private: within this class only operator[] builds a token with an ec
+  use_blocked_t(bs::error_code* ec) : ec(ec) {}
+public:
+  use_blocked_t() = default;
+  /// returns a token whose ec member points at _ec
+  use_blocked_t operator [](bs::error_code& _ec) const {
+    return use_blocked_t{&_ec};
+  }
+  bs::error_code* ec = nullptr; ///< null unless set through operator[]
+};
+
+inline constexpr use_blocked_t use_blocked;
+
+namespace detail {
+
+template<typename... Ts>
+struct blocked_handler
+{
+  blocked_handler(use_blocked_t b) noexcept : ec(b.ec) {}
+  /// signature without an error_code: store the values and a default code
+  void operator ()(Ts... values) noexcept {
+    std::scoped_lock guard(*m);
+    *value = std::forward_as_tuple(std::move(values)...);
+    *ec = bs::error_code{};
+    *done = true;
+    cv->notify_one();
+  }
+  /// error_code-first signature: store the values and the given code
+  void operator ()(bs::error_code result, Ts... values) noexcept {
+    std::scoped_lock guard(*m);
+    *value = std::forward_as_tuple(std::move(values)...);
+    *ec = result;
+    *done = true;
+    cv->notify_one();
+  }
+  // destinations written by operator(); ec starts out as the token's pointer
+  bs::error_code* ec;
+  std::optional<std::tuple<Ts...>>* value = nullptr;
+  std::mutex* m = nullptr;
+  std::condition_variable* cv = nullptr;
+  bool* done = nullptr;
+};
+
+template<typename T>
+struct blocked_handler<T>
+{
+  blocked_handler(use_blocked_t b) noexcept : ec(b.ec) {}
+  /// signature without an error_code: store the value and a default code
+  void operator ()(T value) noexcept {
+    std::scoped_lock l(*m);
+    *ec = bs::error_code();
+    *this->value = std::move(value);
+    *done = true;
+    cv->notify_one();
+  }
+  /// error_code-first signature: store the value and the given code
+  void operator ()(bs::error_code ec, T value) noexcept {
+    std::scoped_lock l(*m);
+    *this->ec = ec;
+    *this->value = std::move(value);
+    *done = true;
+    cv->notify_one();
+  }
+
+  // destinations written by operator(); ec starts out as the token's pointer
+  bs::error_code* ec;
+  std::optional<T>* value = nullptr; // null-initialized like m, cv and done
+  std::mutex* m = nullptr;
+  std::condition_variable* cv = nullptr;
+  bool* done = nullptr;
+};
+
+template<>
+struct blocked_handler<void>
+{
+  blocked_handler(use_blocked_t b) noexcept : ec(b.ec) {}
+  /// completion without an error_code: report a default-constructed one
+  void operator ()() noexcept {
+    std::scoped_lock guard(*m);
+    *ec = bs::error_code{};
+    *done = true;
+    cv->notify_one();
+  }
+  /// completion carrying an error_code: store it for the waiter
+  void operator ()(bs::error_code result) noexcept {
+    std::scoped_lock guard(*m);
+    *ec = result;
+    *done = true;
+    cv->notify_one();
+  }
+
+  bs::error_code* ec;
+  std::mutex* m = nullptr;
+  std::condition_variable* cv = nullptr;
+  bool* done = nullptr;
+};
+
+template<typename... Ts>
+class blocked_result
+{
+public:
+  using completion_handler_type = blocked_handler<Ts...>;
+  using return_type = std::tuple<Ts...>;
+  /// wire the handler to this object's storage, keeping a caller-supplied ec
+  explicit blocked_result(completion_handler_type& h) noexcept {
+    std::scoped_lock guard(m);
+    out_ec = h.ec;
+    if (out_ec == nullptr) h.ec = &ec;
+    h.done = &done;
+    h.cv = &cv;
+    h.m = &m;
+    h.value = &value;
+  }
+  /// wait until done; on error, throws unless the caller supplied an ec
+  return_type get() {
+    std::unique_lock waiter(m);
+    while (!done) cv.wait(waiter);
+    if (ec && !out_ec) throw bs::system_error(ec);
+    return std::move(*value);
+  }
+  // neither copyable nor movable: the handler holds pointers to our members
+  blocked_result(const blocked_result&) = delete;
+  blocked_result& operator =(const blocked_result&) = delete;
+  blocked_result(blocked_result&&) = delete;
+  blocked_result& operator =(blocked_result&&) = delete;
+
+private:
+  bs::error_code* out_ec; // the token's ec; null when none was supplied
+  bs::error_code ec;      // used by the handler when out_ec is null
+  std::optional<return_type> value;
+  std::mutex m;
+  std::condition_variable cv;
+  bool done = false;
+};
+
+template<typename T>
+class blocked_result<T>
+{
+public:
+  using completion_handler_type = blocked_handler<T>;
+  using return_type = T;
+  /// wire the handler to this object's storage, keeping a caller-supplied ec
+  explicit blocked_result(completion_handler_type& h) noexcept {
+    std::scoped_lock guard(m);
+    out_ec = h.ec;
+    if (out_ec == nullptr) h.ec = &ec;
+    h.done = &done;
+    h.cv = &cv;
+    h.m = &m;
+    h.value = &value;
+  }
+  /// wait until done; on error, throws unless the caller supplied an ec
+  return_type get() {
+    std::unique_lock waiter(m);
+    while (!done) cv.wait(waiter);
+    if (ec && !out_ec) throw bs::system_error(ec);
+    return std::move(*value);
+  }
+  // neither copyable nor movable: the handler holds pointers to our members
+  blocked_result(const blocked_result&) = delete;
+  blocked_result& operator =(const blocked_result&) = delete;
+  blocked_result(blocked_result&&) = delete;
+  blocked_result& operator =(blocked_result&&) = delete;
+
+private:
+  bs::error_code* out_ec; // the token's ec; null when none was supplied
+  bs::error_code ec;      // used by the handler when out_ec is null
+  std::optional<return_type> value;
+  std::mutex m;
+  std::condition_variable cv;
+  bool done = false;
+};
+
+template<>
+class blocked_result<void>
+{
+public:
+  using completion_handler_type = blocked_handler<void>;
+  using return_type = void;
+  /// wire the handler to this object's storage, keeping a caller-supplied ec
+  explicit blocked_result(completion_handler_type& h) noexcept {
+    std::scoped_lock guard(m);
+    out_ec = h.ec;
+    if (out_ec == nullptr) h.ec = &ec;
+    h.done = &done;
+    h.cv = &cv;
+    h.m = &m;
+  }
+  /// wait until done; on error, throws unless the caller supplied an ec
+  void get() {
+    std::unique_lock waiter(m);
+    while (!done) cv.wait(waiter);
+    if (ec && !out_ec) throw bs::system_error(ec);
+  }
+  // neither copyable nor movable: the handler holds pointers to our members
+  blocked_result(const blocked_result&) = delete;
+  blocked_result& operator =(const blocked_result&) = delete;
+  blocked_result(blocked_result&&) = delete;
+  blocked_result& operator =(blocked_result&&) = delete;
+
+private:
+  bs::error_code* out_ec; // the token's ec; null when none was supplied
+  bs::error_code ec;      // used by the handler when out_ec is null
+  std::mutex m;
+  std::condition_variable cv;
+  bool done = false;
+};
+} // namespace detail
+} // namespace ceph::async
+
+
+namespace boost::asio {
+template<typename ReturnType>
+class async_result<ceph::async::use_blocked_t, ReturnType()>
+  : public ceph::async::detail::blocked_result<void>
+{
+  using base_type = ceph::async::detail::blocked_result<void>;
+public:
+  // no completion arguments: the void result is used as-is
+  explicit async_result(typename base_type::completion_handler_type& h) : base_type(h) {}
+};
+
+template<typename ReturnType, typename... Args>
+class async_result<ceph::async::use_blocked_t, ReturnType(Args...)>
+  : public ceph::async::detail::blocked_result<std::decay_t<Args>...>
+{
+  using base_type = ceph::async::detail::blocked_result<std::decay_t<Args>...>;
+public:
+  // completion arguments are held by decayed value in the base class
+  explicit async_result(typename base_type::completion_handler_type& h) : base_type(h) {}
+};
+
+template<typename ReturnType>
+class async_result<ceph::async::use_blocked_t,
+		   ReturnType(boost::system::error_code)>
+  : public ceph::async::detail::blocked_result<void>
+{
+  using base_type = ceph::async::detail::blocked_result<void>;
+public:
+  // error_code-only signature: nothing besides the code is returned
+  explicit async_result(typename base_type::completion_handler_type& h) : base_type(h) {}
+};
+
+template<typename ReturnType, typename... Args>
+class async_result<ceph::async::use_blocked_t,
+		   ReturnType(boost::system::error_code, Args...)>
+  : public ceph::async::detail::blocked_result<std::decay_t<Args>...>
+{
+  using base_type = ceph::async::detail::blocked_result<std::decay_t<Args>...>;
+public:
+  // the leading error_code is not part of the stored result types
+  explicit async_result(typename base_type::completion_handler_type& h) : base_type(h) {}
+};
+}
+
+#endif // !CEPH_COMMON_ASYNC_BLOCKED_COMPLETION_H
diff --git a/src/common/async/completion.h b/src/common/async/completion.h
new file mode 100644
index 000000000..6af9109d5
--- /dev/null
+++ b/src/common/async/completion.h
@@ -0,0 +1,320 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNC_COMPLETION_H
+#define CEPH_ASYNC_COMPLETION_H
+
+#include <memory>
+
+#include "bind_handler.h"
+#include "forward_handler.h"
+
+namespace ceph::async {
+
+/**
+ * Abstract completion handler interface for use with boost::asio.
+ *
+ * Memory management is performed using the Handler's 'associated allocator',
+ * which carries the additional requirement that its memory be released before
+ * the Handler is invoked. This allows memory allocated for one asynchronous
+ * operation to be reused in its continuation. Because of this requirement, any
+ * calls to invoke the completion must first release ownership of it. To enforce
+ * this, the static functions defer()/dispatch()/post() take the completion by
+ * rvalue-reference to std::unique_ptr<Completion>, i.e. std::move(completion).
+ *
+ * Handlers may also have an 'associated executor', so the calls to defer(),
+ * dispatch(), and post() are forwarded to that executor. If there is no
+ * associated executor (which is generally the case unless one was bound with
+ * boost::asio::bind_executor()), the executor passed to Completion::create()
+ * is used as a default.
+ *
+ * Example use:
+ *
+ *   // declare a Completion type with Signature = void(int, string)
+ *   using MyCompletion = ceph::async::Completion<void(int, string)>;
+ *
+ *   // create a completion with the given callback:
+ *   std::unique_ptr<MyCompletion> c;
+ *   c = MyCompletion::create(ex, [] (int a, const string& b) {});
+ *
+ *   // bind arguments to the callback and post to its associated executor:
+ *   MyCompletion::post(std::move(c), 5, "hello");
+ *
+ *
+ * Additional user data may be stored along with the Completion to take
+ * advantage of the handler allocator optimization. This is accomplished by
+ * specifying its type in the template parameter T. For example, the type
+ * Completion<void(), int> contains a public member variable 'int user_data'.
+ * Any additional arguments to Completion::create() will be forwarded to type
+ * T's constructor.
+ *
+ * If the AsBase<T> type tag is used, as in Completion<void(), AsBase<T>>,
+ * the Completion will inherit from T instead of declaring it as a member
+ * variable.
+ *
+ * When invoking the completion handler via defer(), dispatch(), or post(),
+ * care must be taken when passing arguments that refer to user data, because
+ * its memory is destroyed prior to invocation. In such cases, the user data
+ * should be moved/copied out of the Completion first.
+ */
+template <typename Signature, typename T = void>
+class Completion;
+
+
+/// type tag for UserData: with AsBase<T>, UserData derives from T (no member)
+template <typename T> struct AsBase {};
+
+namespace detail {
+
+/// optional user data to be stored with the Completion
+template <typename T>
+struct UserData {
+  T user_data; ///< built in place from whatever the constructor receives
+  // variadic forwarding constructor: every argument goes to T's constructor,
+  // so any of T's constructors can be selected by the caller
+  template <typename ...Args>
+  UserData(Args&& ...args) : user_data(std::forward<Args>(args)...) {}
+};
+// AsBase specialization inherits from T
+template <typename T>
+struct UserData<AsBase<T>> : public T {
+  // T is a public base here instead of a user_data member; every
+  // constructor argument is forwarded to T's constructor
+  template <typename ...Args>
+  UserData(Args&& ...args) : T(std::forward<Args>(args)...) {}
+};
+// void specialization
+template <>
+struct UserData<void> {}; // nothing is stored when T is void
+
+} // namespace detail
+
+
+// template specialization to pull the Signature's args apart
+template <typename T, typename ...Args>
+class Completion<void(Args...), T> : public detail::UserData<T> {
+ protected:
+  // internal interfaces for type-erasure on the Handler/Executor. uses
+  // tuple<Args...> to provide perfect forwarding because you can't make
+  // virtual function templates. implemented by detail::CompletionImpl
+  virtual void destroy_defer(std::tuple<Args...>&& args) = 0;
+  virtual void destroy_dispatch(std::tuple<Args...>&& args) = 0;
+  virtual void destroy_post(std::tuple<Args...>&& args) = 0;
+  virtual void destroy() = 0; // called from operator delete below
+
+  // constructor is protected, use create(). any constructor arguments are
+  // forwarded to UserData
+  template <typename ...TArgs>
+  Completion(TArgs&& ...args)
+    : detail::UserData<T>(std::forward<TArgs>(args)...)
+  {}
+ public:
+  virtual ~Completion() = default;
+
+  // use the virtual destroy() interface on delete. this allows the derived
+  // class to manage its memory using Handler allocators, without having to use
+  // a custom Deleter for std::unique_ptr<>
+  static void operator delete(void *p) {
+    static_cast<Completion*>(p)->destroy();
+  }
+
+  /// completion factory function that uses the handler's associated allocator.
+  /// any additional arguments are forwarded to T's constructor
+  template <typename Executor1, typename Handler, typename ...TArgs>
+  static std::unique_ptr<Completion>
+  create(const Executor1& ex1, Handler&& handler, TArgs&& ...args);
+
+  /// take ownership of the completion, bind any arguments to the completion
+  /// handler, then defer() it on its associated executor
+  template <typename ...Args2>
+  static void defer(std::unique_ptr<Completion>&& c, Args2&&...args);
+
+  /// take ownership of the completion, bind any arguments to the completion
+  /// handler, then dispatch() it on its associated executor
+  template <typename ...Args2>
+  static void dispatch(std::unique_ptr<Completion>&& c, Args2&&...args);
+
+  /// take ownership of the completion, bind any arguments to the completion
+  /// handler, then post() it to its associated executor
+  template <typename ...Args2>
+  static void post(std::unique_ptr<Completion>&& c, Args2&&...args);
+};
+
+namespace detail {
+
+// concrete Completion that knows how to invoke the completion handler. this
+// observes all of the 'Requirements on asynchronous operations' specified by
+// the C++ Networking TS
+template <typename Executor1, typename Handler, typename T, typename ...Args>
+class CompletionImpl final : public Completion<void(Args...), T> {
+  // use Handler's associated executor (or Executor1 by default) for callbacks
+  using Executor2 = boost::asio::associated_executor_t<Handler, Executor1>;
+  // maintain work on both executors
+  using Work1 = boost::asio::executor_work_guard<Executor1>;
+  using Work2 = boost::asio::executor_work_guard<Executor2>;
+  std::pair<Work1, Work2> work;
+  Handler handler;
+
+  // use Handler's associated allocator
+  using Alloc2 = boost::asio::associated_allocator_t<Handler>;
+  using Traits2 = std::allocator_traits<Alloc2>;
+  using RebindAlloc2 = typename Traits2::template rebind_alloc<CompletionImpl>;
+  using RebindTraits2 = std::allocator_traits<RebindAlloc2>;
+
+  // placement new for the handler allocator
+  static void* operator new(size_t, RebindAlloc2 alloc2) {
+    return RebindTraits2::allocate(alloc2, 1);
+  }
+  // placement delete for when the constructor throws during placement new
+  static void operator delete(void *p, RebindAlloc2 alloc2) {
+    RebindTraits2::deallocate(alloc2, static_cast<CompletionImpl*>(p), 1);
+  }
+  // bundle the handler with its arguments via CompletionHandler/forward_handler
+  static auto bind_and_forward(Handler&& h, std::tuple<Args...>&& args) {
+    return forward_handler(CompletionHandler{std::move(h), std::move(args)});
+  }
+  // destroy_*(): copy the allocator BEFORE moving the handler out, free, submit
+  void destroy_defer(std::tuple<Args...>&& args) override {
+    auto w = std::move(work);
+    RebindAlloc2 alloc2 = boost::asio::get_associated_allocator(handler);
+    auto f = bind_and_forward(std::move(handler), std::move(args));
+    RebindTraits2::destroy(alloc2, this);
+    RebindTraits2::deallocate(alloc2, this, 1);
+    w.second.get_executor().defer(std::move(f), alloc2);
+  }
+  void destroy_dispatch(std::tuple<Args...>&& args) override {
+    auto w = std::move(work);
+    RebindAlloc2 alloc2 = boost::asio::get_associated_allocator(handler);
+    auto f = bind_and_forward(std::move(handler), std::move(args));
+    RebindTraits2::destroy(alloc2, this);
+    RebindTraits2::deallocate(alloc2, this, 1);
+    w.second.get_executor().dispatch(std::move(f), alloc2);
+  }
+  void destroy_post(std::tuple<Args...>&& args) override {
+    auto w = std::move(work);
+    RebindAlloc2 alloc2 = boost::asio::get_associated_allocator(handler);
+    auto f = bind_and_forward(std::move(handler), std::move(args));
+    RebindTraits2::destroy(alloc2, this);
+    RebindTraits2::deallocate(alloc2, this, 1);
+    w.second.get_executor().post(std::move(f), alloc2);
+  }
+  void destroy() override { // free without invoking the handler
+    RebindAlloc2 alloc2 = boost::asio::get_associated_allocator(handler);
+    RebindTraits2::destroy(alloc2, this);
+    RebindTraits2::deallocate(alloc2, this, 1);
+  }
+
+  // constructor is private, use create(). extra constructor arguments are
+  // forwarded to UserData
+  template <typename ...TArgs>
+  CompletionImpl(const Executor1& ex1, Handler&& handler, TArgs&& ...args)
+    : Completion<void(Args...), T>(std::forward<TArgs>(args)...),
+      work(ex1, boost::asio::make_work_guard(handler, ex1)),
+      handler(std::move(handler))
+  {}
+
+ public:
+  template <typename ...TArgs>
+  static auto create(const Executor1& ex, Handler&& handler, TArgs&& ...args) {
+    auto alloc2 = boost::asio::get_associated_allocator(handler);
+    using Ptr = std::unique_ptr<CompletionImpl>;
+    return Ptr{new (alloc2) CompletionImpl(ex, std::move(handler),
+                                           std::forward<TArgs>(args)...)};
+  }
+
+  static void operator delete(void *p) {
+    static_cast<CompletionImpl*>(p)->destroy();
+  }
+};
+
+} // namespace detail
+
+
+template <typename T, typename ...Args>
+template <typename Executor1, typename Handler, typename ...TArgs>
+std::unique_ptr<Completion<void(Args...), T>>
+Completion<void(Args...), T>::create(const Executor1& ex,
+                                     Handler&& handler, TArgs&& ...args)
+{
+  // instantiate the concrete implementation; callers get the type-erased base
+  return detail::CompletionImpl<Executor1, Handler, T, Args...>::create(
+      ex, std::forward<Handler>(handler), std::forward<TArgs>(args)...);
+}
+
+template <typename T, typename ...Args>
+template <typename ...Args2>
+void Completion<void(Args...), T>::defer(std::unique_ptr<Completion>&& ptr,
+                                         Args2&& ...args)
+{
+  // release ownership first; destroy_defer() disposes of the completion itself
+  ptr.release()->destroy_defer(std::make_tuple(std::forward<Args2>(args)...));
+}
+
+template <typename T, typename ...Args>
+template <typename ...Args2>
+void Completion<void(Args...), T>::dispatch(std::unique_ptr<Completion>&& ptr,
+                                            Args2&& ...args)
+{
+  // release ownership first; destroy_dispatch() disposes of the completion
+  ptr.release()->destroy_dispatch(std::make_tuple(std::forward<Args2>(args)...));
+}
+
+template <typename T, typename ...Args>
+template <typename ...Args2>
+void Completion<void(Args...), T>::post(std::unique_ptr<Completion>&& ptr,
+                                        Args2&& ...args)
+{
+  // release ownership first; destroy_post() disposes of the completion itself
+  ptr.release()->destroy_post(std::make_tuple(std::forward<Args2>(args)...));
+}
+
+
+/// free-function form of Completion::create(): uses the handler's associated
+/// allocator; any additional arguments are forwarded to T's constructor
+template <typename Signature, typename T, typename Executor1,
+          typename Handler, typename ...TArgs>
+std::unique_ptr<Completion<Signature, T>>
+create_completion(const Executor1& ex, Handler&& handler, TArgs&& ...args)
+{
+  using completion_type = Completion<Signature, T>;
+  return completion_type::create(ex, std::forward<Handler>(handler), std::forward<TArgs>(args)...);
+}
+
+/// free-function form of Completion::defer(); takes ownership of the completion
+template <typename Signature, typename T, typename ...Args>
+void defer(std::unique_ptr<Completion<Signature, T>>&& ptr, Args&& ...args)
+{
+  using completion_type = Completion<Signature, T>;
+  completion_type::defer(std::move(ptr), std::forward<Args>(args)...);
+}
+
+/// free-function form of Completion::dispatch(); takes ownership of the completion
+template <typename Signature, typename T, typename ...Args>
+void dispatch(std::unique_ptr<Completion<Signature, T>>&& ptr, Args&& ...args)
+{
+  using completion_type = Completion<Signature, T>;
+  completion_type::dispatch(std::move(ptr), std::forward<Args>(args)...);
+}
+
+/// free-function form of Completion::post(); takes ownership of the completion
+template <typename Signature, typename T, typename ...Args>
+void post(std::unique_ptr<Completion<Signature, T>>&& ptr, Args&& ...args)
+{
+  using completion_type = Completion<Signature, T>;
+  completion_type::post(std::move(ptr), std::forward<Args>(args)...);
+}
+
+} // namespace ceph::async
+
+#endif // CEPH_ASYNC_COMPLETION_H
diff --git a/src/common/async/context_pool.h b/src/common/async/context_pool.h
new file mode 100644
index 000000000..9c6cab767
--- /dev/null
+++ b/src/common/async/context_pool.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ASYNC_CONTEXT_POOL_H
+#define CEPH_COMMON_ASYNC_CONTEXT_POOL_H
+
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+#include <optional>
+#include <thread>
+#include <vector>
+
+#include <boost/asio/io_context.hpp>
+#include <boost/asio/executor_work_guard.hpp>
+
+#include "common/ceph_mutex.h"
+#include "common/Thread.h"
+
+namespace ceph::async {
+class io_context_pool {
+  std::vector<std::thread> threadvec; // workers; start/finish/stop test empty()
+  boost::asio::io_context ioctx;
+  std::optional<boost::asio::executor_work_guard<
+		  boost::asio::io_context::executor_type>> guard;
+  ceph::mutex m = make_mutex("ceph::io_context_pool::m");
+  // drop the work guard, then join and forget every worker; callers hold m
+  void cleanup() noexcept {
+    guard.reset();
+    for (std::thread& worker : threadvec) {
+      worker.join();
+    }
+    threadvec.clear();
+  }
+public:
+  io_context_pool() noexcept {}
+  io_context_pool(std::int16_t threadcnt) noexcept {
+    start(threadcnt);
+  }
+  ~io_context_pool() {
+    stop();
+  }
+  void start(std::int16_t threadcnt) noexcept {
+    std::scoped_lock lock(m);
+    if (!threadvec.empty()) {
+      return; // workers already exist
+    }
+    // install the work guard and restart the io_context, then spawn workers
+    guard.emplace(boost::asio::make_work_guard(ioctx));
+    ioctx.restart();
+    for (std::int16_t remaining = threadcnt; remaining > 0; --remaining) {
+      threadvec.emplace_back(
+        make_named_thread("io_context_pool", [this]() { ioctx.run(); }));
+    }
+  }
+  void finish() noexcept {
+    // join the workers without calling ioctx.stop()
+    std::scoped_lock lock(m);
+    if (threadvec.empty()) return;
+    cleanup();
+  }
+  void stop() noexcept {
+    // stop the io_context first, then join the workers
+    std::scoped_lock lock(m);
+    if (threadvec.empty()) return;
+    ioctx.stop();
+    cleanup();
+  }
+  // accessors for the shared io_context and its executor
+  boost::asio::io_context& get_io_context() {
+    return ioctx;
+  }
+  operator boost::asio::io_context&() {
+    return get_io_context();
+  }
+  boost::asio::io_context::executor_type get_executor() {
+    return ioctx.get_executor();
+  }
+};
+}
+
+#endif // CEPH_COMMON_ASYNC_CONTEXT_POOL_H
diff --git a/src/common/async/detail/shared_lock.h b/src/common/async/detail/shared_lock.h
new file mode 100644
index 000000000..12e6a9220
--- /dev/null
+++ b/src/common/async/detail/shared_lock.h
@@ -0,0 +1,185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+namespace std {
+
+// specialize unique_lock and shared_lock for SharedMutex to operate on
+// SharedMutexImpl instead, because the locks may outlive the SharedMutex itself
+
+template <typename Executor>
+class unique_lock<ceph::async::SharedMutex<Executor>> {
+ public:
+  using mutex_type = boost::intrusive_ptr<ceph::async::detail::SharedMutexImpl>;
+  // every constructor that takes a SharedMutex keeps a reference to m.impl
+  unique_lock() = default;
+  explicit unique_lock(ceph::async::SharedMutex<Executor>& m) : impl(m.impl)
+  {
+    impl->lock();
+    locked = true; // only reached once lock() has returned
+  }
+  unique_lock(ceph::async::SharedMutex<Executor>& m, defer_lock_t) noexcept
+    : impl(m.impl)
+  {}
+  unique_lock(ceph::async::SharedMutex<Executor>& m, try_to_lock_t)
+    : impl(m.impl), locked(impl->try_lock())
+  {}
+  unique_lock(ceph::async::SharedMutex<Executor>& m, adopt_lock_t) noexcept
+    : impl(m.impl), locked(true)
+  {}
+  ~unique_lock() {
+    if (owns_lock())
+      impl->unlock();
+  }
+  // moves transfer the mutex reference and leave the source not owning
+  unique_lock(unique_lock&& rhs) noexcept
+    : impl(std::move(rhs.impl)),
+      locked(rhs.locked) {
+    rhs.locked = false;
+  }
+  unique_lock& operator=(unique_lock&& rhs) noexcept {
+    if (owns_lock()) {
+      impl->unlock(); // release what we currently hold first
+    }
+    impl = std::move(rhs.impl);
+    locked = rhs.locked;
+    rhs.locked = false;
+    return *this;
+  }
+  void swap(unique_lock& rhs) noexcept {
+    using std::swap;
+    swap(impl, rhs.impl);
+    swap(locked, rhs.locked);
+  }
+
+  mutex_type mutex() const noexcept { return impl; }
+  bool owns_lock() const noexcept { return impl && locked; }
+  explicit operator bool() const noexcept { return owns_lock(); }
+  // give up the mutex reference without unlocking it
+  mutex_type release() {
+    mutex_type detached = std::move(impl);
+    locked = false;
+    return detached;
+  }
+
+  void lock() {
+    if (!impl) // no mutex attached
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    if (locked) // already held through this object
+      throw system_error(make_error_code(errc::resource_deadlock_would_occur));
+    impl->lock();
+    locked = true;
+  }
+  bool try_lock() {
+    if (!impl) // no mutex attached
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    if (locked) // already held through this object
+      throw system_error(make_error_code(errc::resource_deadlock_would_occur));
+    return locked = impl->try_lock();
+  }
+  void unlock() {
+    if (!owns_lock())
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    impl->unlock();
+    locked = false;
+  }
+ private:
+  mutex_type impl;
+  bool locked{false};
+};
+
+template <typename Executor>
+class shared_lock<ceph::async::SharedMutex<Executor>> {
+ public:
+  using mutex_type = boost::intrusive_ptr<ceph::async::detail::SharedMutexImpl>;
+  // every constructor that takes a SharedMutex keeps a reference to m.impl
+  shared_lock() = default;
+  explicit shared_lock(ceph::async::SharedMutex<Executor>& m) : impl(m.impl)
+  {
+    impl->lock_shared();
+    locked = true; // only reached once lock_shared() has returned
+  }
+  shared_lock(ceph::async::SharedMutex<Executor>& m, defer_lock_t) noexcept
+    : impl(m.impl)
+  {}
+  shared_lock(ceph::async::SharedMutex<Executor>& m, try_to_lock_t)
+    : impl(m.impl), locked(impl->try_lock_shared())
+  {}
+  shared_lock(ceph::async::SharedMutex<Executor>& m, adopt_lock_t) noexcept
+    : impl(m.impl), locked(true)
+  {}
+
+  ~shared_lock() {
+    if (owns_lock())
+      impl->unlock_shared();
+  }
+  // moves transfer the mutex reference and leave the source not owning
+  shared_lock(shared_lock&& rhs) noexcept
+    : impl(std::move(rhs.impl)),
+      locked(rhs.locked) {
+    rhs.locked = false;
+  }
+  shared_lock& operator=(shared_lock&& rhs) noexcept {
+    if (owns_lock()) {
+      impl->unlock_shared(); // release what we currently hold first
+    }
+    impl = std::move(rhs.impl);
+    locked = rhs.locked;
+    rhs.locked = false;
+    return *this;
+  }
+  void swap(shared_lock& rhs) noexcept {
+    using std::swap;
+    swap(impl, rhs.impl);
+    swap(locked, rhs.locked);
+  }
+
+  mutex_type mutex() const noexcept { return impl; }
+  bool owns_lock() const noexcept { return impl && locked; }
+  explicit operator bool() const noexcept { return owns_lock(); }
+  // give up the mutex reference without unlocking it
+  mutex_type release() {
+    mutex_type detached = std::move(impl);
+    locked = false;
+    return detached;
+  }
+
+  void lock() {
+    if (!impl) // no mutex attached
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    if (locked) // already held through this object
+      throw system_error(make_error_code(errc::resource_deadlock_would_occur));
+    impl->lock_shared();
+    locked = true;
+  }
+  bool try_lock() {
+    if (!impl) // no mutex attached
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    if (locked) // already held through this object
+      throw system_error(make_error_code(errc::resource_deadlock_would_occur));
+    return locked = impl->try_lock_shared();
+  }
+  void unlock() {
+    if (!owns_lock())
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    impl->unlock_shared();
+    locked = false;
+  }
+ private:
+  mutex_type impl;
+  bool locked{false};
+};
+
+} // namespace std
diff --git a/src/common/async/detail/shared_mutex.h b/src/common/async/detail/shared_mutex.h
new file mode 100644
index 000000000..8e5436350
--- /dev/null
+++ b/src/common/async/detail/shared_mutex.h
@@ -0,0 +1,326 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <optional>
+#include <shared_mutex> // for std::shared_lock
+
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/intrusive/list.hpp>
+
+#include "include/ceph_assert.h"
+
+#include "common/async/completion.h"
+
+namespace ceph::async::detail {
+
+struct LockRequest : public boost::intrusive::list_base_hook<> {
+  virtual ~LockRequest() {}
+  virtual void complete(boost::system::error_code ec) = 0; //< deliver result
+  virtual void destroy() = 0; //< dispose of a request that was not completed
+};
+
+class SharedMutexImpl : public boost::intrusive_ref_counter<SharedMutexImpl> {
+ public:
+  ~SharedMutexImpl();
+  // the Mutex argument supplies the executor and is what lock guards refer to
+  template <typename Mutex, typename CompletionToken>
+  auto async_lock(Mutex& mtx, CompletionToken&& token);
+  void lock();
+  void lock(boost::system::error_code& ec);
+  bool try_lock();
+  void unlock();
+  template <typename Mutex, typename CompletionToken>
+  auto async_lock_shared(Mutex& mtx, CompletionToken&& token);
+  void lock_shared();
+  void lock_shared(boost::system::error_code& ec);
+  bool try_lock_shared();
+  void unlock_shared();
+  void cancel(); //< fail all queued requests with operation_aborted
+
+ private:
+  using RequestList = boost::intrusive::list<LockRequest>;
+
+  RequestList shared_queue; //< requests waiting on a shared lock
+  RequestList exclusive_queue; //< requests waiting on an exclusive lock
+
+  /// lock state encodes the number of shared lockers, or 'max' for exclusive
+  using LockState = uint16_t;
+  static constexpr LockState Unlocked = 0;
+  static constexpr LockState Exclusive = std::numeric_limits<LockState>::max();
+  static constexpr LockState MaxShared = Exclusive - 1;
+  LockState state = Unlocked; //< current lock state
+
+  std::mutex mutex; //< protects lock state and wait queues
+  /// complete each request with 'ec'; if one throws, destroy the remainder
+  void complete(RequestList&& requests, boost::system::error_code ec);
+};
+
+// sync requests live on the stack and wait on a condition variable
+class SyncRequest : public LockRequest {
+  std::condition_variable cond; //< signaled by complete()
+  std::optional<boost::system::error_code> ec; //< empty until complete()
+ public:
+  boost::system::error_code wait(std::unique_lock<std::mutex>& lock) {
+    // return the error code once it's been set
+    cond.wait(lock, [this] { return ec; });
+    return *ec;
+  }
+  void complete(boost::system::error_code ec) override {
+    this->ec = ec; // store the result before waking the waiter
+    cond.notify_one();
+  }
+  void destroy() override {
+    // nothing, SyncRequests live on the stack
+  }
+};
+
+// async requests use async::Completion to invoke a handler on its executor
+template <typename Mutex, template <typename> typename Lock>
+class AsyncRequest : public LockRequest {
+  Mutex& mutex; //< mutex argument for lock guard
+ public:
+  explicit AsyncRequest(Mutex& mutex) : mutex(mutex) {}
+  // handlers receive the error code and a Lock guard over the mutex
+  using Signature = void(boost::system::error_code, Lock<Mutex>);
+  using LockCompletion = Completion<Signature, AsBase<AsyncRequest>>;
+  // NOTE(review): the casts below assume this object is a LockCompletion
+  void complete(boost::system::error_code ec) override {
+    auto r = static_cast<LockCompletion*>(this);
+    // pass ownership of ourselves to post(). on error, pass an empty lock
+    post(std::unique_ptr<LockCompletion>{r}, ec,
+         ec ? Lock{mutex, std::defer_lock} : Lock{mutex, std::adopt_lock});
+  }
+  void destroy() override {
+    delete static_cast<LockCompletion*>(this);
+  }
+};
+
+inline SharedMutexImpl::~SharedMutexImpl()
+{
+  ceph_assert(state == Unlocked); // must not be destroyed while locked
+  ceph_assert(shared_queue.empty()); // nor with requests still queued
+  ceph_assert(exclusive_queue.empty());
+}
+
+template <typename Mutex, typename CompletionToken>
+auto SharedMutexImpl::async_lock(Mutex& mtx, CompletionToken&& token)
+{
+  using Request = AsyncRequest<Mutex, std::unique_lock>;
+  using Signature = typename Request::Signature;
+  boost::asio::async_completion<CompletionToken, Signature> init(token);
+  auto& handler = init.completion_handler;
+  auto ex1 = mtx.get_executor();
+  {
+    std::lock_guard lock{mutex};
+    // grant immediately when unlocked, otherwise queue an exclusive request
+    boost::system::error_code ec;
+    if (state == Unlocked) {
+      state = Exclusive;
+
+      // post a successful completion
+      auto ex2 = boost::asio::get_associated_executor(handler, ex1);
+      auto alloc2 = boost::asio::get_associated_allocator(handler);
+      auto b = bind_handler(std::move(handler), ec,
+                            std::unique_lock{mtx, std::adopt_lock});
+      ex2.post(forward_handler(std::move(b)), alloc2);
+    } else {
+      // create a request and add it to the exclusive list
+      using LockCompletion = typename Request::LockCompletion;
+      auto request = LockCompletion::create(ex1, std::move(handler), mtx);
+      exclusive_queue.push_back(*request.release());
+    }
+  }
+  return init.result.get();
+}
+
+inline void SharedMutexImpl::lock()
+{
+  boost::system::error_code ec;
+  lock(ec); // the error_code overload reports failure through ec
+  if (ec) {
+    throw boost::system::system_error(ec);
+  }
+}
+
+inline void SharedMutexImpl::lock(boost::system::error_code& ec)
+{
+  std::unique_lock lock{mutex};
+  // take the lock if it is free; otherwise queue and block until completed
+  if (state == Unlocked) {
+    state = Exclusive;
+    ec.clear();
+  } else {
+    SyncRequest request;
+    exclusive_queue.push_back(request);
+    ec = request.wait(lock); // result set by whoever completes the request
+  }
+}
+
+inline bool SharedMutexImpl::try_lock()
+{
+  std::lock_guard guard{mutex};
+  // exclusive ownership is only available when nobody holds the lock
+  const bool available = (state == Unlocked);
+  if (available) {
+    state = Exclusive;
+  }
+  return available;
+}
+
+inline void SharedMutexImpl::unlock()
+{
+  RequestList granted; //< completed below, after the internal mutex is released
+  {
+    std::lock_guard lock{mutex};
+    ceph_assert(state == Exclusive);
+    if (!exclusive_queue.empty()) {
+      // grant next exclusive lock; state stays Exclusive for the new owner
+      auto& request = exclusive_queue.front();
+      exclusive_queue.pop_front();
+      granted.push_back(request);
+    } else {
+      // grant shared locks, if any. test the untruncated count against the cap
+      const auto waiting = shared_queue.size();
+      if (waiting > MaxShared) {
+        state = MaxShared;
+        auto end = std::next(shared_queue.begin(), MaxShared);
+        granted.splice(granted.end(), shared_queue,
+                       shared_queue.begin(), end, MaxShared);
+      } else {
+        state = static_cast<LockState>(waiting);
+        granted.splice(granted.end(), shared_queue);
+      }
+    }
+  }
+  complete(std::move(granted), boost::system::error_code{});
+}
+
+template <typename Mutex, typename CompletionToken>
+auto SharedMutexImpl::async_lock_shared(Mutex& mtx, CompletionToken&& token)
+{
+  using Request = AsyncRequest<Mutex, std::shared_lock>;
+  using Signature = typename Request::Signature;
+  boost::asio::async_completion<CompletionToken, Signature> init(token);
+  auto& handler = init.completion_handler;
+  auto ex1 = mtx.get_executor();
+  {
+    std::lock_guard lock{mutex};
+    // shared locks yield to queued exclusive requests and to the MaxShared cap
+    boost::system::error_code ec;
+    if (exclusive_queue.empty() && state < MaxShared) {
+      state++;
+      // post a successful completion
+      auto ex2 = boost::asio::get_associated_executor(handler, ex1);
+      auto alloc2 = boost::asio::get_associated_allocator(handler);
+      auto b = bind_handler(std::move(handler), ec,
+                            std::shared_lock{mtx, std::adopt_lock});
+      ex2.post(forward_handler(std::move(b)), alloc2);
+    } else {
+      using LockCompletion = typename Request::LockCompletion;
+      auto request = LockCompletion::create(ex1, std::move(handler), mtx);
+      shared_queue.push_back(*request.release());
+    }
+  }
+  return init.result.get();
+}
+
+inline void SharedMutexImpl::lock_shared()
+{
+  boost::system::error_code ec;
+  lock_shared(ec); // the error_code overload reports failure through ec
+  if (ec) {
+    throw boost::system::system_error(ec);
+  }
+}
+
+inline void SharedMutexImpl::lock_shared(boost::system::error_code& ec)
+{
+  std::unique_lock lock{mutex};
+  // take a shared slot if allowed; otherwise queue and block until completed
+  if (exclusive_queue.empty() && state < MaxShared) {
+    state++;
+    ec.clear();
+  } else {
+    SyncRequest request;
+    shared_queue.push_back(request);
+    ec = request.wait(lock); // result set by whoever completes the request
+  }
+}
+
+inline bool SharedMutexImpl::try_lock_shared()
+{
+  std::lock_guard guard{mutex};
+  // a shared slot needs no queued exclusive request and room below the cap
+  const bool available = exclusive_queue.empty() && state < MaxShared;
+  if (available) {
+    state++;
+  }
+  return available;
+}
+
+inline void SharedMutexImpl::unlock_shared()
+{
+  std::lock_guard lock{mutex};
+  ceph_assert(state != Unlocked && state <= MaxShared);
+  // requests granted here are completed while the internal mutex is held
+  if (state == 1 && !exclusive_queue.empty()) {
+    // grant next exclusive lock
+    state = Exclusive;
+    auto& request = exclusive_queue.front();
+    exclusive_queue.pop_front();
+    request.complete(boost::system::error_code{});
+  } else if (state == MaxShared && !shared_queue.empty() &&
+             exclusive_queue.empty()) {
+    // grant next shared lock; the holder count stays at MaxShared
+    auto& request = shared_queue.front();
+    shared_queue.pop_front();
+    request.complete(boost::system::error_code{});
+  } else {
+    state--;
+  }
+}
+
+inline void SharedMutexImpl::cancel()
+{
+  RequestList canceled;
+  { // detach all waiters under the lock, complete them after releasing it
+    std::lock_guard lock{mutex};
+    canceled.splice(canceled.end(), shared_queue); // shared waiters first
+    canceled.splice(canceled.end(), exclusive_queue);
+  }
+  complete(std::move(canceled), boost::asio::error::operation_aborted);
+}
+
+inline void SharedMutexImpl::complete(RequestList&& requests,
+                                      boost::system::error_code ec)
+{
+  while (!requests.empty()) {
+    auto& request = requests.front();
+    requests.pop_front(); // unlink before completing the request
+    try {
+      request.complete(ec);
+    } catch (...) {
+      // clean up any remaining completions and rethrow
+      requests.clear_and_dispose([] (LockRequest *r) { r->destroy(); });
+      throw;
+    }
+  }
+}
+
+} // namespace ceph::async::detail
diff --git a/src/common/async/forward_handler.h b/src/common/async/forward_handler.h
new file mode 100644
index 000000000..ae88cc83f
--- /dev/null
+++ b/src/common/async/forward_handler.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNC_FORWARD_HANDLER_H
+#define CEPH_ASYNC_FORWARD_HANDLER_H
+
+#include <boost/asio.hpp>
+
+namespace ceph::async {
+
+/**
+ * A forwarding completion handler for use with boost::asio.
+ *
+ * A completion handler wrapper that invokes the handler's operator() as an
+ * rvalue, regardless of whether the wrapper is invoked as an lvalue or rvalue.
+ * This operation is potentially destructive to the wrapped handler, so is only
+ * suitable for single-use handlers.
+ *
+ * This is useful when combined with bind_handler() and move-only arguments,
+ * because executors will always call the lvalue overload of operator().
+ *
+ * The original Handler's associated allocator and executor are maintained.
+ *
+ * @see forward_handler
+ */
+template <typename Handler>
+struct ForwardingHandler {
+  Handler handler;
+  // takes the wrapped handler by rvalue and moves it into place
+  ForwardingHandler(Handler&& handler)
+    : handler(std::move(handler))
+  {}
+  // always invokes the wrapped handler as an rvalue
+  template <typename ...Args>
+  void operator()(Args&& ...args) {
+    std::move(handler)(std::forward<Args>(args)...);
+  }
+  // expose the wrapped handler's associated allocator
+  using allocator_type = boost::asio::associated_allocator_t<Handler>;
+  allocator_type get_allocator() const noexcept {
+    return boost::asio::get_associated_allocator(handler);
+  }
+};
+
+} // namespace ceph::async
+
+namespace boost::asio {
+
+// specialize boost::asio::associated_executor<> for ForwardingHandler
+template <typename Handler, typename Executor>
+struct associated_executor<ceph::async::ForwardingHandler<Handler>, Executor> {
+  using type = boost::asio::associated_executor_t<Handler, Executor>;
+  // delegate to the wrapped handler, with 'ex' as the fallback executor
+  static type get(const ceph::async::ForwardingHandler<Handler>& handler,
+                  const Executor& ex = Executor()) noexcept {
+    return boost::asio::get_associated_executor(handler.handler, ex);
+  }
+};
+
+} // namespace boost::asio
+
+namespace ceph::async {
+
+/**
+ * Returns a single-use completion handler that always forwards on operator().
+ *
+ * Wraps a completion handler such that it is always invoked as an rvalue. This
+ * is necessary when combining executors and bind_handler() with move-only
+ * argument types.
+ *
+ * Example use:
+ *
+ *   auto callback = [] (std::unique_ptr<int>&& p) {};
+ *   auto bound_handler = bind_handler(callback, std::make_unique<int>(5));
+ *   auto handler = forward_handler(std::move(bound_handler));
+ *
+ *   // execute the forwarding handler on an io_context:
+ *   boost::asio::io_context context;
+ *   boost::asio::post(context, std::move(handler));
+ *   context.run();
+ *
+ * @see ForwardingHandler
+ */
+template <typename Handler>
+auto forward_handler(Handler&& h)
+{
+  return ForwardingHandler{std::forward<Handler>(h)}; // type deduced from h
+}
+
+} // namespace ceph::async
+
+#endif // CEPH_ASYNC_FORWARD_HANDLER_H
diff --git a/src/common/async/librados_completion.h b/src/common/async/librados_completion.h
new file mode 100644
index 000000000..2fa5555e7
--- /dev/null
+++ b/src/common/async/librados_completion.h
@@ -0,0 +1,125 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ASYNC_LIBRADOS_COMPLETION_H
+#define CEPH_COMMON_ASYNC_LIBRADOS_COMPLETION_H
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <optional>
+#include <type_traits>
+
+#include <boost/asio/async_result.hpp>
+
+#include <boost/system/error_code.hpp>
+#include <boost/system/system_error.hpp>
+
+#include "include/rados/librados.hpp"
+#include "librados/AioCompletionImpl.h"
+
+// Allow librados::AioCompletion to be provided as a completion
+// handler. This is only allowed with a signature of
+// (boost::system::error_code) or (). On completion the AioCompletion
+// is completed with the error_code converted to an int with
+// ceph::from_error_code.
+//
+// async_result::return_type is void.
+
+namespace ceph::async {
+
+namespace bs = boost::system;
+namespace lr = librados;
+
+namespace detail {
+
+struct librados_handler {
+  lr::AioCompletionImpl* pc;
+  // get()/put() bracket this handler's use of pc (presumably a ref count)
+  explicit librados_handler(lr::AioCompletion* c) : pc(c->pc) {
+    pc->get();
+  }
+  ~librados_handler() {
+    if (pc) {
+      pc->put();
+      pc = nullptr;
+    }
+  }
+  // move-only: a moved-from handler has a null pc, so its destructor is a no-op
+  librados_handler(const librados_handler&) = delete;
+  librados_handler& operator =(const librados_handler&) = delete;
+  librados_handler(librados_handler&& rhs) {
+    pc = rhs.pc;
+    rhs.pc = nullptr;
+  }
+  // record the result, run the callbacks unlocked, then wake waiters
+  void operator()(bs::error_code ec) {
+    pc->lock.lock();
+    pc->rval = ceph::from_error_code(ec);
+    pc->complete = true;
+    pc->lock.unlock();
+
+    auto cb_complete = pc->callback_complete;
+    auto cb_complete_arg = pc->callback_complete_arg;
+    if (cb_complete)
+      cb_complete(pc, cb_complete_arg);
+
+    auto cb_safe = pc->callback_safe;
+    auto cb_safe_arg = pc->callback_safe_arg;
+    if (cb_safe)
+      cb_safe(pc, cb_safe_arg);
+
+    pc->lock.lock();
+    pc->callback_complete = NULL;
+    pc->callback_safe = NULL;
+    pc->cond.notify_all();
+    pc->put_unlock();
+    pc = nullptr; // the destructor must not call put() again
+  }
+  // signature-less completion: treated as success
+  void operator ()() {
+    (*this)(bs::error_code{});
+  }
+};
+} // namespace detail
+} // namespace ceph::async
+
+
+namespace boost::asio {
+template<typename ReturnType>
+class async_result<librados::AioCompletion*, ReturnType()> {
+public:
+  using completion_handler_type = ceph::async::detail::librados_handler;
+  explicit async_result(completion_handler_type&) {};
+  using return_type = void;
+  void get() { // nothing to return; the initiating call yields void
+    return;
+  }
+};
+
+template<typename ReturnType>
+class async_result<librados::AioCompletion*,
+		   ReturnType(boost::system::error_code)> {
+public:
+  using completion_handler_type = ceph::async::detail::librados_handler;
+  explicit async_result(completion_handler_type&) {};
+  using return_type = void;
+  void get() { // nothing to return; the initiating call yields void
+    return;
+  }
+};
+}
+
+#endif // !CEPH_COMMON_ASYNC_LIBRADOS_COMPLETION_H
diff --git a/src/common/async/shared_mutex.h b/src/common/async/shared_mutex.h
new file mode 100644
index 000000000..3e471a4df
--- /dev/null
+++ b/src/common/async/shared_mutex.h
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "common/async/detail/shared_mutex.h"
+
+namespace ceph::async {
+
+/**
+ * An asynchronous shared mutex for use with boost::asio.
+ *
+ * A shared mutex class with asynchronous lock operations that complete on a
+ * boost::asio executor. The class also has synchronous interfaces that meet
+ * most of the standard library's requirements for the SharedMutex concept,
+ * which makes it compatible with lock_guard, unique_lock, and shared_lock.
+ *
+ * All lock requests can fail with operation_aborted on cancel() or destruction.
+ * The non-error_code overloads of lock() and lock_shared() will throw this
+ * error as an exception of type boost::system::system_error.
+ *
+ * Exclusive locks are prioritized over shared locks. Locks of the same type
+ * are granted in fifo order. The implementation defines a limit on the number
+ * of shared locks to 65534 at a time.
+ *
+ * Example use:
+ *
+ *   boost::asio::io_context context;
+ *   SharedMutex mutex{context.get_executor()};
+ *
+ *   mutex.async_lock([&] (boost::system::error_code ec, auto lock) {
+ *       if (!ec) {
+ *         // mutate shared state ...
+ *       }
+ *     });
+ *   mutex.async_lock_shared([&] (boost::system::error_code ec, auto lock) {
+ *       if (!ec) {
+ *         // read shared state ...
+ *       }
+ *     });
+ *
+ *   context.run();
+ */
+template <typename Executor>
+class SharedMutex {
+ public:
+  explicit SharedMutex(const Executor& ex); //< ex: default callback executor
+
+  /// on destruction, all pending lock requests are canceled
+  ~SharedMutex();
+
+  using executor_type = Executor;
+  executor_type get_executor() const noexcept { return ex; }
+
+  /// initiate an asynchronous request for an exclusive lock. when the lock is
+  /// granted, the completion handler is invoked with a successful error code
+  /// and a std::unique_lock that owns this mutex.
+  /// Signature = void(boost::system::error_code, std::unique_lock)
+  template <typename CompletionToken>
+  auto async_lock(CompletionToken&& token);
+
+  /// wait synchronously for an exclusive lock. if an error occurs before the
+  /// lock is granted, that error is thrown as an exception
+  void lock();
+
+  /// wait synchronously for an exclusive lock. if an error occurs before the
+  /// lock is granted, that error is assigned to 'ec'
+  void lock(boost::system::error_code& ec);
+
+  /// try to acquire an exclusive lock. if the lock is not immediately
+  /// available, returns false
+  bool try_lock();
+
+  /// releases an exclusive lock. not required to be called from the same thread
+  /// that initiated the lock
+  void unlock();
+
+  /// initiate an asynchronous request for a shared lock. when the lock is
+  /// granted, the completion handler is invoked with a successful error code
+  /// and a std::shared_lock that owns this mutex.
+  /// Signature = void(boost::system::error_code, std::shared_lock)
+  template <typename CompletionToken>
+  auto async_lock_shared(CompletionToken&& token);
+
+  /// wait synchronously for a shared lock. if an error occurs before the
+  /// lock is granted, that error is thrown as an exception
+  void lock_shared();
+
+  /// wait synchronously for a shared lock. if an error occurs before the lock
+  /// is granted, that error is assigned to 'ec'
+  void lock_shared(boost::system::error_code& ec);
+
+  /// try to acquire a shared lock. if the lock is not immediately available,
+  /// returns false
+  bool try_lock_shared();
+
+  /// releases a shared lock. not required to be called from the same thread
+  /// that initiated the lock
+  void unlock_shared();
+
+  /// cancel any pending requests for exclusive or shared locks with an
+  /// operation_aborted error
+  void cancel();
+
+ private:
+  Executor ex; //< default callback executor
+  boost::intrusive_ptr<detail::SharedMutexImpl> impl; //< lock state and queues
+
+  // allow lock guards to access impl
+  friend class std::unique_lock<SharedMutex>;
+  friend class std::shared_lock<SharedMutex>;
+};
+
+
+template <typename Executor>
+SharedMutex<Executor>::SharedMutex(const Executor& ex)
+  : ex(ex), impl(new detail::SharedMutexImpl)
+{ // each mutex starts with its own freshly allocated implementation object
+}
+
+template <typename Executor>
+SharedMutex<Executor>::~SharedMutex()
+{
+  try {
+    impl->cancel(); // fail any pending requests with operation_aborted
+  } catch (...) {
+    // swallow any exceptions, the destructor can't throw
+  }
+}
+
+template <typename Executor>
+template <typename CompletionToken>
+auto SharedMutex<Executor>::async_lock(CompletionToken&& token)
+{ // *this is passed so the resulting lock guard refers to this mutex
+  return impl->async_lock(*this, std::forward<CompletionToken>(token));
+}
+
+template <typename Executor>
+void SharedMutex<Executor>::lock()
+{
+  impl->lock(); // throws boost::system::system_error on failure
+}
+
+template <typename Executor>
+void SharedMutex<Executor>::lock(boost::system::error_code& ec)
+{
+  impl->lock(ec); // failure is reported through ec instead of thrown
+}
+
+template <typename Executor>
+bool SharedMutex<Executor>::try_lock()
+{
+  return impl->try_lock(); // never queues a request
+}
+
+template <typename Executor>
+void SharedMutex<Executor>::unlock()
+{
+  impl->unlock(); // hands the lock to queued requests, if any
+}
+
+template <typename Executor>
+template <typename CompletionToken>
+auto SharedMutex<Executor>::async_lock_shared(CompletionToken&& token)
+{ // *this is passed so the resulting lock guard refers to this mutex
+  return impl->async_lock_shared(*this, std::forward<CompletionToken>(token));
+}
+
+template <typename Executor>
+void SharedMutex<Executor>::lock_shared()
+{
+  impl->lock_shared(); // throws boost::system::system_error on failure
+}
+
+template <typename Executor>
+void SharedMutex<Executor>::lock_shared(boost::system::error_code& ec)
+{
+  impl->lock_shared(ec); // failure is reported through ec instead of thrown
+}
+
+template <typename Executor>
+bool SharedMutex<Executor>::try_lock_shared()
+{
+  return impl->try_lock_shared(); // never queues a request
+}
+
+template <typename Executor>
+void SharedMutex<Executor>::unlock_shared()
+{
+  impl->unlock_shared(); // may grant the lock to one queued request
+}
+
+template <typename Executor>
+void SharedMutex<Executor>::cancel()
+{
+  impl->cancel(); // queued requests complete with operation_aborted
+}
+
+} // namespace ceph::async
+
+#include "common/async/detail/shared_lock.h"
diff --git a/src/common/async/waiter.h b/src/common/async/waiter.h
new file mode 100644
index 000000000..219a27cf7
--- /dev/null
+++ b/src/common/async/waiter.h
@@ -0,0 +1,223 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_WAITER_H
+#define CEPH_COMMON_WAITER_H
+
+#include <condition_variable>
+#include <new>
+#include <tuple>
+#include <boost/asio/async_result.hpp>
+
+#include "include/ceph_assert.h"
+#include "include/function2.hpp"
+
+#include "common/ceph_mutex.h"
+
+namespace ceph::async {
+namespace detail {
+// For safety reasons (avoiding undefined behavior around sequence
+// points) std::reference_wrapper disallows move construction. This
+// harms us in cases where we want to pass a reference in to something
+// that unavoidably moves.
+//
+// It should not be used generally.
+template<typename T>
+class rvalue_reference_wrapper {
+public:
+  // types
+  using type = T;
+  // stores the address of r; the referenced object is not owned
+  rvalue_reference_wrapper(T& r) noexcept
+    : p(std::addressof(r)) {}
+
+  // We write our semantics to match those of reference collapsing. If
+  // we're treated as an lvalue, collapse to one.
+
+  rvalue_reference_wrapper(const rvalue_reference_wrapper&) noexcept = default;
+  rvalue_reference_wrapper(rvalue_reference_wrapper&&) noexcept = default;
+
+  // assignment
+  rvalue_reference_wrapper& operator=(
+    const rvalue_reference_wrapper& x) noexcept = default;
+  rvalue_reference_wrapper& operator=(
+    rvalue_reference_wrapper&& x) noexcept = default;
+  // const access yields an lvalue reference to the wrapped object
+  operator T& () const noexcept {
+    return *p;
+  }
+  T& get() const noexcept {
+    return *p;
+  }
+  // non-const access yields an rvalue reference to the wrapped object
+  operator T&& () noexcept {
+    return std::move(*p);
+  }
+  T&& get() noexcept {
+    return std::move(*p);
+  }
+  // invoke the wrapped object as an lvalue (const wrapper)
+  template<typename... Args>
+  std::result_of_t<T&(Args&&...)> operator ()(Args&&... args ) const {
+    return (*p)(std::forward<Args>(args)...);
+  }
+  // invoke the wrapped object as an rvalue (non-const wrapper)
+  template<typename... Args>
+  std::result_of_t<T&&(Args&&...)> operator ()(Args&&... args ) {
+    return std::move(*p)(std::forward<Args>(args)...);
+  }
+
+private:
+  T* p;
+};
+
+class base {
+protected:
+  ceph::mutex lock = ceph::make_mutex("ceph::async::detail::base::lock");
+  ceph::condition_variable cond;
+  bool has_value = false;
+  // protected: only derived classes may destroy a base
+  ~base() = default;
+  // block until has_value is set; returns the lock, still held
+  auto wait_base() {
+    std::unique_lock l(lock);
+    cond.wait(l, [this](){ return has_value; });
+    return l;
+  }
+  // mark the value as set and wake a waiter; returns the lock, still held
+  auto exec_base() {
+    std::unique_lock l(lock);
+    // There's no really good way to handle being called twice
+    // without being reset.
+    ceph_assert(!has_value);
+    has_value = true;
+    cond.notify_one();
+    return l;
+  }
+};
+}
+
+// waiter is a replacement for C_SafeCond and friends. It is the
+// moral equivalent of a future but plays well with a world of
+// callbacks.
+template<typename ...S>
+class waiter;
+
+template<>
+class waiter<> final : public detail::base {
+public:
+  void wait() {
+    auto l = wait_base(); // keep the lock held while the flag is reset
+    has_value = false;
+  }
+  // signal completion; asserts if called again before wait() resets the flag
+  void operator()() {
+    exec_base();
+  }
+  // wrapper that lets this waiter be handed to code that moves its callback
+  auto ref() {
+    return detail::rvalue_reference_wrapper(*this);
+  }
+
+  // NOTE: the returned function refers to *this and must not outlive it
+  operator fu2::unique_function<void() &&>() {
+    return fu2::unique_function<void() &&>(ref());
+  }
+};
+
+template<typename Ret>
+class waiter<Ret> final : public detail::base {
+  std::aligned_storage_t<sizeof(Ret), alignof(Ret)> ret; //< raw value storage
+
+public:
+  Ret wait() {
+    auto l = wait_base();
+    auto r = reinterpret_cast<Ret*>(&ret);
+    auto t = std::move(*r);
+    r->~Ret(); // storage is raw again until the next operator() call
+    has_value = false;
+    return t;
+  }
+
+  void operator()(Ret&& _ret) {
+    auto l = exec_base();
+    // the storage holds no live object here, so construct instead of assigning
+    ::new (static_cast<void*>(&ret)) Ret(std::move(_ret));
+  }
+
+  void operator()(const Ret& _ret) {
+    auto l = exec_base();
+    // copy-construct into the raw storage
+    ::new (static_cast<void*>(&ret)) Ret(_ret);
+  }
+
+  auto ref() {
+    return detail::rvalue_reference_wrapper(*this);
+  }
+
+  operator fu2::unique_function<void(Ret) &&>() {
+    return fu2::unique_function<void(Ret) &&>(ref());
+  }
+
+  ~waiter() {
+    if (has_value) // a delivered value that was never collected by wait()
+      reinterpret_cast<Ret*>(&ret)->~Ret();
+  }
+};
+
+template<typename ...Ret>
+class waiter final : public detail::base {
+  // results live in an ordinary tuple member and are assigned on arrival
+  std::tuple<Ret...> ret;
+
+public:
+  std::tuple<Ret...> wait() {
+    auto l = wait_base();
+    // 'ret' is a live tuple for the whole lifetime of the waiter, so move the
+    // values out and leave it in place. clearing has_value (previously
+    // unreachable) makes the waiter reusable and matches waiter<Ret>::wait()
+    auto t = std::move(ret);
+    has_value = false;
+    return t;
+  }
+
+  void operator()(Ret&&... _ret) {
+    auto l = exec_base();
+    // named rvalue references are lvalues; move them to avoid copying
+    ret = std::forward_as_tuple(std::move(_ret)...);
+  }
+
+  void operator()(const Ret&... _ret) {
+    auto l = exec_base();
+    // copy the arguments into the stored tuple
+    ret = std::forward_as_tuple(_ret...);
+  }
+
+  // wrapper that lets this waiter be handed to code that moves its callback
+  auto ref() {
+    return detail::rvalue_reference_wrapper(*this);
+  }
+
+  // NOTE: the returned function refers to *this and must not outlive it
+  operator fu2::unique_function<void(Ret...) &&>() {
+    return fu2::unique_function<void(Ret...) &&>(ref());
+  }
+
+  // no explicit destructor call on 'ret': it is destroyed implicitly as a
+  // member, and destroying it by hand as well would destroy it twice
+  ~waiter() = default;
+};
+}
+
+#endif // CEPH_COMMON_WAITER_H
diff --git a/src/common/async/yield_context.h b/src/common/async/yield_context.h
new file mode 100644
index 000000000..05e6ca614
--- /dev/null
+++ b/src/common/async/yield_context.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/range/begin.hpp>
+#include <boost/range/end.hpp>
+#include <boost/asio/io_context.hpp>
+
+#include "acconfig.h"
+
+#include <spawn/spawn.hpp>
+
+// use explicit executor types instead of the type-erased boost::asio::executor.
+// coroutines wrap the default io_context executor with a strand executor
+using yield_context = spawn::basic_yield_context<
+    boost::asio::executor_binder<void(*)(),
+        boost::asio::strand<boost::asio::io_context::executor_type>>>;
+
+/// optional-like wrapper for a spawn::yield_context and its associated
+/// boost::asio::io_context. operations that take an optional_yield argument
+/// will, when passed a non-empty yield context, suspend this coroutine instead
+/// of blocking the thread of execution
+class optional_yield {
+  boost::asio::io_context *c = nullptr; //< null when empty
+  yield_context *y = nullptr; //< null when empty
+ public:
+  /// construct with a valid io and yield_context. only their addresses are
+  /// stored, so both objects must outlive this wrapper
+  explicit optional_yield(boost::asio::io_context& c,
+                          yield_context& y) noexcept : c(&c), y(&y) {}
+
+  /// type tag to construct an empty object
+  struct empty_t {};
+  optional_yield(empty_t) noexcept {}
+
+  /// implicit conversion to bool, returns true if non-empty
+  operator bool() const noexcept { return y; }
+
+  /// return a reference to the associated io_context. only valid if non-empty
+  boost::asio::io_context& get_io_context() const noexcept { return *c; }
+
+  /// return a reference to the yield_context. only valid if non-empty
+  yield_context& get_yield_context() const noexcept { return *y; }
+};
+
+// type tag object to construct an empty optional_yield
+static constexpr optional_yield::empty_t null_yield{};
diff --git a/src/common/autovector.h b/src/common/autovector.h
new file mode 100644
index 000000000..f52a585f2
--- /dev/null
+++ b/src/common/autovector.h
@@ -0,0 +1,336 @@
+// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
+//
+// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 and Apache 2.0 License
+
+#ifndef CEPH_AUTOVECTOR_H
+#define CEPH_AUTOVECTOR_H
+
+#include <algorithm>
+#include <cassert>
+#include <initializer_list>
+#include <iterator>
+#include <stdexcept>
+#include <vector>
+
+#include "include/ceph_assert.h"
+
+// A vector that leverages pre-allocated stack-based array to achieve better
+// performance for array with small amount of items.
+//
+// The interface resembles that of vector, but with fewer features since we aim
+// to solve the problem that we have in hand, rather than implementing a
+// full-fledged generic container.
+//
+// Currently we don't support:
+//  * reserve()/shrink_to_fit()
+//     If used correctly, in most cases, people should not touch the
+//     underlying vector at all.
+//  * random insert()/erase(), please only use push_back()/pop_back().
+//  * No move/swap operations. Each autovector instance has a
+//     stack-allocated array and if we wanted to support move/swap operations,
+//     we would need to copy the arrays rather than just swapping the pointers.
+//     In this case we'll just explicitly forbid these operations since they
+//     may lead users to make false assumptions by thinking they are
+//     inexpensive operations.
+//
+// Naming style of public methods almost follows that of the STL's.
+namespace ceph {
+
+template <class T, size_t kSize = 8>
+class autovector {
+ public:
+  // General STL-style container member types.
+  typedef T value_type;
+  typedef typename std::vector<T>::difference_type difference_type;
+  typedef typename std::vector<T>::size_type size_type;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+  typedef value_type* pointer;
+  typedef const value_type* const_pointer;
+
+  // This class is the base for regular/const iterator
+  template <class TAutoVector, class TValueType>
+  class iterator_impl {
+   public:
+    // -- iterator traits
+    typedef iterator_impl<TAutoVector, TValueType> self_type;
+    typedef TValueType value_type;
+    typedef TValueType& reference;
+    typedef TValueType* pointer;
+    typedef typename TAutoVector::difference_type difference_type;
+    typedef std::random_access_iterator_tag iterator_category;
+
+    iterator_impl(TAutoVector* vect, size_t index)
+        : vect_(vect), index_(index) {};
+    iterator_impl(const iterator_impl&) = default;
+    ~iterator_impl() {}
+    iterator_impl& operator=(const iterator_impl&) = default;
+
+    // -- Advancement
+    // ++iterator
+    self_type& operator++() {
+      ++index_;
+      return *this;
+    }
+
+    // iterator++
+    self_type operator++(int) {
+      auto old = *this;
+      ++index_;
+      return old;
+    }
+
+    // --iterator
+    self_type& operator--() {
+      --index_;
+      return *this;
+    }
+
+    // iterator--
+    self_type operator--(int) {
+      auto old = *this;
+      --index_;
+      return old;
+    }
+
+    self_type operator-(difference_type len) const {
+      return self_type(vect_, index_ - len);
+    }
+
+    difference_type operator-(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ - other.index_;
+    }
+
+    self_type operator+(difference_type len) const {
+      return self_type(vect_, index_ + len);
+    }
+
+    self_type& operator+=(difference_type len) {
+      index_ += len;
+      return *this;
+    }
+
+    self_type& operator-=(difference_type len) {
+      index_ -= len;
+      return *this;
+    }
+
+    // -- Reference
+    reference operator*() {
+      ceph_assert(vect_->size() >= index_);
+      return (*vect_)[index_];
+    }
+
+    const_reference operator*() const {
+      ceph_assert(vect_->size() >= index_);
+      return (*vect_)[index_];
+    }
+
+    pointer operator->() {
+      ceph_assert(vect_->size() >= index_);
+      return &(*vect_)[index_];
+    }
+
+    const_pointer operator->() const {
+      ceph_assert(vect_->size() >= index_);
+      return &(*vect_)[index_];
+    }
+
+
+    // -- Logical Operators
+    bool operator==(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ == other.index_;
+    }
+
+    bool operator!=(const self_type& other) const { return !(*this == other); }
+
+    bool operator>(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ > other.index_;
+    }
+
+    bool operator<(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ < other.index_;
+    }
+
+    bool operator>=(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ >= other.index_;
+    }
+
+    bool operator<=(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ <= other.index_;
+    }
+
+   private:
+    TAutoVector* vect_ = nullptr;
+    size_t index_ = 0;
+  };
+
+  typedef iterator_impl<autovector, value_type> iterator;
+  typedef iterator_impl<const autovector, const value_type> const_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+  autovector() = default;
+
+  autovector(std::initializer_list<T> init_list) {
+    for (const T& item : init_list) {
+      push_back(item);
+    }
+  }
+
+  ~autovector() = default;
+
+  // -- Immutable operations
+  // Indicate if all data resides in in-stack data structure.
+  bool only_in_stack() const {
+    // If no element was inserted at all, the vector's capacity will be `0`.
+    return vect_.capacity() == 0;
+  }
+
+  size_type size() const { return num_stack_items_ + vect_.size(); }
+
+  // resize does not guarantee anything about the contents of the newly
+  // available elements
+  void resize(size_type n) {
+    if (n > kSize) {
+      vect_.resize(n - kSize);
+      num_stack_items_ = kSize;
+    } else {
+      vect_.clear();
+      num_stack_items_ = n;
+    }
+  }
+
+  bool empty() const { return size() == 0; }
+
+  const_reference operator[](size_type n) const {
+    ceph_assert(n < size());
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  reference operator[](size_type n) {
+    ceph_assert(n < size());
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  const_reference at(size_type n) const {
+    ceph_assert(n < size());
+    return (*this)[n];
+  }
+
+  reference at(size_type n) {
+    ceph_assert(n < size());
+    return (*this)[n];
+  }
+
+  reference front() {
+    ceph_assert(!empty());
+    return *begin();
+  }
+
+  const_reference front() const {
+    ceph_assert(!empty());
+    return *begin();
+  }
+
+  reference back() {
+    ceph_assert(!empty());
+    return *(end() - 1);
+  }
+
+  const_reference back() const {
+    ceph_assert(!empty());
+    return *(end() - 1);
+  }
+
+  // -- Mutable Operations
+  // Appends `item` by move. While fewer than `kSize` items are stored it is
+  // move-assigned into the in-object array; afterwards it is moved into the
+  // overflow vector.
+  void push_back(T&& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = std::move(item);
+    } else {
+      // `item` is a named rvalue reference and therefore an lvalue here:
+      // without the explicit std::move the vector would copy it.
+      vect_.push_back(std::move(item));
+    }
+  }
+
+  void push_back(const T& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = item;
+    } else {
+      vect_.push_back(item);
+    }
+  }
+
+  // Constructs a value_type from `args` and appends it via push_back().
+  // The arguments are perfectly forwarded so that rvalues passed by the
+  // caller reach the value_type constructor as rvalues instead of being
+  // copied.
+  template <class... Args>
+  void emplace_back(Args&&... args) {
+    push_back(value_type(std::forward<Args>(args)...));
+  }
+
+  void pop_back() {
+    ceph_assert(!empty());
+    if (!vect_.empty()) {
+      vect_.pop_back();
+    } else {
+      --num_stack_items_;
+    }
+  }
+
+  void clear() {
+    num_stack_items_ = 0;
+    vect_.clear();
+  }
+
+  // -- Copy and Assignment
+  autovector& assign(const autovector& other);
+
+  autovector(const autovector& other) { assign(other); }
+
+  autovector& operator=(const autovector& other) { return assign(other); }
+
+  // -- Iterator Operations
+  iterator begin() { return iterator(this, 0); }
+
+  const_iterator begin() const { return const_iterator(this, 0); }
+
+  iterator end() { return iterator(this, this->size()); }
+
+  const_iterator end() const { return const_iterator(this, this->size()); }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+ private:
+  size_type num_stack_items_ = 0;  // current number of items
+  value_type values_[kSize];       // the first `kSize` items
+  // used only if there are more than `kSize` items.
+  std::vector<T> vect_;
+};
+
+// Replaces the contents of this autovector with a copy of `other` and
+// returns *this. Used by both the copy constructor and copy assignment.
+template <class T, size_t kSize>
+autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) {
+  // self-assignment: nothing to copy, and neither vector::assign() nor
+  // std::copy() may be handed a source range that aliases its destination
+  if (this == &other) {
+    return *this;
+  }
+
+  // copy the internal vector
+  vect_.assign(other.vect_.begin(), other.vect_.end());
+
+  // copy array
+  num_stack_items_ = other.num_stack_items_;
+  std::copy(other.values_, other.values_ + num_stack_items_, values_);
+
+  return *this;
+}
+}  // namespace ceph 
+#endif // CEPH_AUTOVECTOR_H
diff --git a/src/common/bit_str.cc b/src/common/bit_str.cc
new file mode 100644
index 000000000..f14b2daad
--- /dev/null
+++ b/src/common/bit_str.cc
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "common/bit_str.h"
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+
+// Shared implementation behind print_bit_str() and dump_bit_str().
+//
+// Walks `bits` from the least significant bit upwards and, for every set
+// bit, emits the string returned by `func` for that single-bit value,
+// followed by "(<value>)" when `dump_bit_val` is true.
+//
+// If `out` is non-null the strings are written to it separated by commas,
+// or "none" is written when no bit is set. Otherwise each string is dumped
+// to `f` as a "bit_flag" stream entry; `f` must then be non-null.
+//
+// `func` is taken by const reference so the std::function is not copied on
+// every call.
+static void _dump_bit_str(
+    uint64_t bits,
+    std::ostream *out,
+    ceph::Formatter *f,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val)
+{
+  // `b` only tracks whether any bits at or above position `cnt` are still
+  // set, so the loop can stop early
+  uint64_t b = bits;
+  int cnt = 0;
+  bool outted = false;
+
+  while (b && cnt < 64) {
+    uint64_t r = bits & (1ULL << cnt++);
+    if (r) {
+      if (out) {
+        if (outted)
+          *out << ",";
+        *out << func(r);
+        if (dump_bit_val) {
+          *out << "(" << r << ")";
+        }
+      } else {
+        ceph_assert(f != nullptr);
+        if (dump_bit_val) {
+          f->dump_stream("bit_flag") << func(r)
+                                     << "(" << r << ")";
+        } else {
+          f->dump_stream("bit_flag") << func(r);
+        }
+      }
+      outted = true;
+    }
+    b >>= 1;
+  }
+  if (!outted && out)
+    *out << "none";
+}
+
+// Write the names of the bits set in `bits` to the stream `out`, using
+// `func` to map each single-bit value to its name. Delegates to
+// _dump_bit_str() with no Formatter.
+void print_bit_str(
+    uint64_t bits,
+    std::ostream &out,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val)
+{
+  std::ostream *stream = &out;
+  _dump_bit_str(bits, stream, nullptr, func, dump_bit_val);
+}
+
+// Dump the names of the bits set in `bits` to the Formatter `f`, using
+// `func` to map each single-bit value to its name. Delegates to
+// _dump_bit_str() with no output stream.
+void dump_bit_str(
+    uint64_t bits,
+    ceph::Formatter *f,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val)
+{
+  std::ostream *no_stream = nullptr;
+  _dump_bit_str(bits, no_stream, f, func, dump_bit_val);
+}
diff --git a/src/common/bit_str.h b/src/common/bit_str.h
new file mode 100644
index 000000000..5271c8ffe
--- /dev/null
+++ b/src/common/bit_str.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_COMMON_BIT_STR_H
+#define CEPH_COMMON_BIT_STR_H
+
+#include <cstdint>
+#include <iosfwd>
+#include <functional>
+
+namespace ceph {
+  class Formatter;
+}
+
+extern void print_bit_str(
+    uint64_t bits,
+    std::ostream &out,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val = false);
+
+extern void dump_bit_str(
+    uint64_t bits,
+    ceph::Formatter *f,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val = false);
+
+#endif /* CEPH_COMMON_BIT_STR_H */
diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp
new file mode 100644
index 000000000..9ce3e8b1e
--- /dev/null
+++ b/src/common/bit_vector.hpp
@@ -0,0 +1,647 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * LGPL2.1 (see COPYING-LGPL2.1) or later
+ */
+
+#ifndef BIT_VECTOR_HPP
+#define BIT_VECTOR_HPP
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/encoding.h"
+#include <memory>
+#include <utility>
+#include <vector>
+
+namespace ceph {
+
+template <uint8_t _bit_count>
+class BitVector
+{
+private:
+  static const uint8_t BITS_PER_BYTE = 8;
+  static const uint32_t ELEMENTS_PER_BLOCK = BITS_PER_BYTE / _bit_count;
+  static const uint8_t MASK = static_cast<uint8_t>((1 << _bit_count) - 1);
+
+  // must be power of 2
+  BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1)));
+  BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE);
+
+  template <typename DataIterator>
+  class ReferenceImpl {
+  protected:
+    DataIterator m_data_iterator;
+    uint64_t m_shift;
+
+    ReferenceImpl(const DataIterator& data_iterator, uint64_t shift)
+      : m_data_iterator(data_iterator), m_shift(shift) {
+    }
+    ReferenceImpl(DataIterator&& data_iterator, uint64_t shift)
+      : m_data_iterator(std::move(data_iterator)), m_shift(shift) {
+    }
+
+  public:
+    inline operator uint8_t() const {
+      return (*m_data_iterator >> m_shift) & MASK;
+    }
+  };
+
+public:
+
+  class ConstReference : public ReferenceImpl<bufferlist::const_iterator> {
+  private:
+    friend class BitVector;
+
+    ConstReference(const bufferlist::const_iterator& data_iterator,
+                   uint64_t shift)
+      : ReferenceImpl<bufferlist::const_iterator>(data_iterator, shift) {
+    }
+    ConstReference(bufferlist::const_iterator&& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::const_iterator>(std::move(data_iterator),
+                                                  shift) {
+    }
+  };
+
+  class Reference : public ReferenceImpl<bufferlist::iterator> {
+  public:
+    Reference& operator=(uint8_t v);
+
+  private:
+    friend class BitVector;
+
+    Reference(const bufferlist::iterator& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::iterator>(data_iterator, shift) {
+    }
+    Reference(bufferlist::iterator&& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::iterator>(std::move(data_iterator), shift) {
+    }
+  };
+
+public:
+  template <typename BitVectorT, typename DataIterator>
+  class IteratorImpl {
+  private:
+    friend class BitVector;
+
+    uint64_t m_offset = 0;
+    BitVectorT *m_bit_vector;
+
+    // cached derived values
+    uint64_t m_index = 0;
+    uint64_t m_shift = 0;
+    DataIterator m_data_iterator;
+
+    IteratorImpl(BitVectorT *bit_vector, uint64_t offset)
+      : m_bit_vector(bit_vector),
+        m_data_iterator(bit_vector->m_data.begin()) {
+      *this += offset;
+    }
+
+  public:
+    inline IteratorImpl& operator++() {
+      ++m_offset;
+
+      uint64_t index;
+      compute_index(m_offset, &index, &m_shift);
+
+      ceph_assert(index == m_index || index == m_index + 1);
+      if (index > m_index) {
+        m_index = index;
+        ++m_data_iterator;
+      }
+      return *this;
+    }
+    inline IteratorImpl& operator+=(uint64_t offset) {
+      m_offset += offset;
+      compute_index(m_offset, &m_index, &m_shift);
+      if (m_offset < m_bit_vector->size()) {
+        m_data_iterator.seek(m_index);
+      } else {
+        m_data_iterator = m_bit_vector->m_data.end();
+      }
+      return *this;
+    }
+
+    // Post-increment: advances this iterator by one element and returns a
+    // copy of the iterator as it was before the advance. (Previously the
+    // copy was advanced and *this was left untouched, so `it++` never
+    // moved the iterator.)
+    inline IteratorImpl operator++(int) {
+      IteratorImpl previous(*this);
+      ++(*this);
+      return previous;
+    }
+    inline IteratorImpl operator+(uint64_t offset) {
+      IteratorImpl iterator_impl(*this);
+      iterator_impl += offset;
+      return iterator_impl;
+    }
+
+    inline bool operator==(const IteratorImpl& rhs) const {
+      return (m_offset == rhs.m_offset && m_bit_vector == rhs.m_bit_vector);
+    }
+    inline bool operator!=(const IteratorImpl& rhs) const {
+      return (m_offset != rhs.m_offset || m_bit_vector != rhs.m_bit_vector);
+    }
+
+    inline ConstReference operator*() const {
+      return ConstReference(m_data_iterator, m_shift);
+    }
+    inline Reference operator*() {
+      return Reference(m_data_iterator, m_shift);
+    }
+  };
+
+  typedef IteratorImpl<const BitVector,
+                       bufferlist::const_iterator> ConstIterator;
+  typedef IteratorImpl<BitVector, bufferlist::iterator> Iterator;
+
+  static const uint32_t BLOCK_SIZE;
+  static const uint8_t BIT_COUNT = _bit_count;
+
+  BitVector();
+
+  inline ConstIterator begin() const {
+    return ConstIterator(this, 0);
+  }
+  inline ConstIterator end() const {
+    return ConstIterator(this, m_size);
+  }
+  inline Iterator begin() {
+    return Iterator(this, 0);
+  }
+  inline Iterator end() {
+    return Iterator(this, m_size);
+  }
+
+  void set_crc_enabled(bool enabled) {
+    m_crc_enabled = enabled;
+  }
+  void clear();
+
+  void resize(uint64_t elements);
+  uint64_t size() const;
+
+  const bufferlist& get_data() const;
+
+  Reference operator[](uint64_t offset);
+  ConstReference operator[](uint64_t offset) const;
+
+  void encode_header(bufferlist& bl) const;
+  void decode_header(bufferlist::const_iterator& it);
+  uint64_t get_header_length() const;
+
+  void encode_data(bufferlist& bl, uint64_t data_byte_offset,
+		   uint64_t byte_length) const;
+  void decode_data(bufferlist::const_iterator& it, uint64_t data_byte_offset);
+  void get_data_extents(uint64_t offset, uint64_t length,
+                        uint64_t *data_byte_offset,
+                        uint64_t *object_byte_offset,
+                        uint64_t *byte_length) const;
+
+  void encode_footer(bufferlist& bl) const;
+  void decode_footer(bufferlist::const_iterator& it);
+  uint64_t get_footer_offset() const;
+
+  void decode_header_crc(bufferlist::const_iterator& it);
+  void get_header_crc_extents(uint64_t *byte_offset,
+                              uint64_t *byte_length) const;
+
+  void encode_data_crcs(bufferlist& bl, uint64_t offset,
+                        uint64_t length) const;
+  void decode_data_crcs(bufferlist::const_iterator& it, uint64_t offset);
+  void get_data_crcs_extents(uint64_t offset, uint64_t length,
+                             uint64_t *byte_offset,
+                             uint64_t *byte_length) const;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  bool operator==(const BitVector &b) const;
+
+  static void generate_test_instances(std::list<BitVector *> &o);
+private:
+  bufferlist m_data;
+  uint64_t m_size;
+  bool m_crc_enabled;
+
+  mutable __u32 m_header_crc;
+
+  // inhibit value-initialization when used in std::vector
+  struct u32_struct {
+    u32_struct() {}
+    __u32 val;
+  };
+  mutable std::vector<u32_struct> m_data_crcs;
+
+  void resize(uint64_t elements, bool zero);
+
+  static void compute_index(uint64_t offset, uint64_t *index, uint64_t *shift);
+
+};
+
+template <uint8_t _b>
+const uint32_t BitVector<_b>::BLOCK_SIZE = 4096;
+
+template <uint8_t _b>
+BitVector<_b>::BitVector() : m_size(0), m_crc_enabled(true), m_header_crc(0)
+{
+}
+
+template <uint8_t _b>
+void BitVector<_b>::clear() {
+  m_data.clear();
+  m_data_crcs.clear();
+  m_size = 0;
+  m_header_crc = 0;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::resize(uint64_t size) {
+  resize(size, true);
+}
+
+// Resize the backing buffer so it can hold `size` elements and size the
+// per-block CRC table to match. When the buffer grows, the new bytes are
+// appended as zeros if `zero` is true and as a newly created buffer::ptr
+// otherwise; when it shrinks, the buffer is truncated.
+template <uint8_t _b>
+void BitVector<_b>::resize(uint64_t size, bool zero) {
+  // number of bytes needed for `size` packed elements, rounded up
+  const uint64_t wanted = (size + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;
+  const uint64_t current = m_data.length();
+
+  if (wanted > current) {
+    const uint64_t extra = wanted - current;
+    if (zero) {
+      m_data.append_zero(extra);
+    } else {
+      m_data.append(buffer::ptr(extra));
+    }
+  } else if (wanted < current) {
+    bufferlist truncated;
+    truncated.substr_of(m_data, 0, wanted);
+    truncated.swap(m_data);
+  }
+  m_size = size;
+
+  // one CRC slot per BLOCK_SIZE bytes of data, rounded up
+  m_data_crcs.resize((wanted + BLOCK_SIZE - 1) / BLOCK_SIZE);
+}
+
+template <uint8_t _b>
+uint64_t BitVector<_b>::size() const {
+  return m_size;
+}
+
+template <uint8_t _b>
+const bufferlist& BitVector<_b>::get_data() const {
+  return m_data;
+}
+
+// Translate an element `offset` into the byte `index` that stores it and
+// the bit `shift` of the element within that byte. Elements are packed
+// starting from the most significant bits, so the first element of a byte
+// gets the largest shift.
+template <uint8_t _b>
+void BitVector<_b>::compute_index(uint64_t offset, uint64_t *index, uint64_t *shift) {
+  const uint64_t slot = offset % ELEMENTS_PER_BLOCK;
+  *index = offset / ELEMENTS_PER_BLOCK;
+  *shift = ((ELEMENTS_PER_BLOCK - 1) - slot) * _b;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode_header(bufferlist& bl) const {
+  bufferlist header_bl;
+  ENCODE_START(1, 1, header_bl);
+  encode(m_size, header_bl);
+  ENCODE_FINISH(header_bl);
+  m_header_crc = header_bl.crc32c(0);
+
+  encode(header_bl, bl);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode_header(bufferlist::const_iterator& it) {
+  using ceph::decode;
+  bufferlist header_bl;
+  decode(header_bl, it);
+
+  auto header_it = header_bl.cbegin();
+  uint64_t size;
+  DECODE_START(1, header_it);
+  decode(size, header_it);
+  DECODE_FINISH(header_it);
+
+  resize(size, false);
+  m_header_crc = header_bl.crc32c(0);
+}
+
+template <uint8_t _b>
+uint64_t BitVector<_b>::get_header_length() const {
+  // 4 byte bl length, 6 byte encoding header, 8 byte size
+  return 18;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode_data(bufferlist& bl, uint64_t data_byte_offset,
+				uint64_t byte_length) const {
+  ceph_assert(data_byte_offset % BLOCK_SIZE == 0);
+  ceph_assert(data_byte_offset + byte_length == m_data.length() ||
+              byte_length % BLOCK_SIZE == 0);
+
+  uint64_t end_offset = data_byte_offset + byte_length;
+  while (data_byte_offset < end_offset) {
+    uint64_t len = std::min<uint64_t>(BLOCK_SIZE,
+                                      end_offset - data_byte_offset);
+
+    bufferlist bit;
+    bit.substr_of(m_data, data_byte_offset, len);
+    m_data_crcs[data_byte_offset / BLOCK_SIZE].val = bit.crc32c(0);
+
+    bl.claim_append(bit);
+    data_byte_offset += BLOCK_SIZE;
+  }
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode_data(bufferlist::const_iterator& it,
+                                uint64_t data_byte_offset) {
+  ceph_assert(data_byte_offset % BLOCK_SIZE == 0);
+  if (it.end()) {
+    return;
+  }
+
+  uint64_t end_offset = data_byte_offset + it.get_remaining();
+  if (end_offset > m_data.length()) {
+    throw buffer::end_of_buffer();
+  }
+
+  bufferlist data;
+  if (data_byte_offset > 0) {
+    data.substr_of(m_data, 0, data_byte_offset);
+  }
+
+  while (data_byte_offset < end_offset) {
+    uint64_t len = std::min<uint64_t>(BLOCK_SIZE, end_offset - data_byte_offset);
+
+    bufferptr ptr;
+    it.copy_deep(len, ptr);
+
+    bufferlist bit;
+    bit.append(ptr);
+    if (m_crc_enabled &&
+	m_data_crcs[data_byte_offset / BLOCK_SIZE].val != bit.crc32c(0)) {
+      throw buffer::malformed_input("invalid data block CRC");
+    }
+    data.append(bit);
+    data_byte_offset += bit.length();
+  }
+
+  if (m_data.length() > end_offset) {
+    bufferlist tail;
+    tail.substr_of(m_data, end_offset, m_data.length() - end_offset);
+    data.append(tail);
+  }
+  ceph_assert(data.length() == m_data.length());
+  data.swap(m_data);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::get_data_extents(uint64_t offset, uint64_t length,
+                                     uint64_t *data_byte_offset,
+                                     uint64_t *object_byte_offset,
+                                     uint64_t *byte_length) const {
+  // read BLOCK_SIZE-aligned chunks
+  ceph_assert(length > 0 && offset + length <= m_size);
+  uint64_t shift;
+  compute_index(offset, data_byte_offset, &shift);
+  *data_byte_offset -= (*data_byte_offset % BLOCK_SIZE);
+
+  uint64_t end_offset;
+  compute_index(offset + length - 1, &end_offset, &shift);
+  end_offset += (BLOCK_SIZE - (end_offset % BLOCK_SIZE));
+  ceph_assert(*data_byte_offset <= end_offset);
+
+  *object_byte_offset = get_header_length() + *data_byte_offset;
+  *byte_length = end_offset - *data_byte_offset;
+  if (*data_byte_offset + *byte_length > m_data.length()) {
+    *byte_length = m_data.length() - *data_byte_offset;
+  }
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode_footer(bufferlist& bl) const {
+  using ceph::encode;
+  bufferlist footer_bl;
+  if (m_crc_enabled) {
+    encode(m_header_crc, footer_bl);
+
+    __u32 size = m_data_crcs.size();
+    encode(size, footer_bl);
+    encode_data_crcs(footer_bl, 0, m_size);
+  }
+  encode(footer_bl, bl);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode_footer(bufferlist::const_iterator& it) {
+  using ceph::decode;
+  bufferlist footer_bl;
+  decode(footer_bl, it);
+
+  m_crc_enabled = (footer_bl.length() > 0);
+  if (m_crc_enabled) {
+    auto footer_it = footer_bl.cbegin();
+    decode_header_crc(footer_it);
+
+    __u32 data_src_size;
+    decode(data_src_size, footer_it);
+    decode_data_crcs(footer_it, 0);
+
+    uint64_t block_count = (m_data.length() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    if (m_data_crcs.size() != block_count) {
+      throw buffer::malformed_input("invalid data block CRCs");
+    }
+  }
+}
+
+template <uint8_t _b>
+uint64_t BitVector<_b>::get_footer_offset() const {
+  return get_header_length() + m_data.length();
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode_header_crc(bufferlist::const_iterator& it) {
+  if (it.get_remaining() > 0) {
+    __u32 header_crc;
+    ceph::decode(header_crc, it);
+    if (m_header_crc != header_crc) {
+      throw buffer::malformed_input("incorrect header CRC");
+    }
+  }
+}
+
+template <uint8_t _b>
+void BitVector<_b>::get_header_crc_extents(uint64_t *byte_offset,
+                                           uint64_t *byte_length) const {
+  // footer is prefixed with a bufferlist length
+  *byte_offset = get_footer_offset() + sizeof(__u32);
+  *byte_length = sizeof(__u32);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode_data_crcs(bufferlist& bl, uint64_t offset,
+                                     uint64_t length) const {
+  if (length == 0) {
+    return;
+  }
+
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+  uint64_t crc_index = index / BLOCK_SIZE;
+
+  compute_index(offset + length - 1, &index, &shift);
+  uint64_t end_crc_index = index / BLOCK_SIZE;
+  while (crc_index <= end_crc_index) {
+    __u32 crc = m_data_crcs[crc_index++].val;
+    ceph::encode(crc, bl);
+  }
+}
+
+// Decode a run of per-block data CRCs from `it` into m_data_crcs. The first
+// CRC is stored in the slot of the block that contains element `offset`;
+// every further complete __u32 remaining in `it` fills the following slots.
+// Does nothing if `it` is already at its end.
+//
+// Throws buffer::malformed_input if `it` holds more CRCs than fit in the
+// table from that slot onwards.
+template <uint8_t _b>
+void BitVector<_b>::decode_data_crcs(bufferlist::const_iterator& it,
+                                     uint64_t offset) {
+  if (it.end()) {
+    return;
+  }
+
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+
+  uint64_t crc_index = index / BLOCK_SIZE;
+  uint64_t remaining = it.get_remaining() / sizeof(__u32);
+
+  // the number of CRCs is dictated by the encoded input: refuse to write
+  // past the end of the CRC table instead of corrupting memory
+  const uint64_t crc_count = m_data_crcs.size();
+  if (remaining > 0 &&
+      (crc_index >= crc_count || remaining > crc_count - crc_index)) {
+    throw buffer::malformed_input("invalid data block CRCs");
+  }
+
+  while (remaining > 0) {
+    __u32 crc;
+    ceph::decode(crc, it);
+    m_data_crcs[crc_index++].val = crc;
+    --remaining;
+  }
+}
+
+template <uint8_t _b>
+void BitVector<_b>::get_data_crcs_extents(uint64_t offset, uint64_t length,
+                                          uint64_t *byte_offset,
+                                          uint64_t *byte_length) const {
+  // data CRCs immediately follow the header CRC
+  get_header_crc_extents(byte_offset, byte_length);
+  *byte_offset += *byte_length;
+
+  // skip past data CRC vector size
+  *byte_offset += sizeof(__u32);
+
+  // CRCs are computed over BLOCK_SIZE chunks
+  ceph_assert(length > 0 && offset + length <= m_size);
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+  uint64_t start_byte_offset =
+    *byte_offset + ((index / BLOCK_SIZE) * sizeof(__u32));
+
+  compute_index(offset + length, &index, &shift);
+  uint64_t end_byte_offset =
+    *byte_offset + (((index / BLOCK_SIZE) + 1) * sizeof(__u32));
+  ceph_assert(start_byte_offset < end_byte_offset);
+
+  *byte_offset = start_byte_offset;
+  *byte_length = end_byte_offset - start_byte_offset;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode(bufferlist& bl) const {
+  encode_header(bl);
+  encode_data(bl, 0, m_data.length());
+  encode_footer(bl);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode(bufferlist::const_iterator& it) {
+  decode_header(it);
+
+  bufferlist data_bl;
+  if (m_data.length() > 0) {
+    it.copy(m_data.length(), data_bl);
+  }
+
+  decode_footer(it);
+
+  auto data_it = data_bl.cbegin();
+  decode_data(data_it, 0);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::dump(Formatter *f) const {
+  f->dump_unsigned("size", m_size);
+  f->open_array_section("bit_table");
+  for (unsigned i = 0; i < m_data.length(); ++i) {
+    f->dump_format("byte", "0x%02hhX", m_data[i]);
+  }
+  f->close_section();
+}
+
+template <uint8_t _b>
+bool BitVector<_b>::operator==(const BitVector &b) const {
+  return (this->m_size == b.m_size && this->m_data == b.m_data);
+}
+
+template <uint8_t _b>
+typename BitVector<_b>::Reference BitVector<_b>::operator[](uint64_t offset) {
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+
+  bufferlist::iterator data_iterator(m_data.begin());
+  data_iterator.seek(index);
+  return Reference(std::move(data_iterator), shift);
+}
+
+template <uint8_t _b>
+typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const {
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+
+  bufferlist::const_iterator data_iterator(m_data.begin());
+  data_iterator.seek(index);
+  return ConstReference(std::move(data_iterator), shift);
+}
+
+template <uint8_t _b>
+typename BitVector<_b>::Reference& BitVector<_b>::Reference::operator=(uint8_t v) {
+  uint8_t mask = MASK << this->m_shift;
+  char packed_value = (*this->m_data_iterator & ~mask) |
+                      ((v << this->m_shift) & mask);
+  bufferlist::iterator it(this->m_data_iterator);
+  it.copy_in(1, &packed_value, true);
+  return *this;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::generate_test_instances(std::list<BitVector *> &o) {
+  o.push_back(new BitVector());
+
+  BitVector *b = new BitVector();
+  const uint64_t radix = 1 << b->BIT_COUNT;
+  const uint64_t size = 1024;
+
+  b->resize(size, false);
+  for (uint64_t i = 0; i < size; ++i) {
+    (*b)[i] = rand() % radix;
+  }
+  o.push_back(b);
+}
+
+
+WRITE_CLASS_ENCODER(ceph::BitVector<2>)
+
+// Stream a short description of `b`: its bit width, its element count and
+// its raw data buffer.
+template <uint8_t _b>
+inline std::ostream& operator<<(std::ostream& out, const ceph::BitVector<_b> &b)
+{
+  // _b is a uint8_t, which an ostream prints as a raw character; widen it
+  // so the bit width is shown as a number
+  out << "ceph::BitVector<" << static_cast<uint32_t>(_b) << ">(size="
+      << b.size() << ", data=" << b.get_data() << ")";
+  return out;
+}
+}
+
+#endif // BIT_VECTOR_HPP
diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc
new file mode 100644
index 000000000..6fc2965a1
--- /dev/null
+++ b/src/common/blkdev.cc
@@ -0,0 +1,1192 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+
+#ifdef __FreeBSD__
+#include <sys/param.h>
+#include <geom/geom_disk.h>
+#include <sys/disk.h>
+#include <fcntl.h>
+#endif
+
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <boost/algorithm/string/replace.hpp>
+//#include "common/debug.h"
+#include "include/scope_guard.h"
+#include "include/uuid.h"
+#include "include/stringify.h"
+#include "blkdev.h"
+#include "numa.h"
+
+#include "json_spirit/json_spirit_reader.h"
+
+
+// Open the directory `path` and report the partition and whole-disk names of
+// the device it lives on, as produced by BlkDev::partition() and
+// BlkDev::wholedisk().  Each output buffer must hold at least `max` bytes.
+// Returns 0 on success, -errno if the open fails, or the first non-zero
+// result from the BlkDev calls.
+int get_device_by_path(const char *path, char* partition, char* device,
+                       size_t max)
+{
+  const int fd = ::open(path, O_RDONLY|O_DIRECTORY);
+  if (fd < 0) {
+    return -errno;
+  }
+  // close the descriptor on every exit path
+  auto close_fd = make_scope_guard([fd] {
+    ::close(fd);
+  });
+
+  BlkDev blkdev(fd);
+  int rc = blkdev.partition(partition, max);
+  if (rc == 0) {
+    rc = blkdev.wholedisk(device, max);
+  }
+  return rc;
+}
+
+
+#include "common/blkdev.h"
+
+#ifdef __linux__
+#include <libudev.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <blkid/blkid.h>
+
+#include <set>
+
+#include "common/SubProcess.h"
+#include "common/errno.h"
+
+
+#define UUID_LEN 36
+
+#endif
+
+using namespace std::literals;
+
+using std::string;
+
+using ceph::bufferlist;
+
+
+// Construct from an already-open file descriptor; devname stays empty.
+// The descriptor is only stored, nothing in this constructor opens or
+// closes it.
+BlkDev::BlkDev(int f)
+  : fd(f)
+{}
+
+// Construct from a device name (get_devid() resolves it as /dev/<devname>);
+// fd keeps its in-class default.
+BlkDev::BlkDev(const std::string& devname)
+  : devname(devname)
+{}
+
+// Look up the device number for this object: fstat() on fd when one is set,
+// otherwise stat() on /dev/<devname>.  For a block special file the result is
+// st_rdev; for anything else it is st_dev, the device containing the file.
+// Returns 0 on success or -errno if the stat call fails.
+int BlkDev::get_devid(dev_t *id) const
+{
+  struct stat info;
+  int rc;
+  if (fd < 0) {
+    char dev_path[PATH_MAX];
+    snprintf(dev_path, sizeof(dev_path), "/dev/%s", devname.c_str());
+    rc = stat(dev_path, &info);
+  } else {
+    rc = fstat(fd, &info);
+  }
+  if (rc < 0) {
+    return -errno;
+  }
+
+  if (S_ISBLK(info.st_mode)) {
+    *id = info.st_rdev;
+  } else {
+    *id = info.st_dev;
+  }
+  return 0;
+}
+
+#ifdef __linux__
+
+// Root of the sysfs tree used when building property paths.  Declared virtual
+// in the class so tests can point it somewhere else.
+const char *BlkDev::sysfsdir() const {
+  return "/sys";
+}
+
+// Query the size in bytes of the device behind fd and store it in *psize.
+// BLKGETSIZE64 is used when the headers define it; otherwise BLKGETSIZE is
+// used and its result is multiplied by 512.
+// Returns the ioctl result on success or -errno on failure.
+int BlkDev::get_size(int64_t *psize) const
+{
+#ifdef BLKGETSIZE64
+  int ret = ::ioctl(fd, BLKGETSIZE64, psize);
+#elif defined(BLKGETSIZE)
+  unsigned long sectors = 0;
+  int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
+  // note: *psize is assigned here whether or not the ioctl succeeded
+  *psize = sectors * 512ULL;
+#else
+// cppcheck-suppress preprocessorErrorDirective
+# error "Linux configuration error (get_size)"
+#endif
+  if (ret < 0)
+    ret = -errno;
+  return ret;
+}
+
+/**
+ * get a block device property as a string
+ *
+ * The file read is <sysfsdir()>/block/<dev>/<prop>, where <dev> is the whole
+ * disk backing fd when an fd is set, and devname otherwise.  The first line
+ * of the file is stored in *val (fgets is given maxlen - 1 as its size) with
+ * the trailing newline removed.
+ *
+ * return 0 on success
+ * return negative error on error (-ERANGE if the path does not fit,
+ * -errno if the file cannot be opened, -EINVAL if nothing could be read)
+ */
+int64_t BlkDev::get_string_property(const char* prop,
+                                    char *val, size_t maxlen) const
+{
+  char wd[PATH_MAX];
+  const char* dev = devname.c_str();
+  if (fd >= 0) {
+    // sysfs isn't fully populated for partitions, so look up the sysfs entry
+    // for the underlying whole disk instead.
+    const int wd_rc = wholedisk(wd, sizeof(wd));
+    if (wd_rc < 0) {
+      return wd_rc;
+    }
+    dev = wd;
+  }
+
+  char filename[PATH_MAX];
+  const int needed = snprintf(filename, sizeof(filename), "%s/block/%s/%s",
+                              sysfsdir(), dev, prop);
+  if (needed >= static_cast<int>(sizeof(filename))) {
+    return -ERANGE;
+  }
+
+  FILE *fp = fopen(filename, "r");
+  if (fp == NULL) {
+    return -errno;
+  }
+
+  int r = -EINVAL;
+  if (fgets(val, maxlen - 1, fp)) {
+    // truncate at newline
+    if (char *newline = strchr(val, '\n'); newline != nullptr) {
+      *newline = 0;
+    }
+    r = 0;
+  }
+  fclose(fp);
+  return r;
+}
+
+/**
+ * get a block device property
+ *
+ * Only the leading run of decimal digits of the property text is parsed, so
+ * a value such as "-1" (no leading digit) is reported as 0.
+ *
+ * return the value (we assume it is positive)
+ * return negative error on error
+ */
+int64_t BlkDev::get_int_property(const char* prop) const
+{
+  char buff[256] = {0};
+  if (const int64_t r = get_string_property(prop, buff, sizeof(buff)); r < 0)
+    return r;
+
+  // take only digits; isdigit() needs an unsigned char value to be well
+  // defined for bytes >= 0x80
+  size_t ndigits = 0;
+  while (isdigit(static_cast<unsigned char>(buff[ndigits]))) {
+    ++ndigits;
+  }
+  buff[ndigits] = 0;
+
+  // keep the parsed value in 64 bits: storing it in an int would silently
+  // truncate anything above INT_MAX before it is returned as int64_t
+  char *endptr = nullptr;
+  const int64_t value = strtoll(buff, &endptr, 10);
+  if (endptr != buff + strlen(buff))
+    return -EINVAL;
+  return value;
+}
+
+// True when the queue/discard_granularity property is a positive number.
+// A read error (negative result) counts as "no discard support".
+bool BlkDev::support_discard() const
+{
+  return get_int_property("queue/discard_granularity") > 0;
+}
+
+// Issue BLKDISCARD on fd for the byte range [offset, offset + len).
+// Returns the raw ioctl() result; unlike get_size() the error is not
+// converted to -errno here.
+int BlkDev::discard(int64_t offset, int64_t len) const
+{
+  uint64_t range[2] = {(uint64_t)offset, (uint64_t)len};
+  return ioctl(fd, BLKDISCARD, range);
+}
+
+// Value of the queue/optimal_io_size property, or a negative error code.
+// The 64-bit property value is narrowed to int on return.
+int BlkDev::get_optimal_io_size() const
+{
+	return get_int_property("queue/optimal_io_size");
+}
+
+// True when the queue/rotational property is a positive number; a read error
+// is reported as non-rotational.
+bool BlkDev::is_rotational() const
+{
+  return get_int_property("queue/rotational") > 0;
+}
+
+// Read the device/device/numa_node property into *node.
+// Returns 0 on success; returns -1 and leaves *node untouched when the
+// property lookup yields a negative value.
+int BlkDev::get_numa_node(int *node) const
+{
+  const int numa = get_int_property("device/device/numa_node");
+  if (numa >= 0) {
+    *node = numa;
+    return 0;
+  }
+  return -1;
+}
+
+// The four accessors below copy one sysfs property into the caller's buffer
+// via get_string_property() and return its result (0 or a negative error).
+
+// "dev" property.
+int BlkDev::dev(char *dev, size_t max) const
+{
+  return get_string_property("dev", dev, max);
+}
+
+// "device/device/vendor" property.
+int BlkDev::vendor(char *vendor, size_t max) const
+{
+  return get_string_property("device/device/vendor", vendor, max);
+}
+
+// "device/model" property.
+int BlkDev::model(char *model, size_t max) const
+{
+  return get_string_property("device/model", model, max);
+}
+
+// "device/serial" property.
+int BlkDev::serial(char *serial, size_t max) const
+{
+  return get_string_property("device/serial", serial, max);
+}
+
+// Resolve this device's number to a device name with
+// blkid_devno_to_devname() and copy it into `partition` (capacity `max`
+// bytes).  The result is always NUL-terminated when max > 0; a name that does
+// not fit is truncated.
+// Returns 0 on success, -EINVAL if the device number or name lookup fails.
+int BlkDev::partition(char *partition, size_t max) const
+{
+  dev_t id;
+  int r = get_devid(&id);
+  if (r < 0)
+    return -EINVAL;  // hrm.
+
+  char *t = blkid_devno_to_devname(id);
+  if (!t) {
+    return -EINVAL;
+  }
+  if (max > 0) {
+    // strncpy() does not terminate the destination when the source is max
+    // bytes or longer, so reserve the final byte for the terminator.
+    strncpy(partition, t, max - 1);
+    partition[max - 1] = '\0';
+  }
+  free(t);
+  return 0;
+}
+
+// Resolve this device's number to the name of its whole disk using
+// blkid_devno_to_wholedisk(), writing up to `max` bytes into `device`.
+// Returns 0 on success and -EINVAL for any failure.
+int BlkDev::wholedisk(char *device, size_t max) const
+{
+  dev_t id;
+  if (get_devid(&id) < 0) {
+    return -EINVAL;  // hrm.
+  }
+  if (blkid_devno_to_wholedisk(id, device, max, nullptr) < 0) {
+    return -EINVAL;
+  }
+  return 0;
+}
+
+// Insert the name of every entry of directory `dir`, except "." and "..",
+// into *out.  Returns 0 on success or -errno if the directory cannot be
+// opened.
+static int easy_readdir(const std::string& dir, std::set<std::string> *out)
+{
+  DIR *handle = ::opendir(dir.c_str());
+  if (handle == nullptr) {
+    return -errno;
+  }
+  for (struct dirent *entry = ::readdir(handle);
+       entry != nullptr;
+       entry = ::readdir(handle)) {
+    const bool is_dot = strcmp(entry->d_name, ".") == 0;
+    const bool is_dotdot = strcmp(entry->d_name, "..") == 0;
+    if (!is_dot && !is_dotdot) {
+      out->insert(entry->d_name);
+    }
+  }
+  closedir(handle);
+  return 0;
+}
+
+// Collect into *ls every entry listed under /sys/block/<dev>/slaves,
+// following entries whose name starts with "dm-" recursively.  A directory
+// that cannot be read simply contributes nothing.
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{
+  const std::string slaves_dir = std::string("/sys/block/") + dev + "/slaves";
+  std::set<std::string> parents;
+  easy_readdir(slaves_dir, &parents);
+  for (const auto& parent : parents) {
+    ls->insert(parent);
+    // recurse in case it is dm-on-dm
+    if (parent.compare(0, 3, "dm-") == 0) {
+      get_dm_parents(parent, ls);
+    }
+  }
+}
+
+// Translate device `in` into the raw devices backing it and add them to *ls.
+// A "dm-" device is expanded through get_dm_parents() and each parent is
+// resolved recursively; any other device is replaced by its whole disk when
+// BlkDev::wholedisk() succeeds, and inserted unchanged otherwise.
+void get_raw_devices(const std::string& in,
+                     std::set<std::string> *ls)
+{
+  if (in.compare(0, 3, "dm-") != 0) {
+    BlkDev d(in);
+    std::string whole;
+    if (d.wholedisk(&whole) == 0) {
+      ls->insert(whole);
+    } else {
+      ls->insert(in);
+    }
+    return;
+  }
+
+  std::set<std::string> parents;
+  get_dm_parents(in, &parents);
+  for (const auto& parent : parents) {
+    get_raw_devices(parent, ls);
+  }
+}
+
+// Decode a udev ID_MODEL_ENC value: every "\x20" becomes a space, trailing
+// spaces are dropped, remaining spaces become '_', and runs of underscores
+// are collapsed to a single '_'.
+std::string _decode_model_enc(const std::string& in)
+{
+  auto v = boost::replace_all_copy(in, "\\x20", " ");
+  const auto last_kept = v.find_last_not_of(" ");
+  if (last_kept != v.npos) {
+    v.erase(last_kept + 1);
+  }
+  std::replace(v.begin(), v.end(), ' ', '_');
+
+  // remove "__", which seems to come up on my ubuntu box for some reason.
+  for (auto p = v.find("__"); p != std::string::npos; p = v.find("__")) {
+    v.replace(p, 2, "_");
+  }
+
+  return v;
+}
+
+// Build a "$vendor_$model_$serial"-style id for block device `devname`.
+//
+// udev is tried first, and if it doesn't yield an id we fall back to
+// reading /sys/block/$devname/device/(model/serial).
+//
+// Returns the id with every space replaced by '_'.  On failure returns an
+// empty string and, when `err` is non-null, stores a description in *err.
+std::string get_device_id(const std::string& devname,
+                          std::string *err)
+{
+  // The udev context is local to this call; a function-static pointer would
+  // be shared (and raced on) by concurrent callers.
+  struct udev *udev = udev_new();
+  if (!udev) {
+    if (err) {
+      *err = "udev_new failed";
+    }
+    return {};
+  }
+  struct udev_device *dev =
+    udev_device_new_from_subsystem_sysname(udev, "block", devname.c_str());
+  if (!dev) {
+    if (err) {
+      *err = std::string("udev_device_new_from_subsystem_sysname failed on '")
+        + devname + "'";
+    }
+    udev_unref(udev);
+    return {};
+  }
+
+  // ****
+  //   NOTE: please keep this implementation in sync with _get_device_id() in
+  //   src/ceph-volume/ceph_volume/util/device.py
+  // ****
+
+  // fetch one udev property as a string; a missing property yields ""
+  auto property = [dev](const char *key) -> std::string {
+    const char *value = udev_device_get_property_value(dev, key);
+    return value ? std::string(value) : std::string();
+  };
+
+  const std::string id_vendor = property("ID_VENDOR");
+  std::string id_model = property("ID_MODEL");
+  // sometimes, ID_MODEL is "LVM ..." but ID_MODEL_ENC is correct (but
+  // encoded with \x20 for space).
+  if (id_model.substr(0, 7) == "LVM PV ") {
+    const char *enc = udev_device_get_property_value(dev, "ID_MODEL_ENC");
+    if (enc) {
+      id_model = _decode_model_enc(enc);
+    } else {
+      // ignore ID_MODEL then
+      id_model.clear();
+    }
+  }
+  const std::string id_serial_short = property("ID_SERIAL_SHORT");
+  const std::string id_scsi_serial = property("ID_SCSI_SERIAL");
+  const std::string id_serial = property("ID_SERIAL");
+  udev_device_unref(dev);
+  udev_unref(udev);
+
+  // ID_SERIAL is usually $vendor_$model_$serial, but not always
+  // ID_SERIAL_SHORT is mostly always just the serial
+  // ID_MODEL is sometimes $vendor_$model, but
+  // ID_VENDOR is sometimes $vendor and ID_MODEL just $model and
+  // ID_SCSI_SERIAL the real serial number, with ID_SERIAL and
+  // ID_SERIAL_SHORT gibberish (ick)
+  std::string device_id;
+  if (id_vendor.size() && id_model.size() && id_scsi_serial.size()) {
+    device_id = id_vendor + '_' + id_model + '_' + id_scsi_serial;
+  } else if (id_model.size() && id_serial_short.size()) {
+    device_id = id_model + '_' + id_serial_short;
+  } else if (id_serial.size()) {
+    device_id = id_serial;
+    if (device_id.substr(0, 4) == "MTFD") {
+      // Micron NVMes hide the vendor
+      device_id = "Micron_" + device_id;
+    }
+  }
+  if (device_id.size()) {
+    std::replace(device_id.begin(), device_id.end(), ' ', '_');
+    return device_id;
+  }
+
+  // either udev_device_get_property_value() failed, or succeeded but
+  // returned nothing; trying to read from files.  note that the 'vendor'
+  // file rarely contains the actual vendor; it's usually 'ATA'.
+  std::string model, serial;
+  char buf[1024] = {0};
+  BlkDev blkdev(devname);
+  if (!blkdev.model(buf, sizeof(buf))) {
+    model = buf;
+  }
+  if (!blkdev.serial(buf, sizeof(buf))) {
+    serial = buf;
+  }
+
+  // An id needs both halves.  Fail whether or not the caller asked for an
+  // error string, so a caller passing err == nullptr never receives a
+  // half-empty id such as "_" or "model_".
+  if (model.empty() || serial.empty()) {
+    if (err) {
+      if (model.empty() && serial.empty()) {
+        *err = std::string("fallback method has no model nor serial");
+      } else if (model.empty()) {
+        *err = std::string("fallback method has serial '") + serial
+          + "' but no model'";
+      } else {
+        *err = std::string("fallback method has model '") + model
+          + "' but no serial'";
+      }
+    }
+    return {};
+  }
+
+  device_id = model + "_" + serial;
+  std::replace(device_id.begin(), device_id.end(), ' ', '_');
+  return device_id;
+}
+
+// Best-effort vendor name for block device `devname`.
+//
+// Preference order: udev ID_VENDOR, the first word of udev ID_MODEL (both
+// lower-cased), then the sysfs vendor file, then the first word of the sysfs
+// model file (these two are returned as read).  Returns an empty string when
+// nothing is available.
+static std::string get_device_vendor(const std::string& devname)
+{
+  // text up to the first space; the whole string if there is no space or the
+  // space is the very first character
+  auto first_word = [](const std::string& text) {
+    const auto pos = text.find(' ');
+    if (pos != std::string::npos && pos > 0) {
+      return text.substr(0, pos);
+    }
+    return text;
+  };
+
+  // The udev context is local to this call; a function-static pointer would
+  // be shared (and raced on) by concurrent callers.
+  struct udev *udev = udev_new();
+  if (!udev) {
+    return {};
+  }
+  struct udev_device *dev =
+    udev_device_new_from_subsystem_sysname(udev, "block", devname.c_str());
+  if (!dev) {
+    udev_unref(udev);
+    return {};
+  }
+
+  std::string id_vendor, id_model;
+  if (const char *data = udev_device_get_property_value(dev, "ID_VENDOR");
+      data) {
+    id_vendor = data;
+  }
+  if (const char *data = udev_device_get_property_value(dev, "ID_MODEL");
+      data) {
+    id_model = data;
+  }
+  udev_device_unref(dev);
+  udev_unref(udev);
+
+  std::transform(id_vendor.begin(), id_vendor.end(), id_vendor.begin(),
+                 ::tolower);
+  std::transform(id_model.begin(), id_model.end(), id_model.begin(),
+                 ::tolower);
+
+  if (id_vendor.size()) {
+    return id_vendor;
+  }
+  if (id_model.size()) {
+    return first_word(id_model);
+  }
+
+  // udev had nothing useful; fall back to the sysfs files
+  std::string vendor, model;
+  char buf[1024] = {0};
+  BlkDev blkdev(devname);
+  if (!blkdev.vendor(buf, sizeof(buf))) {
+    vendor = buf;
+  }
+  if (!blkdev.model(buf, sizeof(buf))) {
+    model = buf;
+  }
+  if (vendor.size()) {
+    return vendor;
+  }
+  if (model.size()) {
+    return first_word(model);
+  }
+
+  return {};
+}
+
+// Run `sudo nvme <vendor> smart-log-add --json /dev/<devname>` with the given
+// timeout and hand its stdout back in *result.
+//
+// Returns 0 with the command output in *result on success.  On failure
+// *result holds an error description and the return value is the spawn
+// error, the negative read error, or -EINVAL if the command exits non-zero.
+static int block_device_run_vendor_nvme(
+  const string& devname, const string& vendor, int timeout,
+  std::string *result)
+{
+  string device = "/dev/" + devname;
+
+  // stdin and stderr are closed, only stdout is piped back to us
+  SubProcessTimed nvmecli(
+    "sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE,
+    timeout);
+  nvmecli.add_cmd_args(
+    "nvme",
+    vendor.c_str(),
+    "smart-log-add",
+    "--json",
+    device.c_str(),
+    NULL);
+  int ret = nvmecli.spawn();
+  if (ret != 0) {
+    *result = std::string("error spawning nvme command: ") + nvmecli.err();
+    return ret;
+  }
+
+  // read at most 100 KiB of output
+  bufferlist output;
+  ret = output.read_fd(nvmecli.get_stdout(), 100*1024);
+  if (ret < 0) {
+    // NOTE(review): stderr was configured as CLOSE above, yet it is read
+    // here -- confirm get_stderr() is usable in that configuration.
+    bufferlist err;
+    err.read_fd(nvmecli.get_stderr(), 100 * 1024);
+    *result = std::string("failed to execute nvme: ") + err.to_str();
+  } else {
+    ret = 0;
+    *result = output.to_str();
+  }
+
+  // always join the child; a non-zero exit overrides whatever was read
+  if (nvmecli.join() != 0) {
+    *result = std::string("nvme returned an error: ") + nvmecli.err();
+    return -EINVAL;
+  }
+
+  return ret;
+}
+
+// Find the /dev/disk/by-path symlink whose target ends in "/<devname>".
+//
+// Returns the full path of the first matching symlink.  If none matches, or
+// the directory cannot be listed, returns an empty string and, when `err` is
+// non-null, stores a description in *err.
+std::string get_device_path(const std::string& devname,
+                            std::string *err)
+{
+  std::set<std::string> links;
+  int r = easy_readdir("/dev/disk/by-path", &links);
+  if (r < 0) {
+    // err defaults to null in the declaration, so it must be checked
+    if (err) {
+      *err = "unable to list contents of /dev/disk/by-path: "s +
+        cpp_strerror(r);
+    }
+    return {};
+  }
+  for (const auto& link : links) {
+    char fn[PATH_MAX];
+    char target[PATH_MAX+1];
+    snprintf(fn, sizeof(fn), "/dev/disk/by-path/%s", link.c_str());
+    const ssize_t len = readlink(fn, target, sizeof(target));
+    // skip unreadable links and targets that filled the whole buffer
+    if (len < 0 || len >= static_cast<ssize_t>(sizeof(target)))
+      continue;
+    target[len] = 0;
+
+    // the target must be "<something>/<devname>"
+    const size_t target_len = static_cast<size_t>(len);
+    if (target_len > devname.size() + 1 &&
+        strcmp(target + target_len - devname.size(), devname.c_str()) == 0 &&
+        target[target_len - devname.size() - 1] == '/') {
+      return fn;
+    }
+  }
+  if (err) {
+    *err = "no symlink to "s + devname + " in /dev/disk/by-path";
+  }
+  return {};
+}
+
+// Run `sudo smartctl -x --json=o /dev/<devname>` with the given timeout and
+// hand its stdout back in *result.
+//
+// Returns 0 with the output in *result on success.  On failure *result holds
+// an error description and the return value is the spawn error, the negative
+// read error, or -EINVAL when bit 0 or bit 1 of the exit status is set.
+static int block_device_run_smartctl(const string& devname, int timeout,
+				     std::string *result)
+{
+  string device = "/dev/" + devname;
+
+  // when using --json, smartctl will report its errors in JSON format to stdout
+  SubProcessTimed smartctl(
+    "sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE,
+    timeout);
+  smartctl.add_cmd_args(
+    "smartctl",
+    //"-a",    // all SMART info
+    "-x",    // all SMART and non-SMART info
+    "--json=o",
+    device.c_str(),
+    NULL);
+
+  int ret = smartctl.spawn();
+  if (ret != 0) {
+    *result = std::string("error spawning smartctl: ") + smartctl.err();
+    return ret;
+  }
+
+  // read at most 100 KiB of output
+  bufferlist output;
+  ret = output.read_fd(smartctl.get_stdout(), 100*1024);
+  if (ret < 0) {
+    *result = std::string("failed read smartctl output: ") + cpp_strerror(-ret);
+  } else {
+    ret = 0;
+    *result = output.to_str();
+  }
+
+  int joinerr = smartctl.join();
+  // Bit 0: Command line did not parse.
+  // Bit 1: Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode (see '-n' option above).
+  // Bit 2: Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure (see '-b' option above).
+  // Bit 3: SMART status check returned "DISK FAILING".
+  // Bit 4: We found prefail Attributes <= threshold.
+  // Bit 5: SMART status check returned "DISK OK" but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past.
+  // Bit 6: The device error log contains records of errors.
+  // Bit 7: The device self-test log contains records of errors.  [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored.
+  //
+  // Only bits 0 and 1 are treated as a failure of the command itself; the
+  // remaining bits describe the disk and still come with usable output.
+  if (joinerr & 3) {
+    *result = "smartctl returned an error ("s + stringify(joinerr) +
+      "): stderr:\n"s + smartctl.err() + "\nstdout:\n"s + *result;
+    return -EINVAL;
+  }
+
+  return ret;
+}
+
+// Return a copy of `s` that is safe to place between double quotes in a JSON
+// string: every '"' and every '\' is prefixed with a backslash.  Other
+// characters, including control characters such as newlines, are copied
+// unchanged.
+static std::string escape_quotes(const std::string& s)
+{
+  std::string escaped;
+  escaped.reserve(s.size());
+  for (const char c : s) {
+    // the backslash must be escaped as well, otherwise an input ending in
+    // '\' would swallow the closing quote of the surrounding JSON string
+    if (c == '"' || c == '\\') {
+      escaped.push_back('\\');
+    }
+    escaped.push_back(c);
+  }
+  return escaped;
+}
+
+// Collect health metrics for /dev/<devname> into *result as a JSON object.
+//
+// smartctl is run first; if it fails or prints something that is not JSON, a
+// small JSON error object is generated in its place.  When a vendor can be
+// determined, the vendor-specific nvme smart-log-add output (or its error) is
+// attached under "nvme_smart_health_information_add_log*" keys.
+// Returns 0 on success, -EINVAL if no JSON could be parsed at all.
+int block_device_get_metrics(const string& devname, int timeout,
+                             json_spirit::mValue *result)
+{
+  std::string s;
+
+  // smartctl
+  const int smartctl_rc = block_device_run_smartctl(devname, timeout, &s);
+  if (smartctl_rc != 0) {
+    const string orig = s;
+    s = "{\"error\": \"smartctl failed\", \"dev\": \"/dev/";
+    s += devname;
+    s += "\", \"smartctl_error_code\": " + stringify(smartctl_rc);
+    s += ", \"smartctl_output\": \"" + escape_quotes(orig);
+    s += "\"}";
+  } else if (!json_spirit::read(s, *result)) {
+    const string orig = s;
+    s = "{\"error\": \"smartctl returned invalid JSON\", \"dev\": \"/dev/";
+    s += devname;
+    s += "\",\"output\":\"";
+    s += escape_quotes(orig);
+    s += "\"}";
+  }
+  // parse whatever s holds now: the smartctl output or a generated error
+  if (!json_spirit::read(s, *result)) {
+    return -EINVAL;
+  }
+
+  json_spirit::mObject& base = result->get_obj();
+  const string vendor = get_device_vendor(devname);
+  if (vendor.empty()) {
+    base["nvme_vendor"] = "unknown";
+    return 0;
+  }
+
+  base["nvme_vendor"] = vendor;
+  s.clear();
+  json_spirit::mValue nvme_json;
+  const int nvme_rc = block_device_run_vendor_nvme(devname, vendor, timeout, &s);
+  if (nvme_rc != 0) {
+    base["nvme_smart_health_information_add_log_error_code"] = nvme_rc;
+    base["nvme_smart_health_information_add_log_error"] = s;
+  } else if (json_spirit::read(s, nvme_json)) {
+    base["nvme_smart_health_information_add_log"] = nvme_json;
+  } else {
+    base["nvme_smart_health_information_add_log_error"] = "bad json output: "
+      + s;
+  }
+
+  return 0;
+}
+
+#elif defined(__APPLE__)
+#include <sys/disk.h>
+
+// There is no sysfs on Apple platforms; reaching this is a programming error.
+const char *BlkDev::sysfsdir() const {
+  assert(false);  // Should never be called on Apple
+  return "";
+}
+
+// Write the st_rdev of fd as a decimal number into `dev` (at most `max`
+// bytes including the terminator).  Returns 0, or -errno if fstat() fails.
+int BlkDev::dev(char *dev, size_t max) const
+{
+  struct stat sb;
+
+  if (fstat(fd, &sb) < 0)
+    return -errno;
+
+  snprintf(dev, max, "%" PRIu64, (uint64_t)sb.st_rdev);
+
+  return 0;
+}
+
+// Compute the device size in bytes as block count times block size, using
+// the DKIOCGETBLOCKSIZE and DKIOCGETBLOCKCOUNT ioctls on fd.  *psize is only
+// written when both ioctls return 0.
+// Returns the ioctl result on success or -errno on failure.
+int BlkDev::get_size(int64_t *psize) const
+{
+  unsigned long blocksize = 0;
+  int ret = ::ioctl(fd, DKIOCGETBLOCKSIZE, &blocksize);
+  if (!ret) {
+    unsigned long nblocks;
+    ret = ::ioctl(fd, DKIOCGETBLOCKCOUNT, &nblocks);
+    if (!ret)
+      *psize = (int64_t)nblocks * blocksize;
+  }
+  if (ret < 0)
+    ret = -errno;
+  return ret;
+}
+
+// The Apple build has no property source: the functions below are stubs that
+// return a fixed value and never touch their output arguments.
+
+// Always 0.
+int64_t BlkDev::get_int_property(const char* prop) const
+{
+  return 0;
+}
+
+// Discard is reported as unsupported.
+bool BlkDev::support_discard() const
+{
+  return false;
+}
+
+// Always fails with -EOPNOTSUPP.
+int BlkDev::discard(int64_t offset, int64_t len) const
+{
+  return -EOPNOTSUPP;
+}
+
+// Always 0.
+int BlkDev::get_optimal_io_size() const
+{
+  return 0;
+}
+
+// Every device is reported as non-rotational.
+bool BlkDev::is_rotational() const
+{
+  return false;
+}
+
+// Always fails with -1; *node is not written.
+int BlkDev::get_numa_node(int *node) const
+{
+  return -1;
+}
+
+// Always fails with -EOPNOTSUPP; the buffer is not written.
+int BlkDev::model(char *model, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+// Always fails with -EOPNOTSUPP; the buffer is not written.
+int BlkDev::serial(char *serial, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+// Always fails with -EOPNOTSUPP; the buffer is not written.
+int BlkDev::partition(char *partition, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+// Whole-disk lookup is not implemented on Apple platforms.
+// Always fails with -EOPNOTSUPP, like the other stubs in this section; the
+// buffer is not written.  (Falling off the end of a non-void function, as the
+// empty body did, is undefined behaviour.)
+int BlkDev::wholedisk(char *device, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+
+// Not implemented on Apple: *ls is left unchanged.
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{
+}
+
+// Not implemented on Apple: *ls is left unchanged.
+void get_raw_devices(const std::string& in,
+		     std::set<std::string> *ls)
+{
+}
+
+// Device ids are not available on Apple platforms.
+// Always returns an empty string; *err is set when err is non-null.
+std::string get_device_id(const std::string& devname,
+                          std::string *err)
+{
+  // FIXME: implement me
+  if (err != nullptr) {
+    err->assign("not implemented");
+  }
+  return {};
+}
+
+// Device paths are not available on Apple platforms.
+// Always returns an empty string; *err is set when err is non-null.
+std::string get_device_path(const std::string& devname,
+                            std::string *err)
+{
+  // FIXME: implement me
+  if (err != nullptr) {
+    err->assign("not implemented");
+  }
+  return {};
+}
+
+#elif defined(__FreeBSD__)
+
+// There is no sysfs on FreeBSD; reaching this is a programming error.
+const char *BlkDev::sysfsdir() const {
+  assert(false);  // Should never be called on FreeBSD
+  return "";
+}
+
+// Write the st_rdev of fd as a decimal number into `dev` (at most `max`
+// bytes including the terminator).  Returns 0, or -errno if fstat() fails.
+int BlkDev::dev(char *dev, size_t max) const
+{
+  struct stat sb;
+
+  if (fstat(fd, &sb) < 0)
+    return -errno;
+
+  snprintf(dev, max, "%" PRIu64, (uint64_t)sb.st_rdev);
+
+  return 0;
+}
+
+// Store the result of the DIOCGMEDIASIZE ioctl on fd in *psize.
+// Returns the ioctl result on success or -errno on failure.
+int BlkDev::get_size(int64_t *psize) const
+{
+  int ret = ::ioctl(fd, DIOCGMEDIASIZE, psize);
+  if (ret < 0)
+    ret = -errno;
+  return ret;
+}
+
+// No property source on FreeBSD: always 0, whatever `prop` is.
+int64_t BlkDev::get_int_property(const char* prop) const
+{
+  return 0;
+}
+
+// Report whether the device can discard blocks.  Only when built with
+// FREEBSD_WITH_TRIM is the GEOM::candelete attribute queried through
+// DIOCGATTR; otherwise, or if the ioctl fails, the answer is false.
+bool BlkDev::support_discard() const
+{
+#ifdef FREEBSD_WITH_TRIM
+  // there is no point to claim support of discard, but
+  // unable to do so.
+  struct diocgattr_arg arg;
+
+  strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
+  arg.len = sizeof(arg.value.i);
+  if (ioctl(fd, DIOCGATTR, &arg) == 0) {
+    return (arg.value.i != 0);
+  } else {
+    return false;
+  }
+#endif
+  return false;
+}
+
+// Discard is not implemented on FreeBSD: always fails with -EOPNOTSUPP.
+int BlkDev::discard(int64_t offset, int64_t len) const
+{
+  return -EOPNOTSUPP;
+}
+
+// No optimal I/O size is available on FreeBSD: always 0.
+int BlkDev::get_optimal_io_size() const
+{
+  return 0;
+}
+
+// Report whether the device is a rotating disk.  On FreeBSD versions from
+// 1200049 the GEOM::rotation_rate attribute is queried; only an explicit
+// DISK_RR_NON_ROTATING answer yields false.  A failed ioctl, an unknown or
+// out-of-range rate, and older FreeBSD versions all yield true.
+bool BlkDev::is_rotational() const
+{
+#if __FreeBSD_version >= 1200049
+  struct diocgattr_arg arg;
+
+  strlcpy(arg.name, "GEOM::rotation_rate", sizeof(arg.name));
+  arg.len = sizeof(arg.value.u16);
+
+  int ioctl_ret = ioctl(fd, DIOCGATTR, &arg);
+  bool ret;
+  if (ioctl_ret < 0 || arg.value.u16 == DISK_RR_UNKNOWN)
+    // DISK_RR_UNKNOWN usually indicates an old drive, which is usually spinny
+    ret = true;
+  else if (arg.value.u16 == DISK_RR_NON_ROTATING)
+    ret = false;
+  else if (arg.value.u16 >= DISK_RR_MIN && arg.value.u16 <= DISK_RR_MAX)
+    ret = true;
+  else
+    ret = true;     // Invalid value.  Probably spinny?
+
+  return ret;
+#else
+  return true;      // When in doubt, it's probably spinny
+#endif
+}
+
+// Read the device/device/numa_node property into *node.
+// Returns 0 on success; returns -1 and leaves *node untouched when the
+// property lookup yields a negative value.
+int BlkDev::get_numa_node(int *node) const
+{
+  const int numa = get_int_property("device/device/numa_node");
+  if (numa >= 0) {
+    *node = numa;
+    return 0;
+  }
+  return -1;
+}
+
+// Copy the device model, derived from the GEOM::descr attribute of fd, into
+// `model` (at most `max` bytes including the terminator).
+// Returns 0 on success or -errno if the DIOCGATTR ioctl fails.
+int BlkDev::model(char *model, size_t max) const
+{
+  struct diocgattr_arg arg;
+
+  strlcpy(arg.name, "GEOM::descr", sizeof(arg.name));
+  arg.len = sizeof(arg.value.str);
+  if (ioctl(fd, DIOCGATTR, &arg) < 0) {
+    return -errno;
+  }
+
+  // The GEOM description is of the form "vendor product" for SCSI disks
+  // and "ATA device_model" for ATA disks.  Some vendors choose to put the
+  // vendor name in device_model, and some don't.  Strip the first bit.
+  const char *text = "";
+  char *rest = arg.value.str;
+  if (*rest != '\0') {
+    const char *first = strsep(&rest, " ");
+    // strsep() sets rest to NULL when the description has no space; passing
+    // NULL to %s is undefined, so keep the whole description in that case.
+    text = (rest != NULL) ? rest : first;
+  }
+  snprintf(model, max, "%s", text);
+
+  return 0;
+}
+
+// Copy the identifier returned by the DIOCGIDENT ioctl on fd into `serial`
+// (at most `max` bytes including the terminator).
+// Returns 0 on success or -errno if the ioctl fails.
+int BlkDev::serial(char *serial, size_t max) const
+{
+  char ident[DISK_IDENT_SIZE];
+
+  if (ioctl(fd, DIOCGIDENT, ident) < 0)
+    return -errno;
+
+  snprintf(serial, max, "%s", ident);
+
+  return 0;
+}
+
+// Not implemented on FreeBSD: *ls is left unchanged.
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{
+}
+
+// Not implemented on FreeBSD: *ls is left unchanged.
+void get_raw_devices(const std::string& in,
+		     std::set<std::string> *ls)
+{
+}
+
+// Device ids are not available on FreeBSD.
+// Always returns an empty string; *err is set when err is non-null.
+std::string get_device_id(const std::string& devname,
+                          std::string *err)
+{
+  // FIXME: implement me for freebsd
+  if (err != nullptr) {
+    err->assign("not implemented for FreeBSD");
+  }
+  return {};
+}
+
+// Device paths are not available on FreeBSD.
+// Always returns an empty string; *err is set when err is non-null.
+std::string get_device_path(const std::string& devname,
+                            std::string *err)
+{
+  // FIXME: implement me for freebsd
+  if (err != nullptr) {
+    err->assign("not implemented for FreeBSD");
+  }
+  return {};
+}
+
+// The three functions below are FreeBSD placeholders: each fails with
+// -EOPNOTSUPP and leaves *result untouched.
+
+// smartctl is not run on FreeBSD.
+int block_device_run_smartctl(const char *device, int timeout,
+			      std::string *result)
+{
+  // FIXME: implement me for freebsd
+  return -EOPNOTSUPP;  
+}
+
+// No device metrics are collected on FreeBSD.
+int block_device_get_metrics(const string& devname, int timeout,
+                             json_spirit::mValue *result)
+{
+  // FIXME: implement me for freebsd
+  return -EOPNOTSUPP;  
+}
+
+// The nvme tool is not run on FreeBSD.
+int block_device_run_nvme(const char *device, const char *vendor, int timeout,
+             std::string *result)
+{
+  return -EOPNOTSUPP;
+}
+
+// Ask the kernel, via the FIODGNAME ioctl, for the device name behind fd and
+// store it in `devname` (capacity `max` bytes).
+// Returns 0 on success or -errno if the ioctl fails.
+static int block_device_devname(int fd, char *devname, size_t max)
+{
+  struct fiodgname_arg arg;
+
+  arg.buf = devname;
+  arg.len = max;
+  if (ioctl(fd, FIODGNAME, &arg) < 0)
+    return -errno;
+  return 0;
+}
+
+// Write "/dev/<name>" into `partition` (at most `max` bytes including the
+// terminator), where <name> is the device name of fd.
+// Returns 0 on success or the negative error from the name lookup.
+int BlkDev::partition(char *partition, size_t max) const
+{
+  char devname[PATH_MAX];
+
+  // propagate the helper's own error code rather than re-reading errno
+  if (const int r = block_device_devname(fd, devname, sizeof(devname)); r < 0)
+    return r;
+  snprintf(partition, max, "/dev/%s", devname);
+  return 0;
+}
+
+// Derive the whole-disk name from the device name of fd by keeping everything
+// up to and including the first run of digits (the unit number) and dropping
+// what follows.  The result is written to `wd`, never using more than `max`
+// bytes including the terminator.
+// Returns 0 on success or the negative error from the name lookup.
+int BlkDev::wholedisk(char *wd, size_t max) const
+{
+  char devname[PATH_MAX];
+
+  // propagate the helper's own error code rather than re-reading errno
+  if (const int r = block_device_devname(fd, devname, sizeof(devname)); r < 0)
+    return r;
+
+  size_t first_digit = strcspn(devname, "0123456789");
+  // first_digit now indexes the first digit or null character of devname
+  size_t next_nondigit = strspn(&devname[first_digit], "0123456789");
+  next_nondigit += first_digit;
+  // next_nondigit now indexes the first alphabetic or null character after the
+  // unit number
+
+  // strlcpy()'s size argument bounds the destination, so it must never exceed
+  // the caller's buffer even when the prefix itself is longer
+  size_t copy_size = next_nondigit + 1;
+  if (copy_size > max) {
+    copy_size = max;
+  }
+  strlcpy(wd, devname, copy_size);
+  return 0;
+}
+
+#else
+
+// Fallback implementations for platforms that are neither Linux, Apple nor
+// FreeBSD: nothing here inspects a device.
+
+// There is no sysfs here; reaching this is a programming error.
+const char *BlkDev::sysfsdir() const {
+  assert(false);  // Should never be called on non-Linux
+  return "";
+}
+
+// Always fails with -EOPNOTSUPP; the buffer is not written.
+int BlkDev::dev(char *dev, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+// Always fails with -EOPNOTSUPP; *psize is not written.
+int BlkDev::get_size(int64_t *psize) const
+{
+  return -EOPNOTSUPP;
+}
+
+// Discard is reported as unsupported.
+bool BlkDev::support_discard() const
+{
+  return false;
+}
+
+// Discard is not supported on this platform: always fails with -EOPNOTSUPP.
+// The parameter list matches the class declaration,
+// `int discard(int64_t offset, int64_t len) const`; the extra leading `int fd`
+// parameter defined a member that does not exist and could not compile.
+int BlkDev::discard(int64_t offset, int64_t len) const
+{
+  return -EOPNOTSUPP;
+}
+
+// Rotational state cannot be queried on this platform: always false.
+// The parameter list matches the class declaration,
+// `bool is_rotational() const`; the extra `const char *devname` parameter
+// defined a member that does not exist and could not compile.
+bool BlkDev::is_rotational() const
+{
+  return false;
+}
+
+// The four lookups below are unsupported on this platform: each fails with
+// -EOPNOTSUPP and leaves the caller's buffer untouched.
+
+int BlkDev::model(char *model, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+int BlkDev::serial(char *serial, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+int BlkDev::partition(char *partition, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+int BlkDev::wholedisk(char *wd, size_t max) const
+{
+  return -EOPNOTSUPP;
+}
+
+// Not implemented on this platform: *ls is left unchanged.
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{
+}
+
+// Not implemented on this platform: *ls is left unchanged.
+void get_raw_devices(const std::string& in,
+		     std::set<std::string> *ls)
+{
+}
+
+// Device ids are not available on this platform.
+// Always returns an empty string; *err is set when err is non-null.
+std::string get_device_id(const std::string& devname,
+                          std::string *err)
+{
+  // not implemented
+  if (err != nullptr) {
+    err->assign("not implemented");
+  }
+  return {};
+}
+
+// Device paths are not available on this platform.
+// Always returns an empty string; *err is set when err is non-null.
+std::string get_device_path(const std::string& devname,
+                            std::string *err)
+{
+  // not implemented
+  if (err != nullptr) {
+    err->assign("not implemented");
+  }
+  return {};
+}
+
+// The three functions below are placeholders on this platform: each fails
+// with -EOPNOTSUPP and leaves *result untouched.
+
+// smartctl is not run here.
+int block_device_run_smartctl(const char *device, int timeout,
+			      std::string *result)
+{
+  return -EOPNOTSUPP;
+}
+
+// No device metrics are collected here.
+int block_device_get_metrics(const string& devname, int timeout,
+                             json_spirit::mValue *result)
+{
+  return -EOPNOTSUPP;
+}
+
+// The nvme tool is not run here.
+int block_device_run_nvme(const char *device, const char *vendor, int timeout,
+            std::string *result)
+{
+  return -EOPNOTSUPP;
+}
+
+#endif
+
+
+
+// Populate the daemon metadata map *pm with information about `devnames`:
+//   "devices"      - stringify(devnames)
+//   "device_ids"   - comma-separated "<dev>=<id>" pairs
+//   "device_paths" - comma-separated "<dev>=<path>" pairs
+// For every device whose id or path cannot be determined, a message is
+// recorded in (*errs)[dev]; when both lookups fail the path message is
+// appended to the id message.
+void get_device_metadata(
+  const std::set<std::string>& devnames,
+  std::map<std::string,std::string> *pm,
+  std::map<std::string,std::string> *errs)
+{
+  (*pm)["devices"] = stringify(devnames);
+  string &devids = (*pm)["device_ids"];
+  string &devpaths = (*pm)["device_paths"];
+  for (auto& dev : devnames) {
+    string id_err;
+    const string id = get_device_id(dev, &id_err);
+    if (id.size()) {
+      if (!devids.empty()) {
+        devids += ",";
+      }
+      devids += dev + "=" + id;
+    } else {
+      (*errs)[dev] = " no unique device id for "s + dev + ": " + id_err;
+    }
+
+    // use a separate error string so a message left over from the id lookup
+    // can never be reported as the reason for a path failure
+    string path_err;
+    const string path = get_device_path(dev, &path_err);
+    if (path.size()) {
+      if (!devpaths.empty()) {
+        devpaths += ",";
+      }
+      devpaths += dev + "=" + path;
+    } else {
+      // `+=` rather than a bare `+`: the bare expression built the message
+      // and threw it away, so path errors were never recorded
+      (*errs)[dev] += " no unique device path for "s + dev + ": " + path_err;
+    }
+  }
+}
diff --git a/src/common/blkdev.h b/src/common/blkdev.h
new file mode 100644
index 000000000..369cbc204
--- /dev/null
+++ b/src/common/blkdev.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_COMMON_BLKDEV_H
+#define __CEPH_COMMON_BLKDEV_H
+
+#include <set>
+#include <map>
+#include <string>
+#include "json_spirit/json_spirit_value.h"
+
+extern int get_device_by_path(const char *path, char* partition, char* device, size_t max);
+
+extern std::string _decode_model_enc(const std::string& in);  // helper, exported only so we can unit test
+
+// get $vendor_$model_$serial style device id
+extern std::string get_device_id(const std::string& devname,
+				 std::string *err=0);
+
+// get /dev/disk/by-path/... style device id that is stable for a disk slot across reboots etc
+extern std::string get_device_path(const std::string& devname,
+				   std::string *err=0);
+
+// populate daemon metadata map with device info
+extern void get_device_metadata(
+  const std::set<std::string>& devnames,
+  std::map<std::string,std::string> *pm,
+  std::map<std::string,std::string> *errs);
+
+extern void get_dm_parents(const std::string& dev, std::set<std::string> *ls);
+extern int block_device_get_metrics(const std::string& devname, int timeout,
+				    json_spirit::mValue *result);
+
+// do everything to translate a device to the raw physical devices that
+// back it, including partitions -> wholedisks and dm -> constituent devices.
+extern void get_raw_devices(const std::string& in,
+			    std::set<std::string> *ls);
+
+// Handle for querying a block device.  An instance is built either from an
+// open file descriptor or from a device name (e.g. "sdb"); the constructor
+// used decides which of the two members below is set.  The descriptor is
+// only stored: nothing declared here closes it.
+class BlkDev {
+public:
+  BlkDev(int fd);
+  BlkDev(const std::string& devname);
+  /* GoogleMock requires a virtual destructor */
+  virtual ~BlkDev() {}
+
+  // from an fd
+  int discard(int64_t offset, int64_t len) const;
+  int get_size(int64_t *psize) const;
+  int get_devid(dev_t *id) const;
+  int partition(char* partition, size_t max) const;
+  // from a device (e.g., "sdb")
+  bool support_discard() const;
+  int get_optimal_io_size() const;
+  bool is_rotational() const;
+  int get_numa_node(int *node) const;
+  // string lookups: fill a caller-supplied buffer of `max` bytes and return
+  // 0 or a negative error
+  int dev(char *dev, size_t max) const;
+  int vendor(char *vendor, size_t max) const;
+  int model(char *model, size_t max) const;
+  int serial(char *serial, size_t max) const;
+
+  /* virtual for testing purposes */
+  virtual const char *sysfsdir() const;
+  virtual int wholedisk(char* device, size_t max) const;
+  // Convenience overload: runs the buffer-based wholedisk() on a PATH_MAX
+  // sized scratch buffer and assigns the result to *s.  *s is only written
+  // when the call does not return a negative value.
+  int wholedisk(std::string *s) const {
+    char out[PATH_MAX] = {0};
+    int r = wholedisk(out, sizeof(out));
+    if (r < 0) {
+      return r;
+    }
+    *s = out;
+    return r;
+  }
+
+protected:
+  // helpers that read one named property of the device
+  int64_t get_int_property(const char* prop) const;
+  int64_t get_string_property(const char* prop, char *val,
+    size_t maxlen) const;
+
+private:
+  int fd = -1;          // -1 unless constructed from a descriptor
+  std::string devname;  // empty unless constructed from a name
+};
+
+#endif
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
new file mode 100644
index 000000000..4bacb5473
--- /dev/null
+++ b/src/common/bloom_filter.cc
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/bloom_filter.hpp"
+
+#include <bit>
+#include <numeric>
+
+using ceph::bufferlist;
+using ceph::bufferptr;
+using ceph::Formatter;
+
+double bloom_filter::density() const
+{
+  // fraction of bits set; an empty table reports 0 instead of 0/0 == NaN
+  if (table_size_ == 0)
+    return 0.0;
+  // count in size_t: an unsigned counter could wrap for tables >= 512MiB
+  std::size_t set = 0;
+  for (std::size_t i = 0; i < table_size_; ++i)
+    set += std::popcount(bit_table_[i]);
+  return (double)set / (table_size_ * sizeof(cell_type) * CHAR_BIT);
+}
+
+void bloom_filter::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 2, bl);
+  // the four scalars go out as u64, in this order; the salts themselves are
+  // not written, only the salt count and the seed
+  for (const std::size_t field : {salt_count_, insert_count_, target_element_count_, random_seed_})
+    encode(static_cast<uint64_t>(field), bl);
+  encode(bit_table_, bl);
+  ENCODE_FINISH(bl);
+}
+
+void bloom_filter::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(2, p);
+  auto next_u64 = [&p] {  // each scalar field was written as a u64
+    uint64_t raw;
+    decode(raw, p);
+    return raw;
+  };
+  salt_count_ = next_u64();
+  insert_count_ = next_u64();
+  target_element_count_ = next_u64();
+  random_seed_ = next_u64();
+  salt_.clear();  // salts are not on the wire: regenerate from count + seed
+  generate_unique_salt();
+  decode(bit_table_, p);
+  table_size_ = bit_table_.size();
+  DECODE_FINISH(p);
+}
+
+void bloom_filter::dump(Formatter *f) const
+{
+  f->dump_unsigned("salt_count", salt_count_);
+  f->dump_unsigned("table_size", table_size_);
+  f->dump_unsigned("insert_count", insert_count_);
+  f->dump_unsigned("target_element_count", target_element_count_);
+  f->dump_unsigned("random_seed", random_seed_);
+
+  f->open_array_section("salt_table");
+  for (const bloom_type salt : salt_)
+    f->dump_unsigned("salt", salt);
+  f->close_section();
+
+  // the raw table, one entry per byte
+  f->open_array_section("bit_table");
+  for (const cell_type cell : bit_table_)
+    f->dump_unsigned("byte", static_cast<unsigned>(cell));
+  f->close_section();
+}
+
+void bloom_filter::generate_test_instances(std::list<bloom_filter*>& ls)
+{
+  // one untouched filter, then a small and a larger one with a few keys each
+  ls.push_back(new bloom_filter(10, .5, 1));
+  auto *small = new bloom_filter(10, .5, 1);
+  for (const char *key : {"foo", "bar"})
+    small->insert(key);
+  ls.push_back(small);
+  auto *large = new bloom_filter(50, .5, 1);
+  for (const char *key : {"foo", "bar", "baz", "boof", "boogggg"})
+    large->insert(key);
+  ls.push_back(large);
+}
+
+
+void compressible_bloom_filter::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 2, bl);
+  bloom_filter::encode(bl);  // base filter state goes first
+
+  // then the recorded table sizes: a u32 count followed by u64 entries
+  uint32_t count = size_list.size();
+  encode(count, bl);
+  for (const std::size_t table_bytes : size_list)
+    encode(static_cast<uint64_t>(table_bytes), bl);
+
+  ENCODE_FINISH(bl);
+}
+
+void compressible_bloom_filter::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(2, p);
+  bloom_filter::decode(p);  // base filter state comes first
+
+  uint32_t count;
+  decode(count, p);
+  size_list.resize(count);
+  for (std::size_t& table_bytes : size_list) {
+    uint64_t raw;  // entries are u64 on the wire
+    decode(raw, p);
+    table_bytes = raw;
+  }
+
+  DECODE_FINISH(p);
+}
+
+void compressible_bloom_filter::dump(Formatter *f) const
+{
+  bloom_filter::dump(f);
+
+  // append the table sizes recorded at construction and by each compress()
+  f->open_array_section("table_sizes");
+  for (const std::size_t table_bytes : size_list)
+    f->dump_unsigned("size", static_cast<uint64_t>(table_bytes));
+  f->close_section();
+}
+
+void compressible_bloom_filter::generate_test_instances(std::list<compressible_bloom_filter*>& ls)
+{
+  ls.push_back(new compressible_bloom_filter(10, .5, 1));
+  auto *small = new compressible_bloom_filter(10, .5, 1);
+  for (const char *key : {"foo", "bar"})
+    small->insert(key);
+  ls.push_back(small);
+  auto *large = new compressible_bloom_filter(50, .5, 1);
+  for (const char *key : {"foo", "bar", "baz", "boof"})
+    large->insert(key);
+  large->compress(20);  // NOTE(review): compress() rejects ratios >= 1.0, so this call changes nothing
+  large->insert("boogggg");
+  ls.push_back(large);
+}
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
new file mode 100644
index 000000000..639516fe4
--- /dev/null
+++ b/src/common/bloom_filter.hpp
@@ -0,0 +1,585 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ *******************************************************************
+ *                                                                 *
+ *                        Open Bloom Filter                        *
+ *                                                                 *
+ * Author: Arash Partow - 2000                                     *
+ * URL: http://www.partow.net/programming/hashfunctions/index.html *
+ *                                                                 *
+ * Copyright notice:                                               *
+ * Free use of the Open Bloom Filter Library is permitted under    *
+ * the guidelines and in accordance with the most current version  *
+ * of the Boost Software License, Version 1.0                      *
+ * http://www.opensource.org/licenses/bsl1.0.html                  *
+ *                                                                 *
+ *******************************************************************
+*/
+
+
+#ifndef COMMON_BLOOM_FILTER_HPP
+#define COMMON_BLOOM_FILTER_HPP
+
+#include <cmath>
+
+#include "include/encoding.h"
+#include "include/mempool.h"
+
+static const unsigned char bit_mask[CHAR_BIT] = {  // bit_mask[i] has only bit i set
+  0x01,  //00000001
+  0x02,  //00000010
+  0x04,  //00000100
+  0x08,  //00001000
+  0x10,  //00010000
+  0x20,  //00100000
+  0x40,  //01000000
+  0x80   //10000000
+};  // eight initializers: written for CHAR_BIT == 8
+
+class bloom_filter
+{
+protected:
+
+  using bloom_type = unsigned int;
+  using cell_type = unsigned char;
+  using table_type = mempool::bloom_filter::vector<cell_type>;
+
+  std::vector<bloom_type> salt_;     ///< vector of salts
+  table_type          bit_table_;    ///< bit map
+  std::size_t         salt_count_;   ///< number of salts
+  std::size_t         table_size_;   ///< bit table size in bytes
+  std::size_t         insert_count_;  ///< insertion count
+  std::size_t         target_element_count_;  ///< target number of unique insertions
+  std::size_t         random_seed_;  ///< random seed
+
+public:
+
+  /// empty filter: no salts, no table; contains() always reports false
+  bloom_filter()
+    : salt_count_(0), table_size_(0), insert_count_(0),
+      target_element_count_(0), random_seed_(0)
+  {}
+
+  /// size the filter for an expected insert count and false positive
+  /// probability (must be > 0); a random_seed of 0 selects 0xA5A5A5A5
+  bloom_filter(const std::size_t& predicted_inserted_element_count,
+	       const double& false_positive_probability,
+	       const std::size_t& random_seed)
+    : insert_count_(0),
+      target_element_count_(predicted_inserted_element_count),
+      random_seed_(random_seed != 0 ? random_seed : 0xA5A5A5A5)
+  {
+    ceph_assert(false_positive_probability > 0.0);
+    const auto params = find_optimal_parameters(
+      predicted_inserted_element_count, false_positive_probability);
+    salt_count_ = params.first;
+    table_size_ = params.second;
+    init();
+  }
+
+  /// explicit salt count and table size in bytes; seed 0 selects 0xA5A5A5A5
+  bloom_filter(const std::size_t& salt_count,
+	       std::size_t table_size,
+	       const std::size_t& random_seed,
+	       std::size_t target_element_count)
+    : salt_count_(salt_count), table_size_(table_size), insert_count_(0),
+      target_element_count_(target_element_count),
+      random_seed_(random_seed != 0 ? random_seed : 0xA5A5A5A5)
+  {
+    init();
+  }
+
+  void init() {  // derive the salts, then grow the table to table_size_ zeroed bytes
+    generate_unique_salt();
+    bit_table_.resize(table_size_, cell_type{0});
+  }
+
+  /**
+   * copy constructor
+   *
+   * member-wise copy of the salts, bit table and counters; the compiler
+   * generated version does what the hand-written one did, without first
+   * default-constructing and then assigning every member.
+   */
+  bloom_filter(const bloom_filter& filter) = default;
+
+  /**
+   * copy assignment
+   *
+   * member-wise copy as well (self-assignment is harmless); defaulting it
+   * keeps it in step with the member list when fields are added.
+   *
+   * @returns reference to this filter
+   */
+  bloom_filter& operator = (const bloom_filter& filter) = default;
+
+  virtual ~bloom_filter() = default;
+
+  inline bool operator!() const
+  {  // true when the filter has no table at all
+    return table_size_ == 0;
+  }
+
+  inline void clear()
+  {  // forget every insertion; table size and salts are kept
+    for (cell_type& cell : bit_table_)
+      cell = 0;
+    insert_count_ = 0;
+  }
+
+  /**
+   * insert a u32 into the set
+   *
+   * NOTE: the internal hash is weak, so consecutive inputs do not reach the
+   * desired fpp.  Feed well-mixed values (e.g., rjhash(x) rather than x).
+   * NOTE(review): no table_size_ == 0 guard here, unlike contains().
+   *
+   * @param val integer value to insert
+   */
+  inline void insert(uint32_t val) {
+    for (const bloom_type salt : salt_) {
+      const auto where = compute_indices(hash_ap(val, salt));
+      bit_table_[where.first >> 3] |= bit_mask[where.second];
+    }
+    ++insert_count_;
+  }
+
+  /// insert a key of @p length bytes starting at @p key_begin
+  inline void insert(const unsigned char* key_begin, const std::size_t& length) {
+    for (const bloom_type salt : salt_) {
+      const auto where = compute_indices(hash_ap(key_begin, length, salt));
+      bit_table_[where.first >> 3] |= bit_mask[where.second];
+    }
+    ++insert_count_;
+  }
+
+  /// insert the bytes of @p key (without the terminating NUL)
+  inline void insert(const std::string& key) {
+    insert(reinterpret_cast<const unsigned char*>(key.c_str()), key.size());
+  }
+
+  /// insert @p length bytes of character data
+  inline void insert(const char* data, const std::size_t& length) {
+    insert(reinterpret_cast<const unsigned char*>(data), length);
+  }
+
+  /// insert every element of [begin, end); each one goes through whichever
+  /// single-argument overload matches its type
+  template<typename InputIterator>
+  inline void insert(const InputIterator begin, const InputIterator end)
+  {
+    for (InputIterator itr = begin; end != itr; ++itr) {
+      insert(*itr);
+    }
+  }
+
+  /**
+   * check if a u32 is contained by set
+   *
+   * NOTE: the internal hash is weak, so consecutive inputs do not reach
+   * the desired fpp.  Feed well-mixed values (e.g., rjhash(x) rather
+   * than x itself).
+   *
+   * @param val integer value to query
+   * @returns true if value is (probably) in the set, false if it definitely is not
+   */
+  inline virtual bool contains(uint32_t val) const
+  {
+    if (table_size_ == 0)
+      return false;  // an empty table holds nothing
+    for (const bloom_type salt : salt_) {
+      const auto where = compute_indices(hash_ap(val, salt));
+      const cell_type mask = bit_mask[where.second];
+      if ((bit_table_[where.first >> 3] & mask) != mask)
+        return false;  // one probed bit is clear: definitely absent
+    }
+    return true;
+  }
+
+  /// same check for a key of @p length bytes starting at @p key_begin
+  inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
+  {
+    if (table_size_ == 0)
+      return false;
+    for (const bloom_type salt : salt_) {
+      const auto where = compute_indices(hash_ap(key_begin, length, salt));
+      const cell_type mask = bit_mask[where.second];
+      if ((bit_table_[where.first >> 3] & mask) != mask)
+        return false;
+    }
+    return true;
+  }
+
+  /// string convenience wrapper around the byte-range overload
+  inline bool contains(const std::string& key) const
+  {
+    return contains(reinterpret_cast<const unsigned char*>(key.c_str()), key.size());
+  }
+
+  /// character-data convenience wrapper around the byte-range overload
+  inline bool contains(const char* data, const std::size_t& length) const {
+    return contains(reinterpret_cast<const unsigned char*>(data), length);
+  }
+
+  /**
+   * find the first element of [begin, end) that is definitely not in the set
+   * @returns iterator to that element, or end if all are (probably) present
+   */
+  template<typename InputIterator>
+  inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
+  {
+    for (InputIterator itr = begin; end != itr; ++itr) {
+      if (!contains(*itr)) {
+        return itr;
+      }
+    }
+    return end;
+  }
+
+  /**
+   * find the first element of [begin, end) that is (probably) in the set
+   * @returns iterator to that element, or end if none of them is present
+   */
+  template<typename InputIterator>
+  inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
+  {
+    for (InputIterator itr = begin; end != itr; ++itr) {
+      if (contains(*itr)) {
+        return itr;
+      }
+    }
+    return end;
+  }
+
+  inline virtual std::size_t size() const  // table capacity in bits
+  {
+    return table_size_ * CHAR_BIT;
+  }
+
+  inline std::size_t element_count() const  // insertions performed (duplicates included)
+  {
+    return insert_count_;
+  }
+
+  inline bool is_full() const  // true once the insertions reach the target element count
+  {
+    return insert_count_ >= target_element_count_;
+  }
+
+  /*
+   * fraction of the table's bits that are set.  inconvenient units, but:
+   *    .3  = ~50% target insertions
+   *    .5  = 100% target insertions, "perfectly full"
+   *    .75 = 200% target insertions
+   *   1.0  = all bits set... infinite insertions
+   */
+  double density() const;
+
+  virtual inline double approx_unique_element_count() const {
+    // crude linear estimate (density .5 maps to exactly the target count); a
+    // better one would behave asymptotically as density() approaches 1.0.
+    return (double)target_element_count_ * 2.0 * density();
+  }
+
+  inline double effective_fpp() const
+  {
+    // Estimated from the table size, the hash (salt) count and the number of
+    // elements inserted so far -- not from the predicted element count the
+    // filter was sized for.
+    const double hashes = 1.0 * salt_.size();
+    const double inserted = insert_count_;
+    const double bits = size();
+    const double fill = 1.0 - std::exp(-1.0 * hashes * inserted / bits);
+    return std::pow(fill, hashes);
+  }
+
+  inline const cell_type* table() const  // read-only pointer to the first byte of the bit table
+  {
+    return bit_table_.data();
+  }
+
+protected:
+
+  virtual std::pair<size_t /* bit_index */,
+		    size_t /* bit */>
+  compute_indices(const bloom_type& hash) const
+  {
+    const size_t total_bits = table_size_ << 3;  // table_size_ is in bytes
+    const size_t bit_index = hash % total_bits;
+    return {bit_index, bit_index & 7};  // low three bits: position inside the byte
+  }
+
+  void generate_unique_salt()
+  {
+    /*
+      Appends salt_count_ salts to salt_.  A distinct hash function need
+      not be implementation-wise distinct: "seeding" the common hash_ap()
+      with a different salt per probe seems to be adequate.  Salts are
+      derived from the predefined table, salt_count_ and random_seed_.
+    */
+    const unsigned int predef_salt_count = 128;
+    static const bloom_type predef_salt[predef_salt_count] = {
+      0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
+      0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
+      0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
+      0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
+      0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
+      0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
+      0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
+      0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
+      0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
+      0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
+      0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
+      0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
+      0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
+      0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
+      0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
+      0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
+      0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
+      0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
+      0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
+      0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
+      0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
+      0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
+      0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
+      0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
+      0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
+      0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
+      0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
+      0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
+      0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
+      0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
+      0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
+      0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
+    };
+
+    if (salt_count_ <= predef_salt_count)  // the predefined table is enough
+    {
+      std::copy(predef_salt,
+		predef_salt + salt_count_,
+		std::back_inserter(salt_));
+       for (unsigned int i = 0; i < salt_.size(); ++i)
+       {
+        /*
+          Note:
+          Mix in the user defined random seed so that filters built
+          with different seeds use different salts.  The update is in
+          place, so a wrapped-around index reads an already mixed salt.
+        */
+        salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
+       }
+    }
+    else  // more salts requested than predefined: top up with rand() products
+    {
+      std::copy(predef_salt,predef_salt + predef_salt_count,
+		std::back_inserter(salt_));
+      srand(static_cast<unsigned int>(random_seed_));  // reseeds the process-wide rand() sequence
+      while (salt_.size() < salt_count_)
+      {
+        bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
+        if (0 == current_salt)  // zero is never used as a salt
+	  continue;
+        if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))  // skip duplicates
+        {
+          salt_.push_back(current_salt);
+        }
+      }
+    }
+  }
+
+  static std::pair<std::size_t /* salt_count */,
+		   std::size_t /* table_size */>
+  find_optimal_parameters(std::size_t target_insert_count,
+			  double target_fpp)
+  {
+    // Brute-force search over hash counts k = 1..999 for the one that needs
+    // the fewest table bits m to hold target_insert_count (n) elements at
+    // the requested false positive probability:
+    //
+    //   m = -k * n / ln(1 - fpp^(1/k))
+    //
+    // The winning m is then rounded up to whole bytes; the returned
+    // table_size is in bytes.
+
+    double best_bits = std::numeric_limits<double>::infinity();
+    double best_hashes = 0.0;
+    for (double k = 1.0; k < 1000.0; k += 1.0) {
+      const double numerator = (- k * target_insert_count);
+      const double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
+      const double bits = numerator / denominator;
+
+      if (bits < best_bits) {
+        // strict '<' keeps the smallest k among equally small tables
+        best_bits = bits;
+        best_hashes = k;
+      }
+    }
+
+    const size_t salt_count = static_cast<std::size_t>(best_hashes);
+    size_t total_bits = static_cast<std::size_t>(best_bits);
+    const size_t partial = total_bits & 7;
+    if (partial != 0) {
+      total_bits += CHAR_BIT - partial;  // pad to a byte boundary
+    }
+    const size_t table_size = total_bits >> 3;
+    return {salt_count, table_size};
+  }
+
+  inline bloom_type hash_ap(uint32_t val, bloom_type hash) const  // mixes val into 'hash' (callers pass a salt), most significant byte first
+  {
+    hash ^=    (hash <<  7) ^  ((val & 0xff000000) >> 24) * (hash >> 3);
+    hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
+    hash ^=    (hash <<  7) ^  ((val & 0xff00) >> 8) * (hash >> 3);
+    hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
+    return hash;
+  }
+
+  inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
+  {
+    const unsigned char* itr = begin;  // mixes remaining_length bytes into 'hash' (callers pass a salt)
+
+    while (remaining_length >= 4)  // whole 4-byte rounds: the two mixing steps alternate
+    {
+      hash ^=    (hash <<  7) ^  (*itr++) * (hash >> 3);
+      hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+      hash ^=    (hash <<  7) ^  (*itr++) * (hash >> 3);
+      hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+      remaining_length -= 4;
+    }
+
+    while (remaining_length >= 2)  // then at most one 2-byte round (fewer than 4 bytes are left)
+    {
+      hash ^=    (hash <<  7) ^  (*itr++) * (hash >> 3);
+      hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+      remaining_length -= 2;
+    }
+
+    if (remaining_length)  // a final odd byte gets the multiply step only
+    {
+      hash ^= (hash <<  7) ^ (*itr) * (hash >> 3);
+    }
+
+    return hash;
+  }
+
+public:
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(bloom_filter)
+
+
+class compressible_bloom_filter : public bloom_filter
+{
+public:
+
+  compressible_bloom_filter() : bloom_filter() {}  // empty filter; size_list stays empty
+
+  compressible_bloom_filter(const std::size_t& predicted_element_count,
+			    const double& false_positive_probability,
+			    const std::size_t& random_seed)
+    : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
+  {
+    size_list.push_back(table_size_);  // first entry: the uncompressed table size
+  }
+
+  compressible_bloom_filter(const std::size_t& salt_count,
+			    std::size_t table_size,
+			    const std::size_t& random_seed,
+			    std::size_t target_count)
+    : bloom_filter(salt_count, table_size, random_seed, target_count)
+  {
+    size_list.push_back(table_size_);  // first entry: the uncompressed table size
+  }
+
+  inline std::size_t size() const override
+  {  // bits in the current table; a default-constructed filter has no recorded sizes, so back() must not be called
+    return size_list.empty() ? bloom_filter::size() : size_list.back() * CHAR_BIT;
+  }
+
+  inline bool compress(const double& target_ratio)
+  {
+    // Shrink the table to about target_ratio of its current size by
+    // OR-folding the tail of the table onto its head.  Nothing changes and
+    // false is returned when there is no table, when the ratio is <= 0 or
+    // >= 1, or when the new size would be zero or not smaller than before.
+
+    if (bit_table_.empty()) {
+      return false;
+    }
+    if ((target_ratio <= 0.0) || (target_ratio >= 1.0)) {
+      return false;
+    }
+
+    const std::size_t old_size = size_list.back();
+    const std::size_t new_size = static_cast<std::size_t>(size_list.back() * target_ratio);
+    if ((new_size == 0) || (new_size >= old_size)) {
+      return false;
+    }
+
+    // the first new_size bytes are kept as they are ...
+    table_type folded(new_size);
+    std::copy(bit_table_.begin(), bit_table_.begin() + new_size, folded.begin());
+    // ... and byte i of the tail is OR-ed into byte (i % new_size), which is
+    // where compute_indices() looks once new_size has joined size_list
+    for (std::size_t i = new_size; i < old_size; ++i) {
+      folded[i % new_size] |= bit_table_[i];
+    }
+
+    // install the smaller table and remember its size
+    std::swap(bit_table_, folded);
+    size_list.push_back(new_size);
+    table_size_ = new_size;
+
+    return true;
+  }
+
+  inline double approx_unique_element_count() const override {
+    // rough estimate: no asymptote as density() nears 1.0, and the compress()
+    // correction under-estimates.  no recorded sizes (default ctor) => empty.
+    if (size_list.empty())
+      return 0.0;
+    return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
+  }
+
+private:
+
+  std::pair<size_t /* bit_index */,
+	    size_t /* bit */>
+  compute_indices(const bloom_type& hash) const final
+  {
+    // replay the reduction for every table size the filter has had, oldest
+    // first, so bits set before a compress() are still found afterwards
+    size_t bit_index = hash;
+    for (const std::size_t table_bytes : size_list)
+      bit_index = bit_index % (table_bytes << 3);
+    return {bit_index, bit_index & 7};
+  }
+
+  std::vector<std::size_t> size_list;
+public:
+  void encode(ceph::bufferlist& bl) const;
+  void decode(ceph::bufferlist::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(compressible_bloom_filter)
+
+#endif
+
+
+/*
+  Note 1:
+  If it can be guaranteed that CHAR_BIT will be of the form 2^n then
+  the following optimization can be used:
+
+  bit_table_[bit_index >> n] |= bit_mask[bit_index & (CHAR_BIT - 1)];
+
+  Note 2:
+  For performance reasons where possible when allocating memory it should
+  be aligned (aligned_alloc) according to the architecture being used.
+*/
diff --git a/src/common/bounded_key_counter.h b/src/common/bounded_key_counter.h
new file mode 100644
index 000000000..ee7fa304a
--- /dev/null
+++ b/src/common/bounded_key_counter.h
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * Author: Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef BOUNDED_KEY_COUNTER_H
+#define BOUNDED_KEY_COUNTER_H
+
+#include <algorithm>
+#include <map>
+#include <tuple>
+#include <vector>
+
+#include "include/ceph_assert.h"
+
+/**
+ * BoundedKeyCounter
+ *
+ * A data structure that counts the number of times a given key is inserted,
+ * and can return the keys with the highest counters. The number of unique keys
+ * is bounded by the given constructor argument, meaning that new keys will be
+ * rejected if they would exceed this bound.
+ *
+ * It is optimized for use where insertion is frequent, but sorted listings are
+ * both infrequent and tend to request a small subset of the available keys.
+ */
+template <typename Key, typename Count>
+class BoundedKeyCounter {
+  /// map type to associate keys with their counter values
+  using map_type = std::map<Key, Count>;
+  using value_type = typename map_type::value_type;
+
+  /// view type used for sorting key-value pairs by their counter value
+  using view_type = std::vector<const value_type*>;
+
+  /// maximum number of counters to store at once
+  const size_t bound;
+
+  /// map of counters, with a maximum size given by 'bound'
+  map_type counters;
+
+  /// storage for sorted key-value pairs
+  view_type sorted;
+
+  /// remembers how much of the range is actually sorted
+  typename view_type::iterator sorted_position;
+
+  /// invalidate view of sorted entries; get_highest() rebuilds it on demand
+  void invalidate_sorted()
+  {
+    sorted.clear();  // invalidates iterators, so reset the position afterwards
+    sorted_position = sorted.begin();
+  }
+
+  /// comparison on the counter only (keys are ignored); sorts in descending order
+  static bool value_greater(const value_type *lhs, const value_type *rhs)
+  {
+    return lhs->second > rhs->second;
+  }
+
+  /// map iterator whose dereference yields a pointer to the entry instead of the entry itself
+  struct const_pointer_iterator : public map_type::const_iterator {
+    const_pointer_iterator(typename map_type::const_iterator i)
+      : map_type::const_iterator(i) {}
+
+    using value_type = typename map_type::const_iterator::value_type*;  // shadows the base typedefs so that
+    using reference = const typename map_type::const_iterator::value_type*;  // the element type is a pointer
+
+    reference operator*() const {
+      return &map_type::const_iterator::operator*();  // address of the entry the base iterator refers to
+    }
+  };
+
+ protected:
+  /// return the length of the sorted prefix of 'sorted'. protected for unit tests
+  size_t get_num_sorted() const
+  {
+    const typename view_type::const_iterator first = sorted.begin(), last = sorted_position;
+    return std::distance(first, last);
+  }
+
+ public:
+  BoundedKeyCounter(size_t bound)
+    : bound(bound)
+  {
+    sorted.reserve(bound);  // room for one pointer per key up front
+    sorted_position = sorted.begin();  // nothing is sorted yet
+  }
+
+  /// return the number of keys stored
+  size_t size() const noexcept { return counters.size(); }
+
+  /// return the maximum number of keys
+  size_t capacity() const noexcept { return bound; }
+
+  /// increment a counter for the given key and return its value. if the key was
+  /// not present, insert it. if the map is full, return 0
+  Count insert(const Key& key, Count n = 1)
+  {
+    typename map_type::iterator i;
+    if (counters.size() < bound) {
+      // insert new entries at count=0
+      bool inserted;
+      std::tie(i, inserted) = counters.emplace(key, 0);
+      // only extend a complete view: pushing onto one emptied by erase()
+      // would keep get_highest() from rebuilding it and hide the other keys
+      if (inserted && sorted.size() + 1 == counters.size()) {
+        sorted.push_back(&*i);
+      }
+    } else {
+      // when full, refuse to insert new entries
+      i = counters.find(key);
+      if (i == counters.end()) {
+        return 0;
+      }
+    }
+
+    i->second += n; // add to the counter
+
+    // update sorted position if necessary. use a binary search for the last
+    // element in the sorted range that's greater than this counter
+    sorted_position = std::lower_bound(sorted.begin(), sorted_position,
+                                       &*i, &value_greater);
+    return i->second;
+  }
+
+  /// drop the given key and its counter; unknown keys are ignored
+  void erase(const Key& key)
+  {
+    const auto found = counters.find(key);
+    if (found != counters.end()) {
+      // locating the key's pointer in 'sorted' would take a linear search,
+      // so the whole view is thrown away instead, before the entry itself
+      // is removed from the map
+      invalidate_sorted();
+      counters.erase(found);
+    }
+  }
+
+  /// visit the (at most) 'count' entries with the highest counters, largest
+  /// first, invoking the callback as cb(Key, Count) for each of them
+  template <typename Callback>
+  void get_highest(size_t count, Callback&& cb)
+  {
+    if (sorted.empty()) {
+      // (re)build the view: one pointer per key-value pair, in map order
+      const const_pointer_iterator first{counters.cbegin()};
+      const const_pointer_iterator last{counters.cend()};
+      sorted.assign(first, last);
+      // nothing in the fresh view has been sorted yet
+      ceph_assert(sorted_position == sorted.begin());
+    }
+
+    if (get_num_sorted() < count) {
+      // grow the sorted prefix to the requested length (capped) in descending order
+      const size_t wanted = std::min(count, sorted.size());
+      sorted_position = sorted.begin() + wanted;
+      std::partial_sort(sorted.begin(), sorted_position, sorted.end(),
+                        &value_greater);
+    }
+
+    // hand out entries from the front until 'count' of them were delivered
+    for (const auto& entry : sorted) {
+      if (count == 0) {
+        break;
+      }
+      --count;
+      cb(entry->first, entry->second);
+    }
+  }
+
+  /// remove all keys and counters and invalidate the sorted range
+  void clear()
+  {
+    invalidate_sorted();  // the view holds pointers into 'counters'; drop it first
+    counters.clear();
+  }
+};
+
+#endif // BOUNDED_KEY_COUNTER_H
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
new file mode 100644
index 000000000..b363b9957
--- /dev/null
+++ b/src/common/buffer.cc
@@ -0,0 +1,2358 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <atomic>
+#include <cstring>
+#include <errno.h>
+#include <limits.h>
+
+#include <sys/uio.h>
+
+#include "include/ceph_assert.h"
+#include "include/types.h"
+#include "include/buffer_raw.h"
+#include "include/compat.h"
+#include "include/mempool.h"
+#include "armor.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/error_code.h"
+#include "common/safe_io.h"
+#include "common/strtol.h"
+#include "common/likely.h"
+#include "common/valgrind.h"
+#include "common/deleter.h"
+#include "common/error_code.h"
+#include "include/intarith.h"
+#include "include/spinlock.h"
+#include "include/scope_guard.h"
+
+using std::cerr;
+using std::make_pair;
+using std::pair;
+using std::string;
+
+using namespace ceph;
+
+// base allocation unit in bytes (also used as the alignment for sub-page
+// requests in create_small_page_aligned())
+#define CEPH_BUFFER_ALLOC_UNIT  4096u
+// bytes left in one allocation unit after subtracting the size of a
+// raw_combined header
+#define CEPH_BUFFER_APPEND_SIZE (CEPH_BUFFER_ALLOC_UNIT - sizeof(raw_combined))
+
+// 256K is the maximum "small" object size in tcmalloc above which allocations come from
+// the central heap.  For now let's keep this below that threshold.
+#define CEPH_BUFFER_ALLOC_UNIT_MAX std::size_t { 256*1024 }
+
+// bdout/bendl bracket a debug trace statement: "bdout << ... << bendl;".
+// With BUFFER_DEBUG defined the statement runs inside a scope that holds
+// debug_lock and writes to std::cout; otherwise it sits under "if (0)", so it
+// is still compiled but never executed.
+#ifdef BUFFER_DEBUG
+static ceph::spinlock debug_lock;
+# define bdout { std::lock_guard<ceph::spinlock> lg(debug_lock); std::cout
+# define bendl std::endl; }
+#else
+# define bdout if (0) { std::cout
+# define bendl std::endl; }
+#endif
+
+  // crc statistics counters exposed through the accessors below.
+  // NOTE(review): the code that increments them is outside this excerpt; the
+  // names suggest cache hits, hits that needed adjusting, and misses.
+  static ceph::atomic<unsigned> buffer_cached_crc { 0 };
+  static ceph::atomic<unsigned> buffer_cached_crc_adjusted { 0 };
+  static ceph::atomic<unsigned> buffer_missed_crc { 0 };
+
+  // tracking flag; initialised from the CEPH_BUFFER_TRACK environment
+  // variable and overridable at run time via track_cached_crc()
+  static bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK");
+
+  // set the crc tracking flag
+  void buffer::track_cached_crc(bool b) {
+    buffer_track_crc = b;
+  }
+  // current value of the buffer_cached_crc counter
+  int buffer::get_cached_crc() {
+    return buffer_cached_crc;
+  }
+  // current value of the buffer_cached_crc_adjusted counter
+  int buffer::get_cached_crc_adjusted() {
+    return buffer_cached_crc_adjusted;
+  }
+
+  // current value of the buffer_missed_crc counter
+  int buffer::get_missed_crc() {
+    return buffer_missed_crc;
+  }
+
+  /*
+   * raw_combined is always placed within a single allocation along
+   * with the data buffer.  the data goes at the beginning, and
+   * raw_combined at the end.
+   */
+  class buffer::raw_combined : public buffer::raw {
+  public:
+    // dataptr is the start of the shared allocation; see create()
+    raw_combined(char *dataptr, unsigned l, int mempool)
+      : raw(dataptr, l, mempool) {
+    }
+
+    /*
+     * Allocate one block holding len data bytes followed by the
+     * raw_combined object, and construct the object in place.
+     * Throws bad_alloc when the allocation fails.
+     */
+    static ceph::unique_leakable_ptr<buffer::raw>
+    create(unsigned len,
+	   unsigned align,
+	   int mempool = mempool::mempool_buffer_anon)
+    {
+      // posix_memalign() requires a multiple of sizeof(void *)
+      align = std::max<unsigned>(align, sizeof(void *));
+      // both parts are padded to alignof(raw_combined) so the object placed
+      // at ptr + datalen is suitably aligned
+      size_t rawlen = round_up_to(sizeof(buffer::raw_combined),
+				  alignof(buffer::raw_combined));
+      size_t datalen = round_up_to(len, alignof(buffer::raw_combined));
+
+#ifdef DARWIN
+      // note: valloc() takes no alignment argument, so 'align' is unused here
+      char *ptr = (char *) valloc(rawlen + datalen);
+#else
+      char *ptr = 0;
+      int r = ::posix_memalign((void**)(void*)&ptr, align, rawlen + datalen);
+      if (r)
+	throw bad_alloc();
+#endif /* DARWIN */
+      if (!ptr)
+	throw bad_alloc();
+
+      // actual data first, since it has presumably larger alignment restriction
+      // then put the raw_combined at the end
+      return ceph::unique_leakable_ptr<buffer::raw>(
+	new (ptr + datalen) raw_combined(ptr, len, mempool));
+    }
+
+    // The object lives inside the data allocation, so releasing the data
+    // pointer (the start of that allocation) frees both at once.
+    static void operator delete(void *ptr) {
+      raw_combined *raw = (raw_combined *)ptr;
+      aligned_free((void *)raw->data);
+    }
+  };
+
+  /*
+   * buffer::raw whose storage comes from malloc() -- either allocated here
+   * or handed over by the caller -- and is returned with free().
+   */
+  class buffer::raw_malloc : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // allocate l bytes; a zero-length raw holds no storage at all
+    explicit raw_malloc(unsigned l) : raw(l) {
+      data = nullptr;
+      if (len != 0) {
+        data = static_cast<char *>(malloc(len));
+        if (data == nullptr) {
+          throw bad_alloc();
+        }
+      }
+      bdout << "raw_malloc " << this << " alloc " << (void *)data << " " << l << bendl;
+    }
+
+    // adopt the caller's buffer b of l bytes; the destructor free()s it
+    raw_malloc(unsigned l, char *b) : raw(b, l) {
+      bdout << "raw_malloc " << this << " alloc " << (void *)data << " " << l << bendl;
+    }
+
+    ~raw_malloc() override {
+      free(data);
+      // only the (now stale) pointer value is printed, never dereferenced
+      bdout << "raw_malloc " << this << " free " << (void *)data << " " << bendl;
+    }
+  };
+
+#ifndef __CYGWIN__
+  /*
+   * buffer::raw backed by its own aligned allocation, released with
+   * aligned_free().
+   */
+  class buffer::raw_posix_aligned : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // allocate l bytes aligned to 'align'; throws bad_alloc on failure
+    raw_posix_aligned(unsigned l, unsigned align) : raw(l) {
+      // posix_memalign() requires a multiple of sizeof(void *)
+      align = std::max<unsigned>(align, sizeof(void *));
+#ifdef DARWIN
+      // note: valloc() takes no alignment argument, so 'align' is unused here
+      data = (char *) valloc(len);
+#else
+      int r = ::posix_memalign((void**)(void*)&data, align, len);
+      if (r)
+	throw bad_alloc();
+#endif /* DARWIN */
+      if (!data)
+	throw bad_alloc();
+      bdout << "raw_posix_aligned " << this << " alloc " << (void *)data
+	    << " l=" << l << ", align=" << align << bendl;
+    }
+    ~raw_posix_aligned() override {
+      aligned_free(data);
+      bdout << "raw_posix_aligned " << this << " free " << (void *)data << bendl;
+    }
+  };
+#endif
+
+#ifdef __CYGWIN__
+  /*
+   * Cygwin-only aligned raw: over-allocates with new[] and points 'data' at
+   * the first suitably aligned byte inside that allocation.
+   */
+  class buffer::raw_hack_aligned : public buffer::raw {
+    char *realdata;  // start of the new[] allocation; what gets delete[]d
+  public:
+    // the mask arithmetic below assumes align is a power of two
+    raw_hack_aligned(unsigned l, unsigned align) : raw(l) {
+      // align-1 spare bytes guarantee an aligned address exists in the block
+      realdata = new char[len+align-1];
+      unsigned off = ((uintptr_t)realdata) & (align-1);
+      if (off)
+	data = realdata + align - off;
+      else
+	data = realdata;
+      //cout << "hack aligned " << (unsigned)data
+      //<< " in raw " << (unsigned)realdata
+      //<< " off " << off << std::endl;
+      ceph_assert(((uintptr_t)data & (align-1)) == 0);
+    }
+    ~raw_hack_aligned() {
+      delete[] realdata;
+    }
+  };
+#endif
+
+  /*
+   * primitive buffer types
+   */
+
+  // raw wrapping a caller-supplied char buffer; the destructor below only
+  // logs and does not free the memory itself
+  class buffer::raw_claimed_char : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // wrap the l bytes starting at b
+    explicit raw_claimed_char(unsigned l, char *b) : raw(b, l) {
+      bdout << "raw_claimed_char " << this << " alloc " << (void *)data
+	    << " " << l << bendl;
+    }
+    ~raw_claimed_char() override {
+      bdout << "raw_claimed_char " << this << " free " << (void *)data
+	    << bendl;
+    }
+  };
+
+  // raw referring to memory it does not release: the destructor is empty
+  class buffer::raw_static : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // constness of d is cast away to fit the base class constructor
+    raw_static(const char *d, unsigned l) : raw((char*)d, l) { }
+    ~raw_static() override {}
+  };
+
+  // raw wrapping external memory together with a 'deleter' object.
+  // NOTE(review): presumably the deleter releases the memory when this raw
+  // (and with it 'del') is destroyed -- confirm against common/deleter.h.
+  class buffer::raw_claim_buffer : public buffer::raw {
+    deleter del;
+   public:
+    // wrap the l bytes at b and take over the deleter d
+    raw_claim_buffer(const char *b, unsigned l, deleter d)
+        : raw((char*)b, l), del(std::move(d)) { }
+    ~raw_claim_buffer() override {}
+  };
+
+  // Create a new raw of len bytes and fill it with a copy of the len bytes
+  // starting at c.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::copy(const char *c, unsigned len) {
+    auto fresh = buffer::create_aligned(len, sizeof(size_t));
+    char *const dst = fresh->get_data();
+    memcpy(dst, c, len);
+    return fresh;
+  }
+
+  // Create a raw of len bytes aligned to sizeof(size_t); contents are not
+  // initialised here.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create(unsigned len) {
+    const unsigned align = sizeof(size_t);
+    return buffer::create_aligned(len, align);
+  }
+
+  // Create a raw of len bytes aligned to sizeof(size_t), every byte set to c.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create(unsigned len, char c) {
+    auto filled = buffer::create_aligned(len, sizeof(size_t));
+    char *const bytes = filled->get_data();
+    memset(bytes, c, len);
+    return filled;
+  }
+
+  // Same as create(len), but passes the given mempool on to the allocator.
+  ceph::unique_leakable_ptr<buffer::raw>
+  buffer::create_in_mempool(unsigned len, int mempool) {
+    const unsigned align = sizeof(size_t);
+    return buffer::create_aligned_in_mempool(len, align, mempool);
+  }
+  // wrap an existing char buffer in a raw_claimed_char
+  ceph::unique_leakable_ptr<buffer::raw>
+  buffer::claim_char(unsigned len, char *buf) {
+    return ceph::unique_leakable_ptr<buffer::raw>(
+      new raw_claimed_char(len, buf));
+  }
+  // allocate len bytes with malloc() (see raw_malloc)
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_malloc(unsigned len) {
+    return ceph::unique_leakable_ptr<buffer::raw>(new raw_malloc(len));
+  }
+  // adopt an existing buffer; raw_malloc will free() it on destruction
+  ceph::unique_leakable_ptr<buffer::raw>
+  buffer::claim_malloc(unsigned len, char *buf) {
+    return ceph::unique_leakable_ptr<buffer::raw>(new raw_malloc(len, buf));
+  }
+  // wrap memory that the raw never releases (see raw_static)
+  ceph::unique_leakable_ptr<buffer::raw>
+  buffer::create_static(unsigned len, char *buf) {
+    return ceph::unique_leakable_ptr<buffer::raw>(new raw_static(buf, len));
+  }
+  // wrap external memory together with the deleter that accompanies it
+  ceph::unique_leakable_ptr<buffer::raw>
+  buffer::claim_buffer(unsigned len, char *buf, deleter del) {
+    return ceph::unique_leakable_ptr<buffer::raw>(
+      new raw_claim_buffer(buf, len, std::move(del)));
+  }
+
+  // Create a raw of len bytes aligned to 'align', choosing between a
+  // separately allocated raw and a raw_combined (header and data in one
+  // allocation) according to the alignment and size.
+  //
+  // NOTE(review): 'mempool' is only forwarded on the raw_combined path; the
+  // separate-raw path below does not use it -- confirm this is intended.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_aligned_in_mempool(
+    unsigned len, unsigned align, int mempool)
+  {
+    // If alignment is a page multiple, use a separate buffer::raw to
+    // avoid fragmenting the heap.
+    //
+    // Somewhat unexpectedly, I see consistently better performance
+    // from raw_combined than from raw even when the allocation size is
+    // a page multiple (but alignment is not).
+    //
+    // I also see better performance from a separate buffer::raw once the
+    // size passes 8KB.
+    if ((align & ~CEPH_PAGE_MASK) == 0 ||
+	len >= CEPH_PAGE_SIZE * 2) {
+#ifndef __CYGWIN__
+      return ceph::unique_leakable_ptr<buffer::raw>(new raw_posix_aligned(len, align));
+#else
+      return ceph::unique_leakable_ptr<buffer::raw>(new raw_hack_aligned(len, align));
+#endif
+    }
+    return raw_combined::create(len, align, mempool);
+  }
+  // Create an aligned raw accounted to the mempool_buffer_anon pool.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_aligned(
+    unsigned len, unsigned align) {
+    const int pool = mempool::mempool_buffer_anon;
+    return create_aligned_in_mempool(len, align, pool);
+  }
+
+  // Create a raw of len bytes aligned to CEPH_PAGE_SIZE.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_page_aligned(unsigned len) {
+    const unsigned align = CEPH_PAGE_SIZE;
+    return create_aligned(len, align);
+  }
+
+  // Like create_page_aligned(), except that requests smaller than a page are
+  // aligned to CEPH_BUFFER_ALLOC_UNIT instead.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_small_page_aligned(unsigned len) {
+    unsigned align = CEPH_PAGE_SIZE;
+    if (len < CEPH_PAGE_SIZE) {
+      align = CEPH_BUFFER_ALLOC_UNIT;
+    }
+    return create_aligned(len, align);
+  }
+
+  // Take over the raw released from r and reference all of it.
+  // (_len is initialised from _raw, so it relies on _raw being set first.)
+  buffer::ptr::ptr(ceph::unique_leakable_ptr<raw> r)
+    : _raw(r.release()),
+      _off(0),
+      _len(_raw->get_len())
+  {
+    // start the reference count at one for this ptr
+    _raw->nref.store(1, std::memory_order_release);
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Allocate a new raw of l bytes via buffer::create() and reference it.
+  buffer::ptr::ptr(unsigned l) : _off(0), _len(l)
+  {
+    _raw = buffer::create(l).release();
+    _raw->nref.store(1, std::memory_order_release);
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Allocate a new raw holding a copy of the l bytes at d and reference it.
+  buffer::ptr::ptr(const char *d, unsigned l) : _off(0), _len(l)
+  {
+    _raw = buffer::copy(d, l).release();
+    _raw->nref.store(1, std::memory_order_release);
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Copy constructor: share p's raw (if any) and bump its reference count.
+  buffer::ptr::ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len)
+  {
+    if (_raw) {
+      _raw->nref++;
+      bdout << "ptr " << this << " get " << _raw << bendl;
+    }
+  }
+  // Move constructor: steal p's reference, leaving p empty. The reference
+  // count is untouched because the reference merely changes hands.
+  buffer::ptr::ptr(ptr&& p) noexcept : _raw(p._raw), _off(p._off), _len(p._len)
+  {
+    p._raw = nullptr;
+    p._off = p._len = 0;
+  }
+  // Sub-range constructor: reference l bytes of p's raw starting o bytes into
+  // p. The range must lie within p and p must have a raw.
+  buffer::ptr::ptr(const ptr& p, unsigned o, unsigned l)
+    : _raw(p._raw), _off(p._off + o), _len(l)
+  {
+    ceph_assert(o+l <= p._len);
+    ceph_assert(_raw);
+    _raw->nref++;
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Reference the raw released from r while reusing p's offset and length.
+  buffer::ptr::ptr(const ptr& p, ceph::unique_leakable_ptr<raw> r)
+    : _raw(r.release()),
+      _off(p._off),
+      _len(p._len)
+  {
+    // start the reference count at one for this ptr
+    _raw->nref.store(1, std::memory_order_release);
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Copy assignment: drop the current reference and share p's raw instead.
+  // Assigning from an empty ptr leaves this ptr empty.
+  buffer::ptr& buffer::ptr::operator= (const ptr& p)
+  {
+    // take the new reference before releasing the old one, so that
+    // self-assignment cannot drop the raw's last reference
+    if (p._raw) {
+      p._raw->nref++;
+      // log the raw being acquired (p._raw); _raw still names the old one here
+      bdout << "ptr " << this << " get " << p._raw << bendl;
+    }
+    buffer::raw *raw = p._raw;
+    release();
+    if (raw) {
+      _raw = raw;
+      _off = p._off;
+      _len = p._len;
+    } else {
+      _off = _len = 0;
+    }
+    return *this;
+  }
+  // Move assignment: drop the current reference, then take over p's raw,
+  // offset and length, leaving p empty. The reference count of the moved raw
+  // is not touched.
+  buffer::ptr& buffer::ptr::operator= (ptr&& p) noexcept
+  {
+    release();
+    if (p._raw == nullptr) {
+      // nothing to take over (this also covers self-move after release())
+      _off = _len = 0;
+      return *this;
+    }
+    _raw = p._raw;
+    _off = p._off;
+    _len = p._len;
+    p._raw = nullptr;
+    p._off = p._len = 0;
+    return *this;
+  }
+
+  // Exchange raw pointer, offset and length with other; reference counts are
+  // unaffected.
+  void buffer::ptr::swap(ptr& other) noexcept
+  {
+    std::swap(_raw, other._raw);
+    std::swap(_off, other._off);
+    std::swap(_len, other._len);
+  }
+
+  // Drop this ptr's reference to its raw (if any), deleting the raw when this
+  // was the last reference. _raw is nullptr afterwards.
+  void buffer::ptr::release()
+  {
+    // BE CAREFUL: this is called also for hypercombined ptr_node. After
+    // freeing underlying raw, `*this` can become inaccessible as well!
+    //
+    // cache the pointer to avoid unnecessary reloads and repeated
+    // checks.
+    if (auto* const cached_raw = std::exchange(_raw, nullptr);
+	cached_raw) {
+      bdout << "ptr " << this << " release " << cached_raw << bendl;
+      // optimize the common case where a particular `buffer::raw` has
+      // only a single reference. Together with initializing `nref` of a
+      // freshly fabricated one with `1` through the std::atomic's ctor
+      // (which doesn't impose a memory barrier on the strongly-ordered
+      // x86), this allows avoiding all atomic operations in such a case.
+      const bool last_one = \
+        (1 == cached_raw->nref.load(std::memory_order_acquire));
+      // the decrement is only performed when the fast path did not apply
+      if (likely(last_one) || --cached_raw->nref == 0) {
+	bdout << "deleting raw " << static_cast<void*>(cached_raw)
+	      << " len " << cached_raw->get_len() << bendl;
+	ANNOTATE_HAPPENS_AFTER(&cached_raw->nref);
+	ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(&cached_raw->nref);
+	delete cached_raw;  // dealloc old (if any)
+      } else {
+	ANNOTATE_HAPPENS_BEFORE(&cached_raw->nref);
+      }
+    }
+  }
+
+  // Mempool of the underlying raw, or mempool_buffer_anon for an empty ptr.
+  int buffer::ptr::get_mempool() const {
+    if (!_raw) {
+      return mempool::mempool_buffer_anon;
+    }
+    return _raw->mempool;
+  }
+
+  // Forward to raw::reassign_to_mempool(); no-op for an empty ptr.
+  void buffer::ptr::reassign_to_mempool(int pool) {
+    if (!_raw) {
+      return;
+    }
+    _raw->reassign_to_mempool(pool);
+  }
+
+  // Forward to raw::try_assign_to_mempool(); no-op for an empty ptr.
+  void buffer::ptr::try_assign_to_mempool(int pool) {
+    if (!_raw) {
+      return;
+    }
+    _raw->try_assign_to_mempool(pool);
+  }
+
+  // Pointer to the first byte this ptr refers to (raw data + _off).
+  // Asserts that the ptr has a raw.
+  const char *buffer::ptr::c_str() const {
+    ceph_assert(_raw);
+    return _raw->get_data() + _off;
+  }
+  // Mutable variant of c_str().
+  char *buffer::ptr::c_str() {
+    ceph_assert(_raw);
+    return _raw->get_data() + _off;
+  }
+  // Pointer one past the last byte this ptr refers to (c_str() + _len).
+  const char *buffer::ptr::end_c_str() const {
+    ceph_assert(_raw);
+    return _raw->get_data() + _off + _len;
+  }
+  // Mutable variant of end_c_str().
+  char *buffer::ptr::end_c_str() {
+    ceph_assert(_raw);
+    return _raw->get_data() + _off + _len;
+  }
+
+  // Number of bytes in the raw that lie beyond the end of this ptr's range;
+  // zero for an empty ptr.
+  unsigned buffer::ptr::unused_tail_length() const
+  {
+    if (!_raw) {
+      return 0;
+    }
+    return _raw->get_len() - (_off + _len);
+  }
+  // Bounds-checked access to the n-th byte of this ptr's range; asserts that
+  // the ptr has a raw and that n < _len.
+  const char& buffer::ptr::operator[](unsigned n) const
+  {
+    ceph_assert(_raw);
+    ceph_assert(n < _len);
+    return _raw->get_data()[_off + n];
+  }
+  // Mutable variant of the accessor above.
+  char& buffer::ptr::operator[](unsigned n)
+  {
+    ceph_assert(_raw);
+    ceph_assert(n < _len);
+    return _raw->get_data()[_off + n];
+  }
+
+  // Accessors for the underlying raw as a whole, ignoring this ptr's _off and
+  // _len: start of its data, its total length, and its current reference
+  // count. Each asserts that the ptr has a raw.
+  const char *buffer::ptr::raw_c_str() const { ceph_assert(_raw); return _raw->get_data(); }
+  unsigned buffer::ptr::raw_length() const { ceph_assert(_raw); return _raw->get_len(); }
+  int buffer::ptr::raw_nref() const { ceph_assert(_raw); return _raw->nref; }
+
+  // Copy l bytes starting o bytes into this ptr's range to dest.
+  // Throws end_of_buffer when [o, o+l) does not lie within the range.
+  void buffer::ptr::copy_out(unsigned o, unsigned l, char *dest) const {
+    ceph_assert(_raw);
+    // written without computing o+l, which could wrap around in unsigned
+    // arithmetic and let an out-of-range request through
+    if (o > _len || l > _len - o)
+        throw end_of_buffer();
+    char* src =  _raw->get_data() + _off + o;
+    maybe_inline_memcpy(dest, src, l, 8);
+  }
+
+  // Bytes of the underlying raw that this ptr does not reference
+  // (raw length minus _len). The ptr must have a raw.
+  unsigned buffer::ptr::wasted() const
+  {
+    // fail with a clear assertion, like raw_length() and friends, instead of
+    // dereferencing a null raw
+    ceph_assert(_raw);
+    return _raw->get_len() - _len;
+  }
+
+  // Three-way comparison of the referenced bytes: compares the common prefix
+  // with memcmp(), then orders the shorter ptr first. Returns a negative
+  // value, zero, or a positive value.
+  int buffer::ptr::cmp(const ptr& o) const
+  {
+    // keep the common length unsigned: storing it in an int would turn
+    // lengths above INT_MAX negative, which memcmp() would then see as an
+    // enormous size_t
+    const unsigned l = _len < o._len ? _len : o._len;
+    if (l) {
+      int r = memcmp(c_str(), o.c_str(), l);
+      if (r)
+	return r;
+    }
+    if (_len < o._len)
+      return -1;
+    if (_len > o._len)
+      return 1;
+    return 0;
+  }
+
+  // True when mem_is_zero() reports that all _len referenced bytes are zero.
+  // c_str() asserts that the ptr has a raw.
+  bool buffer::ptr::is_zero() const
+  {
+    return mem_is_zero(c_str(), _len);
+  }
+
+  // Store c in the first unused byte after this ptr's range and grow the
+  // range by one. Requires at least one byte of unused tail in the raw.
+  // Returns the new end position relative to the raw (_off + _len).
+  unsigned buffer::ptr::append(char c)
+  {
+    ceph_assert(_raw);
+    ceph_assert(1 <= unused_tail_length());
+    _raw->get_data()[_off + _len] = c;
+    ++_len;
+    return _off + _len;
+  }
+
+  // Copy l bytes from p into the unused tail directly after this ptr's range
+  // and grow the range accordingly. Requires l bytes of unused tail.
+  // Returns the new end position relative to the raw (_off + _len).
+  unsigned buffer::ptr::append(const char *p, unsigned l)
+  {
+    ceph_assert(_raw);
+    ceph_assert(l <= unused_tail_length());
+    char* const tail = _raw->get_data() + _off + _len;
+    maybe_inline_memcpy(tail, p, l, 32);
+    _len += l;
+    return _off + _len;
+  }
+
+  // Zero-fill l bytes of unused tail directly after this ptr's range and grow
+  // the range accordingly. Requires l bytes of unused tail.
+  // Returns the new end position relative to the raw (_off + _len).
+  unsigned buffer::ptr::append_zeros(unsigned l)
+  {
+    ceph_assert(_raw);
+    ceph_assert(l <= unused_tail_length());
+    char* const tail = _raw->get_data() + _off + _len;
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(tail, 0, l);
+    _len += l;
+    return _off + _len;
+  }
+
+  // Overwrite l bytes starting o bytes into this ptr's range with the bytes
+  // at src. [o, o+l) must lie within the range. When crc_reset is set the
+  // raw's cached crc values are invalidated first.
+  void buffer::ptr::copy_in(unsigned o, unsigned l, const char *src, bool crc_reset)
+  {
+    ceph_assert(_raw);
+    ceph_assert(o <= _len);
+    // compared against the remaining space rather than via o+l, which could
+    // wrap around in unsigned arithmetic
+    ceph_assert(l <= _len - o);
+    char* dest = _raw->get_data() + _off + o;
+    if (crc_reset)
+        _raw->invalidate_crc();
+    maybe_inline_memcpy(dest, src, l, 64);
+  }
+
+  // Zero-fill the whole range this ptr refers to. When crc_reset is set the
+  // raw's cached crc values are invalidated first.
+  void buffer::ptr::zero(bool crc_reset)
+  {
+    // check up front: _raw is dereferenced below before c_str() would get
+    // the chance to assert on it
+    ceph_assert(_raw);
+    if (crc_reset)
+        _raw->invalidate_crc();
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(c_str(), 0, _len);
+  }
+
+  // Zero-fill l bytes starting o bytes into this ptr's range. [o, o+l) must
+  // lie within the range. When crc_reset is set the raw's cached crc values
+  // are invalidated first.
+  void buffer::ptr::zero(unsigned o, unsigned l, bool crc_reset)
+  {
+    // check up front: _raw is dereferenced below before c_str() would get
+    // the chance to assert on it
+    ceph_assert(_raw);
+    // written without computing o+l, which could wrap around in unsigned
+    // arithmetic
+    ceph_assert(o <= _len && l <= _len - o);
+    if (crc_reset)
+        _raw->invalidate_crc();
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(c_str()+o, 0, l);
+  }
+
+  // Advance the iterator by len bytes; throws end_of_buffer when that moves
+  // it past end_ptr. Note that pos has already been advanced when the
+  // exception is thrown.
+  template<bool B>
+  buffer::ptr::iterator_impl<B>& buffer::ptr::iterator_impl<B>::operator +=(size_t len) {
+    pos += len;
+    if (pos > end_ptr)
+      throw end_of_buffer();
+    return *this;
+  }
+
+  // explicit instantiations for both values of the bool parameter
+  template buffer::ptr::iterator_impl<false>&
+  buffer::ptr::iterator_impl<false>::operator +=(size_t len);
+  template buffer::ptr::iterator_impl<true>&
+  buffer::ptr::iterator_impl<true>::operator +=(size_t len);
+
+  // -- buffer::list::iterator --
+  /*
+  buffer::list::iterator operator=(const buffer::list::iterator& other)
+  {
+    if (this != &other) {
+      bl = other.bl;
+      ls = other.ls;
+      off = other.off;
+      p = other.p;
+      p_off = other.p_off;
+    }
+    return *this;
+    }*/
+
+  // Construct an iterator over list l positioned o bytes from its start.
+  // Starts at the first buffer and advances with operator+=, so it throws
+  // end_of_buffer when o lies beyond the end of the list.
+  template<bool is_const>
+  buffer::list::iterator_impl<is_const>::iterator_impl(bl_t *l, unsigned o)
+    : bl(l), ls(&bl->_buffers), p(ls->begin()), off(0), p_off(0)
+  {
+    *this += o;
+  }
+
+  // Construct from a mutable list iterator, copying its position.
+  template<bool is_const>
+  buffer::list::iterator_impl<is_const>::iterator_impl(const buffer::list::iterator& i)
+    : iterator_impl<is_const>(i.bl, i.off, i.p, i.p_off) {}
+
+  // Advance the iterator by o bytes, stepping over as many buffers as needed.
+  // Throws end_of_buffer when the target lies beyond the end of the list;
+  // landing exactly on the end is allowed.
+  template<bool is_const>
+  auto buffer::list::iterator_impl<is_const>::operator +=(unsigned o)
+    -> iterator_impl&
+  {
+    p_off += o;
+    // skip every buffer the new offset has moved past
+    while (p != ls->end() && p_off >= p->length()) {
+      p_off -= p->length();
+      ++p;
+    }
+    // ran off the end with bytes still left to skip
+    if (p == ls->end() && p_off != 0) {
+      throw end_of_buffer();
+    }
+    off += o;
+    return *this;
+  }
+
+  // Reposition the iterator o bytes from the start of the list by rewinding
+  // to the first buffer and advancing; throws end_of_buffer (via operator+=)
+  // when o lies beyond the end.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::seek(unsigned o)
+  {
+    p = ls->begin();
+    off = p_off = 0;
+    *this += o;
+  }
+
+  // Byte at the current position; throws end_of_buffer at the end of the list.
+  template<bool is_const>
+  char buffer::list::iterator_impl<is_const>::operator*() const
+  {
+    if (p == ls->end()) {
+      throw end_of_buffer();
+    }
+    const auto& current = *p;
+    return current[p_off];
+  }
+
+  // Advance by one byte; throws end_of_buffer when already at the end.
+  template<bool is_const>
+  buffer::list::iterator_impl<is_const>&
+  buffer::list::iterator_impl<is_const>::operator++()
+  {
+    if (p == ls->end()) {
+      throw end_of_buffer();
+    }
+    return *this += 1;
+  }
+
+  // Return a ptr sharing the current buffer's raw and covering the bytes from
+  // the current position to the end of that buffer. Throws end_of_buffer at
+  // the end of the list.
+  template<bool is_const>
+  buffer::ptr buffer::list::iterator_impl<is_const>::get_current_ptr() const
+  {
+    if (p == ls->end())
+      throw end_of_buffer();
+    return ptr(*p, p_off, p->length() - p_off);
+  }
+
+  // True when the current buffer and 'other' reference the same raw.
+  // Throws end_of_buffer at the end of the list.
+  template<bool is_const>
+  bool buffer::list::iterator_impl<is_const>::is_pointing_same_raw(
+    const ptr& other) const
+  {
+    if (p == ls->end())
+      throw end_of_buffer();
+    return p->_raw == other._raw;
+  }
+
+  // copy data out.
+  // note that the list and std::string overloads _append_ to dest, while this
+  // one writes len bytes to the memory at dest.
+  //
+  // Throws end_of_buffer when fewer than len bytes remain; the iterator is
+  // advanced past everything that was copied.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, char *dest)
+  {
+    if (p == ls->end()) {
+      seek(off);
+    }
+    while (len > 0) {
+      if (p == ls->end()) {
+        throw end_of_buffer();
+      }
+
+      // take whatever is left in this buffer, capped at the remaining request
+      const unsigned avail = p->length() - p_off;
+      const unsigned chunk = len < avail ? len : avail;
+      p->copy_out(p_off, chunk, dest);
+
+      dest += chunk;
+      len -= chunk;
+      *this += chunk;
+    }
+  }
+
+  // Copy len bytes into a newly allocated dest; identical to copy_deep().
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, ptr &dest)
+  {
+    copy_deep(len, dest);
+  }
+
+  // Replace dest with a newly allocated buffer holding a copy of the next len
+  // bytes and advance past them. dest is left untouched when len is zero.
+  // Throws end_of_buffer when the iterator is at the end or fewer than len
+  // bytes remain.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy_deep(unsigned len, ptr &dest)
+  {
+    if (!len) {
+      return;
+    }
+    if (p == ls->end())
+      throw end_of_buffer();
+    dest = create(len);
+    copy(len, dest.c_str());
+  }
+  // Make dest refer to the next len bytes and advance past them. When the
+  // bytes all lie within the current buffer, dest shares that buffer's raw;
+  // otherwise they are copied into a newly allocated buffer. dest is left
+  // untouched when len is zero. Throws end_of_buffer when the iterator is at
+  // the end or fewer than len bytes remain.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy_shallow(unsigned len,
+							   ptr &dest)
+  {
+    if (!len) {
+      return;
+    }
+    if (p == ls->end()) {
+      throw end_of_buffer();
+    }
+    const unsigned avail = p->length() - p_off;
+    if (len <= avail) {
+      // fits in the current buffer: share it
+      dest = ptr(*p, p_off, len);
+      *this += len;
+      return;
+    }
+    // spans several buffers: fall back to a deep copy
+    dest = create(len);
+    copy(len, dest.c_str());
+  }
+
+  // Append the next len bytes to the list dest, buffer by buffer via
+  // dest.append(ptr, off, len), and advance past them. Throws end_of_buffer
+  // when fewer than len bytes remain.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, list &dest)
+  {
+    if (p == ls->end()) {
+      seek(off);
+    }
+    while (len > 0) {
+      if (p == ls->end()) {
+        throw end_of_buffer();
+      }
+
+      // take whatever is left in this buffer, capped at the remaining request
+      const unsigned avail = p->length() - p_off;
+      const unsigned chunk = len < avail ? len : avail;
+      dest.append(*p, p_off, chunk);
+
+      len -= chunk;
+      *this += chunk;
+    }
+  }
+
+  // Append the next len bytes to the string dest and advance past them.
+  // Throws end_of_buffer when fewer than len bytes remain.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, std::string &dest)
+  {
+    if (p == ls->end()) {
+      seek(off);
+    }
+    while (len > 0) {
+      if (p == ls->end()) {
+        throw end_of_buffer();
+      }
+
+      const unsigned avail = p->length() - p_off;
+      const char *base = p->c_str();
+      // take whatever is left in this buffer, capped at the remaining request
+      const unsigned chunk = len < avail ? len : avail;
+      dest.append(base + p_off, chunk);
+
+      len -= chunk;
+      *this += chunk;
+    }
+  }
+
+  // Append every byte from the current position to the end of the list to
+  // dest, leaving the iterator at the end.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy_all(list &dest)
+  {
+    if (p == ls->end()) {
+      seek(off);
+    }
+    while (p != ls->end()) {
+      // everything left in the current buffer
+      const unsigned rest = p->length() - p_off;
+      const char *base = p->c_str();
+      dest.append(base + p_off, rest);
+
+      *this += rest;
+    }
+  }
+
+  // Expose the contiguous bytes at the current position without copying:
+  // *data is set to point at them, and the number of bytes available there
+  // (at most 'want') is returned after the iterator has been advanced past
+  // them. Returns 0, leaving *data untouched, at the end of the list.
+  template<bool is_const>
+  size_t buffer::list::iterator_impl<is_const>::get_ptr_and_advance(
+    size_t want, const char **data)
+  {
+    if (p == ls->end()) {
+      seek(off);
+      if (p == ls->end()) {
+        return 0;
+      }
+    }
+    *data = p->c_str() + p_off;
+    const size_t taken = std::min<size_t>(p->length() - p_off, want);
+    p_off += taken;
+    // consumed the whole buffer: move on to the start of the next one
+    if (p_off == p->length()) {
+      ++p;
+      p_off = 0;
+    }
+    off += taken;
+    return taken;
+  }
+
+  // Fold up to 'length' bytes from the current position into crc using
+  // ceph_crc32c(), advancing the iterator past them. The length is clamped to
+  // the bytes remaining. Returns the updated crc.
+  template<bool is_const>
+  uint32_t buffer::list::iterator_impl<is_const>::crc32c(
+    size_t length, uint32_t crc)
+  {
+    size_t todo = std::min<size_t>(length, get_remaining());
+    while (todo > 0) {
+      // named 'chunk' so it does not shadow the iterator member 'p'
+      const char *chunk;
+      const size_t got = get_ptr_and_advance(todo, &chunk);
+      crc = ceph_crc32c(crc, (unsigned char*)chunk, got);
+      todo -= got;
+    }
+    return crc;
+  }
+
+  // explicitly instantiate only the iterator types we need, so we can hide the
+  // details in this compilation unit without introducing unnecessary link time
+  // dependencies.
+  // (both values of the is_const template parameter are instantiated)
+  template class buffer::list::iterator_impl<true>;
+  template class buffer::list::iterator_impl<false>;
+
+  // Mutable iterator over l positioned o bytes from its start; forwards to
+  // the iterator_impl constructor.
+  buffer::list::iterator::iterator(bl_t *l, unsigned o)
+    : iterator_impl(l, o)
+  {}
+
+  // Mutable iterator built from an explicit position: list offset o, buffer
+  // iterator ip and offset po within that buffer.
+  buffer::list::iterator::iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+    : iterator_impl(l, o, ip, po)
+  {}
+
+  // copy data in
+  //
+  // Overwrite the next len bytes of the list with the bytes at src and
+  // advance past them; crc_reset is passed on to ptr::copy_in() for every
+  // buffer touched. Throws end_of_buffer when fewer than len bytes remain.
+  void buffer::list::iterator::copy_in(unsigned len, const char *src, bool crc_reset)
+  {
+    if (p == ls->end()) {
+      seek(off);
+    }
+    while (len > 0) {
+      if (p == ls->end()) {
+        throw end_of_buffer();
+      }
+
+      // fill whatever is left of this buffer, capped at the remaining input
+      const unsigned avail = p->length() - p_off;
+      const unsigned chunk = len < avail ? len : avail;
+      p->copy_in(p_off, chunk, src, crc_reset);
+
+      src += chunk;
+      len -= chunk;
+      *this += chunk;
+    }
+  }
+  
+  // Overwrite the next bytes of the list with up to len bytes taken from the
+  // front of otherl, advancing past them. Copying stops early when otherl
+  // holds fewer than len bytes.
+  void buffer::list::iterator::copy_in(unsigned len, const list& otherl)
+  {
+    if (p == ls->end()) {
+      seek(off);
+    }
+    unsigned remaining = len;
+    for (const auto& node : otherl._buffers) {
+      // never take more from this buffer than is still wanted
+      const unsigned chunk =
+        remaining < node.length() ? remaining : node.length();
+      copy_in(chunk, node.c_str());
+      remaining -= chunk;
+      if (remaining == 0) {
+        break;
+      }
+    }
+  }
+
+  // -- buffer::list --
+
+  // Exchange the complete contents of two lists: the buffer chain plus the
+  // cached length, buffer count and carriage pointer.
+  void buffer::list::swap(list& other) noexcept
+  {
+    _buffers.swap(other._buffers);
+    std::swap(_carriage, other._carriage);
+    std::swap(_num, other._num);
+    std::swap(_len, other._len);
+  }
+
+  // True when both lists hold the same byte sequence, regardless of how the
+  // bytes are split across buffers.
+  bool buffer::list::contents_equal(const ceph::buffer::list& other) const
+  {
+    if (length() != other.length())
+      return false;
+
+    // buffer-wise comparison: walk both chains in lockstep, comparing the
+    // largest run that is contiguous in both
+    auto a = std::cbegin(_buffers);
+    auto b = std::cbegin(other._buffers);
+    const auto a_end = std::cend(_buffers);
+    const auto b_end = std::cend(other._buffers);
+    unsigned aoff = 0, boff = 0;
+    // The total lengths are equal, so once either chain is exhausted the
+    // other can only hold zero-length buffers. Stopping there avoids
+    // dereferencing b at its end when this list has trailing empty buffers.
+    while (a != a_end && b != b_end) {
+      unsigned len = a->length() - aoff;
+      if (len > b->length() - boff)
+	len = b->length() - boff;
+      if (memcmp(a->c_str() + aoff, b->c_str() + boff, len) != 0)
+	return false;
+      aoff += len;
+      if (aoff == a->length()) {
+	aoff = 0;
+	++a;
+      }
+      boff += len;
+      if (boff == b->length()) {
+	boff = 0;
+	++b;
+      }
+    }
+    return true;
+  }
+
+  // True when the list holds exactly the 'length' bytes found at 'other'.
+  bool buffer::list::contents_equal(const void* const other,
+                                    size_t length) const
+  {
+    if (this->length() != length) {
+      return false;
+    }
+
+    // walk the flat buffer in step with the chain of ptrs
+    const char* cursor = reinterpret_cast<const char*>(other);
+    for (const auto& bp : buffers()) {
+      const auto chunk = bp.length();
+      assert(chunk <= length);
+      if (std::memcmp(bp.c_str(), cursor, chunk) != 0) {
+        return false;
+      }
+      length -= chunk;
+      cursor += chunk;
+    }
+
+    return true;
+  }
+
+  // True when the list is non-empty, contiguous, and its first buffer starts
+  // exactly at dst.
+  bool buffer::list::is_provided_buffer(const char* const dst) const
+  {
+    return !_buffers.empty() &&
+           is_contiguous() &&
+           _buffers.front().c_str() == dst;
+  }
+
+  // True when every buffer in the list satisfies ptr::is_aligned(align);
+  // trivially true for an empty list.
+  bool buffer::list::is_aligned(const unsigned align) const
+  {
+    for (auto it = std::cbegin(_buffers); it != std::cend(_buffers); ++it) {
+      if (!it->is_aligned(align)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // True when every buffer in the list satisfies
+  // ptr::is_n_align_sized(align); trivially true for an empty list.
+  bool buffer::list::is_n_align_sized(const unsigned align) const
+  {
+    for (auto it = std::cbegin(_buffers); it != std::cend(_buffers); ++it) {
+      if (!it->is_n_align_sized(align)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // True when every buffer satisfies both is_aligned(align_memory) and
+  // is_n_align_sized(align_size); trivially true for an empty list.
+  bool buffer::list::is_aligned_size_and_memory(
+    const unsigned align_size,
+    const unsigned align_memory) const
+  {
+    for (auto it = std::cbegin(_buffers); it != std::cend(_buffers); ++it) {
+      const bool ok =
+        it->is_aligned(align_memory) && it->is_n_align_sized(align_size);
+      if (!ok) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // True when every buffer in the list reports is_zero(); trivially true for
+  // an empty list.
+  bool buffer::list::is_zero() const {
+    for (auto it = std::cbegin(_buffers); it != std::cend(_buffers); ++it) {
+      if (!it->is_zero()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Zero-fill every buffer in the list.
+  void buffer::list::zero()
+  {
+    for (auto it = std::begin(_buffers); it != std::end(_buffers); ++it) {
+      it->zero();
+    }
+  }
+
+  // Zero-fill the byte range [o, o+l) of the list, which must lie within it.
+  // 'p' tracks the list offset at which the current node starts; each node
+  // overlapping the range has the overlapping part zeroed.
+  void buffer::list::zero(const unsigned o, const unsigned l)
+  {
+    ceph_assert(o+l <= _len);
+    unsigned p = 0;
+    for (auto& node : _buffers) {
+      // only nodes that end after the start of the range can overlap it
+      if (p + node.length() > o) {
+        if (p >= o && p+node.length() <= o+l) {
+          // 'o'------------- l -----------|
+          //      'p'-- node.length() --|
+	  node.zero();
+        } else if (p >= o) {
+          // 'o'------------- l -----------|
+          //    'p'------- node.length() -------|
+	  node.zero(0, o+l-p);
+        } else if (p + node.length() <= o+l) {
+          //     'o'------------- l -----------|
+          // 'p'------- node.length() -------|
+	  node.zero(o-p, node.length()-(o-p));
+        } else {
+          //       'o'----------- l -----------|
+          // 'p'---------- node.length() ----------|
+          node.zero(o-p, l);
+        }
+      }
+      p += node.length();
+      if (o+l <= p) {
+	break;  // done
+      }
+    }
+  }
+
+  // True when the list consists of at most one buffer.
+  bool buffer::list::is_contiguous() const
+  {
+    return _num <= 1;
+  }
+
+  // is_n_align_sized() with CEPH_PAGE_SIZE as the alignment.
+  bool buffer::list::is_n_page_sized() const
+  {
+    return is_n_align_sized(CEPH_PAGE_SIZE);
+  }
+
+  // is_aligned() with CEPH_PAGE_SIZE as the alignment.
+  bool buffer::list::is_page_aligned() const
+  {
+    return is_aligned(CEPH_PAGE_SIZE);
+  }
+
+  // Mempool of the last buffer in the list, or mempool_buffer_anon when the
+  // list holds no buffers.
+  int buffer::list::get_mempool() const
+  {
+    if (!_buffers.empty()) {
+      return _buffers.back().get_mempool();
+    }
+    return mempool::mempool_buffer_anon;
+  }
+
+  // Reassign the raw behind every buffer in the list to the given mempool.
+  void buffer::list::reassign_to_mempool(int pool)
+  {
+    for (auto& p : _buffers) {
+      // go through ptr::reassign_to_mempool(), which tolerates a ptr without
+      // a raw, rather than dereferencing p._raw directly
+      p.reassign_to_mempool(pool);
+    }
+  }
+
+  // Call try_assign_to_mempool() on the raw behind every buffer in the list.
+  void buffer::list::try_assign_to_mempool(int pool)
+  {
+    for (auto& p : _buffers) {
+      // go through ptr::try_assign_to_mempool(), which tolerates a ptr
+      // without a raw, rather than dereferencing p._raw directly
+      p.try_assign_to_mempool(pool);
+    }
+  }
+
+  // Estimate how many bytes of the underlying raws are not referenced by this
+  // list: the summed length of all distinct raws minus the list length.
+  uint64_t buffer::list::get_wasted_space() const
+  {
+    if (_num == 1) {
+      return _buffers.back().wasted();
+    }
+
+    // collect the raws and sort them so duplicates become adjacent
+    std::vector<const raw*> raws;
+    raws.reserve(_num);
+    for (const auto& node : _buffers) {
+      raws.push_back(node._raw);
+    }
+    std::sort(raws.begin(), raws.end());
+
+    // add up each distinct raw exactly once
+    uint64_t raw_bytes = 0;
+    const raw *previous = nullptr;
+    for (const raw *current : raws) {
+      if (current != previous) {
+        previous = current;
+        raw_bytes += current->get_len();
+      }
+    }
+    // If multiple buffers are sharing the same raw buffer and they overlap
+    // with each other, the wasted space will be underestimated.
+    return raw_bytes > length() ? raw_bytes - length() : 0;
+  }
+
+  // Collapse the list into a single contiguous buffer. An empty list simply
+  // drops all of its (zero-length) buffers. The new buffer is page aligned
+  // when the list length is a whole number of pages.
+  void buffer::list::rebuild()
+  {
+    if (_len == 0) {
+      _carriage = &always_empty_bptr;
+      _buffers.clear_and_dispose();
+      _num = 0;
+      return;
+    }
+
+    const bool whole_pages = (_len & ~CEPH_PAGE_MASK) == 0;
+    if (whole_pages) {
+      rebuild(ptr_node::create(buffer::create_page_aligned(_len)));
+    } else {
+      rebuild(ptr_node::create(buffer::create(_len)));
+    }
+  }
+
+  // Replace the list's buffers with the single node nb after copying all
+  // current contents into it. nb must be large enough for the whole list;
+  // _buffers must not be empty, since front() is used to pick the mempool.
+  void buffer::list::rebuild(
+    std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer> nb)
+  {
+    unsigned pos = 0;
+    // keep the new buffer in the same mempool as the first existing one
+    int mempool = _buffers.front().get_mempool();
+    nb->reassign_to_mempool(mempool);
+    for (auto& node : _buffers) {
+      // crc_reset is false here; invalidate_crc() is called once at the end
+      nb->copy_in(pos, node.length(), node.c_str(), false);
+      pos += node.length();
+    }
+    _buffers.clear_and_dispose();
+    if (likely(nb->length())) {
+      _carriage = nb.get();
+      // ownership of the node passes from the unique_ptr to the list
+      _buffers.push_back(*nb.release());
+      _num = 1;
+    } else {
+      // zero-length replacement: the list stays empty and nb is disposed of
+      // when it goes out of scope
+      _carriage = &always_empty_bptr;
+      _num = 0;
+    }
+    invalidate_crc();
+  }
+
+  // Shorthand for rebuild_aligned_size_and_memory() with the same value used
+  // for both the size and the memory alignment; returns its result.
+  bool buffer::list::rebuild_aligned(unsigned align)
+  {
+    return rebuild_aligned_size_and_memory(align, align);
+  }
+  
+  /*
+   * walk the list and keep every node for which both
+   * is_aligned(align_memory) and is_n_align_sized(align_size) hold.
+   * runs of nodes that fail the check are unlinked, gathered into a
+   * temporary list and -- unless that temporary list is already contiguous
+   * with an aligned front -- copied into a new buffer obtained from
+   * create_aligned(). the (single) resulting node is linked back in place.
+   *
+   * when max_buffers is non-zero, the list has more nodes than that and
+   * _len exceeds max_buffers * align_size, align_size is first recomputed
+   * from _len / max_buffers, rounded up to a multiple of align_size.
+   *
+   * returns true if at least one run had to be copied.
+   */
+  bool buffer::list::rebuild_aligned_size_and_memory(unsigned align_size,
+						    unsigned align_memory,
+						    unsigned max_buffers)
+  {
+    bool had_to_rebuild = false;
+
+    if (max_buffers && _num > max_buffers && _len > (max_buffers * align_size)) {
+      align_size = round_up_to(round_up_to(_len, max_buffers) / max_buffers, align_size);
+    }
+    auto p = std::begin(_buffers);
+    // p_prev always trails p by one node; erase_after/insert_after need it
+    auto p_prev = _buffers.before_begin();
+    while (p != std::end(_buffers)) {
+      // keep anything that's already align and sized aligned
+      if (p->is_aligned(align_memory) && p->is_n_align_sized(align_size)) {
+        /*cout << " segment " << (void*)p->c_str()
+  	     << " offset " << ((unsigned long)p->c_str() & (align - 1))
+  	     << " length " << p->length()
+  	     << " " << (p->length() & (align - 1)) << " ok" << std::endl;
+        */
+        p_prev = p++;
+        continue;
+      }
+      
+      // consolidate unaligned items, until we get something that is sized+aligned
+      list unaligned;
+      unsigned offset = 0;
+      do {
+        /*cout << " segment " << (void*)p->c_str()
+               << " offset " << ((unsigned long)p->c_str() & (align - 1))
+               << " length " << p->length() << " " << (p->length() & (align - 1))
+               << " overall offset " << offset << " " << (offset & (align - 1))
+  	     << " not ok" << std::endl;
+        */
+        offset += p->length();
+        // no need to reallocate, relinking is enough thankfully to bi::list.
+        auto p_after = _buffers.erase_after(p_prev);
+        _num -= 1;
+        unaligned._buffers.push_back(*p);
+        unaligned._len += p->length();
+        unaligned._num += 1;
+        p = p_after;
+      } while (p != std::end(_buffers) &&
+  	     (!p->is_aligned(align_memory) ||
+  	      !p->is_n_align_sized(align_size) ||
+  	      (offset % align_size)));
+      // only copy when the gathered run is not already usable as it is
+      if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align_memory))) {
+        unaligned.rebuild(
+          ptr_node::create(
+            buffer::create_aligned(unaligned._len, align_memory)));
+        had_to_rebuild = true;
+      }
+      if (unaligned.get_num_buffers()) {
+        // link a new node created from the temporary list's front node
+        _buffers.insert_after(p_prev, *ptr_node::create(unaligned._buffers.front()).release());
+        _num += 1;
+      } else {
+        // a bufferlist containing only 0-length bptrs is rebuilt as empty
+      }
+      ++p_prev;
+    }
+    return had_to_rebuild;
+  }
+  
+  // rebuild_aligned() with CEPH_PAGE_SIZE as the alignment.
+  bool buffer::list::rebuild_page_aligned()
+  {
+    const bool rebuilt = rebuild_aligned(CEPH_PAGE_SIZE);
+    return rebuilt;
+  }
+
+  /*
+   * make sure `prealloc` bytes can be appended without a further
+   * allocation: if the unused tail is too short, link a new, still empty,
+   * node and make it the carriage.
+   */
+  void buffer::list::reserve(size_t prealloc)
+  {
+    if (get_append_buffer_unused_tail_length() >= prealloc) {
+      return;   // the current tail is large enough
+    }
+    auto fresh = ptr_node::create(buffer::create_small_page_aligned(prealloc));
+    fresh->set_length(0);   // nothing has been written into it yet
+    _carriage = fresh.get();
+    _buffers.push_back(*fresh.release());
+    _num += 1;
+  }
+
+  /*
+   * move all nodes of `bl` to the tail of this list and clear `bl`.
+   */
+  void buffer::list::claim_append(list& bl)
+  {
+    // the combined length must not wrap around
+    assert(_len + bl._len >= _len);
+
+    // take over the counters, then relink the nodes themselves
+    _num += bl._num;
+    _len += bl._len;
+    _buffers.splice_back(bl._buffers);
+    bl.clear();
+  }
+
+  /*
+   * append a single byte, allocating a new append buffer when the carriage
+   * has no unused tail left.
+   */
+  void buffer::list::append(char c)
+  {
+    const bool have_room = (get_append_buffer_unused_tail_length() != 0);
+    if (have_room) {
+      if (unlikely(_carriage != &_buffers.back())) {
+        // the carriage is not the last node: link a zero-length node that
+        // starts at the carriage's current end and continue from there
+        auto tail = ptr_node::create(*_carriage, _carriage->length(), 0);
+        _carriage = tail.get();
+        _buffers.push_back(*tail.release());
+        _num += 1;
+      }
+    } else {
+      // no space left, make a new buffer
+      auto fresh = ptr_node::create(
+        raw_combined::create(CEPH_BUFFER_APPEND_SIZE, 0, get_mempool()));
+      fresh->set_length(0);   // unused, so far.
+      _carriage = fresh.get();
+      _buffers.push_back(*fresh.release());
+      _num += 1;
+    }
+    _carriage->append(c);
+    ++_len;
+  }
+
+  // definition of the static placeholder node; rebuild() points _carriage
+  // at it whenever the list ends up without any linked node.
+  buffer::ptr_node buffer::list::always_empty_bptr;
+
+  /*
+   * allocate a new, empty append buffer able to hold at least `len` bytes,
+   * link it at the tail, make it the carriage and return it.
+   */
+  buffer::ptr_node& buffer::list::refill_append_space(const unsigned len)
+  {
+    // size the allocation in whole CEPH_BUFFER_ALLOC_UNIT chunks, counting
+    // the raw_combined overhead as part of the chunk.
+    size_t with_header = round_up_to(len, sizeof(size_t)) + sizeof(raw_combined);
+    size_t alloc_len = round_up_to(with_header, CEPH_BUFFER_ALLOC_UNIT);
+    if (_carriage == &_buffers.back()) {
+      // twice the previous raw length, capped at CEPH_BUFFER_ALLOC_UNIT_MAX,
+      // unless the request itself needs more
+      size_t doubled =
+        round_up_to(_carriage->raw_length(), CEPH_BUFFER_ALLOC_UNIT) * 2;
+      doubled = std::min(doubled, CEPH_BUFFER_ALLOC_UNIT_MAX);
+      alloc_len = std::max(alloc_len, doubled);
+    }
+    alloc_len -= sizeof(raw_combined);
+
+    auto fresh =
+      ptr_node::create(raw_combined::create(alloc_len, 0, get_mempool()));
+    fresh->set_length(0);   // unused, so far.
+    _carriage = fresh.get();
+    _buffers.push_back(*fresh.release());
+    _num += 1;
+    return _buffers.back();
+  }
+
+  /*
+   * append `len` bytes from `data`: first into the unused tail of the
+   * carriage, then -- for whatever did not fit -- into a new append buffer.
+   */
+  void buffer::list::append(const char *data, unsigned len)
+  {
+    _len += len;
+
+    const unsigned tail_room = get_append_buffer_unused_tail_length();
+    const unsigned head_part = std::min(len, tail_room);
+    if (head_part != 0) {
+      // the carriage may no longer be the last node, either because a ptr
+      // we do not own was pushed to _buffers, or because _buffers was
+      // emptied by std::move or claim_append. re-link it in that case.
+      if (unlikely(_carriage != &_buffers.back())) {
+        auto tail = ptr_node::create(*_carriage, _carriage->length(), 0);
+        _carriage = tail.get();
+        _buffers.push_back(*tail.release());
+        _num += 1;
+      }
+      _carriage->append(data, head_part);
+    }
+
+    const unsigned rest = len - head_part;
+    if (rest == 0) {
+      return;
+    }
+    auto& fresh = refill_append_space(rest);
+    fresh.append(data + head_part, rest);
+  }
+
+  /*
+   * hand out `len` contiguous bytes at the tail of the list. the returned
+   * reserve_t carries the address to write to plus pointers to the node
+   * length and the list length.
+   */
+  buffer::list::reserve_t buffer::list::obtain_contiguous_space(
+    const unsigned len)
+  {
+    if (likely(get_append_buffer_unused_tail_length() >= len)) {
+      // the existing append buffer has enough room left
+      ceph_assert(!_buffers.empty());
+      if (unlikely(_carriage != &_buffers.back())) {
+        auto tail = ptr_node::create(*_carriage, _carriage->length(), 0);
+        _carriage = tail.get();
+        _buffers.push_back(*tail.release());
+        _num += 1;
+      }
+      return { _carriage->end_c_str(), &_carriage->_len, &_len };
+    }
+
+    // note: for a len below the normal append_buffer size it *might* be
+    // better to allocate a normal-sized append_buffer and use a part of
+    // it. that would optimize for old-style types including new-style
+    // types, though, and in most such cases this is not the very first
+    // thing encoded to the list, so an append_buffer already exists.
+    // OTOH if everything is new-style we *should* allocate only what is
+    // needed and conserve memory -- which is what happens here.
+    auto* const fresh =
+      buffer::ptr_node::create(buffer::create(len)).release();
+    fresh->set_length(0);   // unused, so far.
+    _buffers.push_back(*fresh);
+    _num += 1;
+    _carriage = fresh;
+    return { fresh->c_str(), &fresh->_len, &_len };
+  }
+
+  // const-reference overload: forwards to push_back().
+  void buffer::list::append(const ptr& bp)
+  {
+    push_back(bp);
+  }
+
+  // rvalue overload: forwards to push_back(), moving bp along.
+  void buffer::list::append(ptr&& bp)
+  {
+    push_back(std::move(bp));
+  }
+
+  /*
+   * append the range [off, off + len) of `bp`. when that range directly
+   * follows the last node inside the same raw, the last node is extended
+   * instead of linking a new one.
+   */
+  void buffer::list::append(const ptr& bp, unsigned off, unsigned len)
+  {
+    ceph_assert(off + len <= bp.length());
+
+    ptr* const tail = _buffers.empty() ? nullptr : &_buffers.back();
+    const bool extends_tail =
+      tail != nullptr &&
+      tail->_raw == bp._raw &&
+      tail->end() == bp.start() + off;
+    if (extends_tail) {
+      // contiguous with the tail: just grow it
+      tail->set_length(tail->length() + len);
+      _len += len;
+      return;
+    }
+
+    // otherwise reference the range through a node of its own
+    _buffers.push_back(*ptr_node::create(bp, off, len).release());
+    _len += len;
+    _num += 1;
+  }
+
+  // append a newly created node for every node of `bl`; `bl` is untouched.
+  void buffer::list::append(const list& bl)
+  {
+    _num += bl._num;
+    _len += bl._len;
+    for (const auto& src : bl._buffers) {
+      auto copy = ptr_node::create(src);
+      _buffers.push_back(*copy.release());
+    }
+  }
+
+  /*
+   * read `in` line by line until eof and append each line. a "\n" is
+   * appended after a line only when that line is not empty.
+   */
+  void buffer::list::append(std::istream& in)
+  {
+    for (;;) {
+      if (in.eof()) {
+        break;
+      }
+      std::string line;
+      getline(in, line);
+      append(line.c_str(), line.length());
+      if (line.length() != 0) {
+        append("\n", 1);
+      }
+    }
+  }
+
+  /*
+   * grow the list by `len` bytes without writing them and return a filler
+   * positioned at the start of the new region.
+   */
+  buffer::list::contiguous_filler buffer::list::append_hole(const unsigned len)
+  {
+    _len += len;
+
+    if (unlikely(get_append_buffer_unused_tail_length() < len)) {
+      // does not fit: the hole becomes the start of a new append buffer
+      auto& fresh = refill_append_space(len);
+      fresh.set_length(len);
+      return { fresh.c_str() };
+    }
+
+    if (unlikely(_carriage != &_buffers.back())) {
+      auto tail = ptr_node::create(*_carriage, _carriage->length(), 0);
+      _carriage = tail.get();
+      _buffers.push_back(*tail.release());
+      _num += 1;
+    }
+    const unsigned grown = _carriage->length() + len;
+    _carriage->set_length(grown);
+    return { _carriage->end_c_str() - len };
+  }
+
+  // link a new node of `len` bytes, zeroed via zero(false), at the front.
+  void buffer::list::prepend_zero(unsigned len)
+  {
+    auto zeros = ptr_node::create(len);
+    zeros->zero(false);
+    _num += 1;
+    _len += len;
+    _buffers.push_front(*zeros.release());
+  }
+  
+  /*
+   * append `len` zero bytes: first into the unused tail of the carriage,
+   * then -- for whatever did not fit -- into a new append buffer.
+   */
+  void buffer::list::append_zero(unsigned len)
+  {
+    _len += len;
+
+    const unsigned tail_room = get_append_buffer_unused_tail_length();
+    const unsigned head_part = std::min(len, tail_room);
+    if (head_part != 0) {
+      if (unlikely(_carriage != &_buffers.back())) {
+        auto tail = ptr_node::create(*_carriage, _carriage->length(), 0);
+        _carriage = tail.get();
+        _buffers.push_back(*tail.release());
+        _num += 1;
+      }
+      _carriage->append_zeros(head_part);
+    }
+
+    const unsigned rest = len - head_part;
+    if (rest == 0) {
+      return;
+    }
+    auto& fresh = refill_append_space(rest);
+    fresh.set_length(rest);
+    fresh.zero(false);
+  }
+
+  
+  /*
+   * return the byte at list offset n; throws end_of_buffer when n is not
+   * below the list length.
+   */
+  const char& buffer::list::operator[](unsigned n) const
+  {
+    if (_len <= n) {
+      throw end_of_buffer();
+    }
+
+    unsigned remaining = n;
+    for (const auto& node : _buffers) {
+      const unsigned node_len = node.length();
+      if (remaining < node_len) {
+        return node[remaining];
+      }
+      remaining -= node_len;   // not in this node, move on
+    }
+    ceph_abort();
+  }
+
+  /*
+   * return a pointer to the whole contents as one contiguous region,
+   * rebuilding the list first when needed. nullptr for an empty list.
+   */
+  char *buffer::list::c_str()
+  {
+    const auto total = length();
+    if (total == 0) {
+      return nullptr;                         // no non-empty buffers
+    }
+    // when the first node already covers the whole length there is nothing
+    // to do. this is mainly the case for
+    //   1. a bufferlist with a single, non-empty buffer;
+    //   2. a bufferlist with a single, non-empty buffer followed by an
+    //      empty one. splice() tries not to waste our appendable space;
+    //      to carry it, an empty bptr is added at the end.
+    // both are accounted for here and do not trigger a rebuild.
+    if (total != _buffers.front().length()) {
+      rebuild();
+    }
+    return _buffers.front().c_str();
+  }
+
+  // copy the contents of all non-empty nodes into one string.
+  string buffer::list::to_str() const {
+    string out;
+    out.reserve(length());
+    for (const auto& node : _buffers) {
+      if (node.length() == 0) {
+        continue;
+      }
+      out.append(node.c_str(), node.length());
+    }
+    return out;
+  }
+
+  /*
+   * make this list reference the byte range [off, off + len) of `other`.
+   * the previous contents of this list are dropped; no data is copied,
+   * new nodes referring to the nodes of `other` are created instead.
+   *
+   * throws end_of_buffer when the range does not fit into `other`.
+   */
+  void buffer::list::substr_of(const list& other, unsigned off, unsigned len)
+  {
+    // written so that it cannot wrap around: a plain `off + len` overflows
+    // for large arguments and would let an out-of-range request through.
+    const unsigned other_len = other.length();
+    if (off > other_len || len > other_len - off)
+      throw end_of_buffer();
+
+    clear();
+
+    // skip off
+    auto curbuf = std::cbegin(other._buffers);
+    while (off > 0 && off >= curbuf->length()) {
+      // skip this buffer
+      //cout << "skipping over " << *curbuf << std::endl;
+      off -= (*curbuf).length();
+      ++curbuf;
+    }
+    ceph_assert(len == 0 || curbuf != std::cend(other._buffers));
+
+    while (len > 0) {
+      // partial?
+      if (off + len < curbuf->length()) {
+        //cout << "copying partial of " << *curbuf << std::endl;
+        _buffers.push_back(*ptr_node::create(*curbuf, off, len).release());
+        _len += len;
+        _num += 1;
+        break;
+      }
+
+      // through end
+      //cout << "copying end (all?) of " << *curbuf << std::endl;
+      unsigned howmuch = curbuf->length() - off;
+      _buffers.push_back(*ptr_node::create(*curbuf, off, howmuch).release());
+      _len += howmuch;
+      _num += 1;
+      len -= howmuch;
+      off = 0;   // every following node is taken from its beginning
+      ++curbuf;
+    }
+  }
+
+  /*
+   * funky modifier: remove the byte range [off, off + len) from this list.
+   * if claim_by is non-null, the removed bytes are appended to *claim_by.
+   *
+   * a zero len is a no-op; an off at or past the end of the list throws
+   * end_of_buffer.
+   * NOTE(review): off + len is not checked against length() here --
+   * presumably callers guarantee that the range fits; confirm.
+   */
+  void buffer::list::splice(unsigned off, unsigned len, list *claim_by /*, bufferlist& replace_with */)
+  {    // fixme?
+    if (len == 0)
+      return;
+
+    if (off >= length())
+      throw end_of_buffer();
+
+    ceph_assert(len > 0);
+    //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << std::endl;
+      
+    // skip off
+    auto curbuf = std::begin(_buffers);
+    // trails curbuf by one node; needed by insert_after/erase_after
+    auto curbuf_prev = _buffers.before_begin();
+    while (off > 0) {
+      ceph_assert(curbuf != std::end(_buffers));
+      if (off >= (*curbuf).length()) {
+	// skip this buffer
+	//cout << "off = " << off << " skipping over " << *curbuf << std::endl;
+	off -= (*curbuf).length();
+	curbuf_prev = curbuf++;
+      } else {
+	// somewhere in this buffer!
+	//cout << "off = " << off << " somewhere in " << *curbuf << std::endl;
+	break;
+      }
+    }
+    
+    if (off) {
+      // add a reference to the front bit, insert it before curbuf (which
+      // we'll lose).
+      //cout << "keeping front " << off << " of " << *curbuf << std::endl;
+      _buffers.insert_after(curbuf_prev,
+			    *ptr_node::create(*curbuf, 0, off).release());
+      // the kept front bit is counted twice for now; the subtraction of
+      // to_drop / curbuf->length() below compensates for it
+      _len += off;
+      _num += 1;
+      ++curbuf_prev;
+    }
+    
+    while (len > 0) {
+      // partial or the last (appendable) one?
+      if (const auto to_drop = off + len; to_drop < curbuf->length()) {
+	//cout << "keeping end of " << *curbuf << ", losing first " << off+len << std::endl;
+	if (claim_by) 
+	  claim_by->append(*curbuf, off, len);
+	curbuf->set_offset(to_drop + curbuf->offset());    // ignore beginning big
+	curbuf->set_length(curbuf->length() - to_drop);
+	_len -= to_drop;
+	//cout << " now " << *curbuf << std::endl;
+	break;
+      }
+
+      // hose though the end
+      unsigned howmuch = curbuf->length() - off;
+      //cout << "discarding " << howmuch << " of " << *curbuf << std::endl;
+      if (claim_by) 
+	claim_by->append(*curbuf, off, howmuch);
+      _len -= curbuf->length();
+      if (curbuf == _carriage) {
+        // no need to reallocate, shrinking and relinking is enough.
+        curbuf = _buffers.erase_after(curbuf_prev);
+	_carriage->set_offset(_carriage->offset() + _carriage->length());
+	_carriage->set_length(0);
+	// the emptied carriage goes back to the tail, so _num is unchanged
+	_buffers.push_back(*_carriage);
+      } else {
+	curbuf = _buffers.erase_after_and_dispose(curbuf_prev);
+	_num -= 1;
+      }
+      len -= howmuch;
+      off = 0;
+    }
+
+    // splice in *replace (implement me later?)
+  }
+
+  // write the byte range [off, off + len) of the list to `out`.
+  void buffer::list::write(int off, int len, std::ostream& out) const
+  {
+    list window;
+    window.substr_of(*this, off, len);
+    for (const auto& node : window._buffers) {
+      if (node.length() == 0) {
+        continue;
+      }
+      out.write(node.c_str(), node.length());
+    }
+  }
+  
+// armor (base64 encode) the contents of this list and append the result
+// to `o` as a single buffer.
+void buffer::list::encode_base64(buffer::list& o)
+{
+  bufferptr armored(length() * 4 / 3 + 3);
+  char* const dst = armored.c_str();
+  const char* const src = c_str();
+  const int armored_len =
+    ceph_armor(dst, dst + armored.length(), src, src + length());
+  armored.set_length(armored_len);
+  o.push_back(std::move(armored));
+}
+
+// unarmor (base64 decode) the contents of `e` and append the result to
+// this list as a single buffer.
+// throws buffer::malformed_input, carrying a hexdump of the offending
+// input, when ceph_unarmor() reports a failure.
+void buffer::list::decode_base64(buffer::list& e)
+{
+  bufferptr bp(4 + ((e.length() * 3) / 4));
+  int l = ceph_unarmor(bp.c_str(), bp.c_str() + bp.length(), e.c_str(), e.c_str() + e.length());
+  if (l < 0) {
+    std::ostringstream oss;
+    oss << "decode_base64: decoding failed:\n";
+    // dump the input that could not be decoded, not the destination list
+    e.hexdump(oss);
+    throw buffer::malformed_input(oss.str().c_str());
+  }
+  ceph_assert(l <= (int)bp.length());
+  bp.set_length(l);
+  push_back(std::move(bp));
+}
+
+/*
+ * read up to `len` bytes from file `fn`, starting at byte offset `off`,
+ * and append them to this list. `len` is clamped to what the file holds
+ * past `off` according to fstat().
+ *
+ * returns 0 on success or a negative errno on failure. an offset past the
+ * end of the file and a premature EOF are reported through *error only;
+ * the return value is 0 for both. on every failure a description is
+ * stored in *error, and the descriptor is closed on every path.
+ */
+ssize_t buffer::list::pread_file(const char *fn, uint64_t off, uint64_t len, std::string *error)
+{
+  int fd = TEMP_FAILURE_RETRY(::open(fn, O_RDONLY|O_CLOEXEC|O_BINARY));
+  if (fd < 0) {
+    int err = errno;
+    std::ostringstream oss;
+    oss << "can't open " << fn << ": " << cpp_strerror(err);
+    *error = oss.str();
+    return -err;
+  }
+
+  struct stat st;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&st, 0, sizeof(st));
+  if (::fstat(fd, &st) < 0) {
+    int err = errno;
+    std::ostringstream oss;
+    oss << "bufferlist::read_file(" << fn << "): stat error: "
+        << cpp_strerror(err);
+    *error = oss.str();
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return -err;
+  }
+
+  if (off > (uint64_t)st.st_size) {
+    std::ostringstream oss;
+    oss << "bufferlist::read_file(" << fn << "): read error: size < offset";
+    *error = oss.str();
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return 0;
+  }
+
+  if (len > st.st_size - off) {
+    len = st.st_size - off;
+  }
+  ssize_t ret = lseek64(fd, off, SEEK_SET);
+  if (ret != (ssize_t)off) {
+    // save errno before close() gets a chance to overwrite it, and do not
+    // leak the descriptor on this path
+    int err = errno;
+    std::ostringstream oss;
+    oss << "bufferlist::read_file(" << fn << "): seek error: "
+        << cpp_strerror(err);
+    *error = oss.str();
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return -err;
+  }
+
+  ret = read_fd(fd, len);
+  if (ret < 0) {
+    std::ostringstream oss;
+    oss << "bufferlist::read_file(" << fn << "): read error:"
+        << cpp_strerror(ret);
+    *error = oss.str();
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return ret;
+  } else if (ret != (ssize_t)len) {
+    // Premature EOF.
+    // Perhaps the file changed between stat() and read()?
+    std::ostringstream oss;
+    oss << "bufferlist::read_file(" << fn << "): warning: got premature EOF.";
+    *error = oss.str();
+    // not actually an error, but weird
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  return 0;
+}
+
+/*
+ * append the whole contents of file `fn` to this list.
+ *
+ * returns 0 on success or a negative errno, with a description stored in
+ * *error. reading less than fstat() announced only sets *error; the
+ * return value is still 0.
+ */
+int buffer::list::read_file(const char *fn, std::string *error)
+{
+  const int fd = TEMP_FAILURE_RETRY(::open(fn, O_RDONLY|O_CLOEXEC|O_BINARY));
+  if (fd < 0) {
+    const int open_errno = errno;
+    std::ostringstream msg;
+    msg << "can't open " << fn << ": " << cpp_strerror(open_errno);
+    *error = msg.str();
+    return -open_errno;
+  }
+
+  struct stat st;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&st, 0, sizeof(st));
+  if (::fstat(fd, &st) < 0) {
+    const int stat_errno = errno;
+    std::ostringstream msg;
+    msg << "bufferlist::read_file(" << fn << "): stat error: "
+        << cpp_strerror(stat_errno);
+    *error = msg.str();
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return -stat_errno;
+  }
+
+  const ssize_t nread = read_fd(fd, st.st_size);
+  if (nread < 0) {
+    std::ostringstream msg;
+    msg << "bufferlist::read_file(" << fn << "): read error:"
+        << cpp_strerror(nread);
+    *error = msg.str();
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return nread;
+  }
+  if (nread != st.st_size) {
+    // Premature EOF: perhaps the file changed between stat() and read()?
+    // this is not actually an error, but weird enough to be reported.
+    std::ostringstream msg;
+    msg << "bufferlist::read_file(" << fn << "): warning: got premature EOF.";
+    *error = msg.str();
+  }
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  return 0;
+}
+
+// read up to `len` bytes from `fd` via safe_read() into a new buffer and
+// append it. returns what safe_read() returned; nothing is appended when
+// that value is negative.
+ssize_t buffer::list::read_fd(int fd, size_t len)
+{
+  auto node = ptr_node::create(buffer::create(len));
+  const ssize_t nread = safe_read(fd, (void*)node->c_str(), len);
+  if (nread < 0) {
+    return nread;
+  }
+  node->set_length(nread);
+  push_back(std::move(node));
+  return nread;
+}
+
+// receive up to `len` bytes from `fd` via safe_recv() into a new buffer
+// and append it. returns what safe_recv() returned; nothing is appended
+// when that value is negative.
+ssize_t buffer::list::recv_fd(int fd, size_t len)
+{
+  auto node = ptr_node::create(buffer::create(len));
+  const ssize_t nrecv = safe_recv(fd, (void*)node->c_str(), len);
+  if (nrecv < 0) {
+    return nrecv;
+  }
+  node->set_length(nrecv);
+  push_back(std::move(node));
+  return nrecv;
+}
+
+/*
+ * create or truncate file `fn` with permission bits `mode` and write the
+ * whole list to it. failures are reported on cerr.
+ * returns 0 on success, otherwise a negative errno or the error returned
+ * by write_fd().
+ */
+int buffer::list::write_file(const char *fn, int mode)
+{
+  const int fd = TEMP_FAILURE_RETRY(
+    ::open(fn, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC|O_BINARY, mode));
+  if (fd < 0) {
+    const int open_errno = errno;
+    cerr << "bufferlist::write_file(" << fn << "): failed to open file: "
+         << cpp_strerror(open_errno) << std::endl;
+    return -open_errno;
+  }
+
+  if (const int rc = write_fd(fd); rc != 0) {
+    cerr << "bufferlist::write_fd(" << fn << "): write_fd error: "
+         << cpp_strerror(rc) << std::endl;
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return rc;
+  }
+
+  if (TEMP_FAILURE_RETRY(::close(fd)) != 0) {
+    const int close_errno = errno;
+    cerr << "bufferlist::write_file(" << fn << "): close error: "
+         << cpp_strerror(close_errno) << std::endl;
+    return -close_errno;
+  }
+  return 0;
+}
+
+/*
+ * write `bytes` bytes, described by the `veclen` entries of `vec`, to `fd`
+ * starting at file offset `offset`. short writes are continued and EINTR
+ * is retried; `vec` is modified while doing so.
+ *
+ * `bytes` is 64 bit wide: callers accumulate the batch size in a ssize_t,
+ * which a 32 bit parameter would silently truncate for large batches.
+ *
+ * returns 0 on success or a negative errno.
+ */
+static int do_writev(int fd, struct iovec *vec, uint64_t offset, unsigned veclen, uint64_t bytes)
+{
+  while (bytes > 0) {
+    ssize_t r = 0;
+#ifdef HAVE_PWRITEV
+    r = ::pwritev(fd, vec, veclen, offset);
+#else
+    r = ::lseek64(fd, offset, SEEK_SET);
+    if (r != offset) {
+      return -errno;
+    }
+    r = ::writev(fd, vec, veclen);
+#endif
+    if (r < 0) {
+      if (errno == EINTR)
+        continue;
+      return -errno;
+    }
+
+    bytes -= r;
+    offset += r;
+    if (bytes == 0) break;
+
+    // short write: advance vec past the r bytes that did get written
+    while (r > 0) {
+      if (vec[0].iov_len <= (size_t)r) {
+        // drain this whole item
+        r -= vec[0].iov_len;
+        ++vec;
+        --veclen;
+      } else {
+        // partially written item: continue in the middle of it
+        vec[0].iov_base = (char *)vec[0].iov_base + r;
+        vec[0].iov_len -= r;
+        break;
+      }
+    }
+  }
+  return 0;
+}
+
+#ifndef _WIN32
+// write the whole list to `fd` with writev(), passing at most IOV_MAX
+// segments per call. zero-length nodes are skipped. EINTR and short writes
+// are retried until the current batch has been written completely.
+// returns 0 on success or the negated errno of a failed writev().
+int buffer::list::write_fd(int fd) const
+{
+  // use writev!
+  iovec iov[IOV_MAX];
+  int iovlen = 0;
+  ssize_t bytes = 0;
+
+  auto p = std::cbegin(_buffers);
+  while (p != std::cend(_buffers)) {
+    if (p->length() > 0) {
+      iov[iovlen].iov_base = (void *)p->c_str();
+      iov[iovlen].iov_len = p->length();
+      bytes += p->length();
+      iovlen++;
+    }
+    ++p;
+
+    // flush when the batch is full or the last node has been collected
+    if (iovlen == IOV_MAX ||
+	p == _buffers.end()) {
+      iovec *start = iov;
+      int num = iovlen;
+      ssize_t wrote;
+    retry:
+      wrote = ::writev(fd, start, num);
+      if (wrote < 0) {
+	int err = errno;
+	if (err == EINTR)
+	  goto retry;
+	return -err;
+      }
+      if (wrote < bytes) {
+	// partial write, recover!
+	// first skip the entries that were written completely ...
+	while ((size_t)wrote >= start[0].iov_len) {
+	  wrote -= start[0].iov_len;
+	  bytes -= start[0].iov_len;
+	  start++;
+	  num--;
+	}
+	// ... then step into the entry that was only written in part
+	if (wrote > 0) {
+	  start[0].iov_len -= wrote;
+	  start[0].iov_base = (char *)start[0].iov_base + wrote;
+	  bytes -= wrote;
+	}
+	goto retry;
+      }
+      iovlen = 0;
+      bytes = 0;
+    }
+  }
+  return 0;
+}
+
+// in this (non-Windows) build sending is the same as write_fd(fd).
+int buffer::list::send_fd(int fd) const {
+  const int rc = buffer::list::write_fd(fd);
+  return rc;
+}
+
+// write the whole list to `fd` starting at file offset `offset`, in
+// batches of at most IOV_MAX nodes handed to do_writev().
+// returns 0 on success or the negative value returned by do_writev().
+int buffer::list::write_fd(int fd, uint64_t offset) const
+{
+  iovec batch[IOV_MAX];
+
+  auto node = std::cbegin(_buffers);
+  for (uint64_t remaining = get_num_buffers(); remaining != 0; ) {
+    const uint64_t batch_nodes = std::min<uint64_t>(remaining, IOV_MAX);
+    remaining -= batch_nodes;
+
+    ssize_t batch_bytes = 0;
+    unsigned filled = 0;
+    for (; filled < batch_nodes; ++filled, ++node) {
+      batch[filled].iov_base = (void *)node->c_str();
+      batch[filled].iov_len = node->length();
+      batch_bytes += node->length();
+    }
+
+    const int rc = do_writev(fd, batch, offset, filled, batch_bytes);
+    if (rc < 0) {
+      return rc;
+    }
+    offset += batch_bytes;   // the next batch continues right behind
+  }
+  return 0;
+}
+#else
+// Windows variant: write the list to `fd` node by node with write().
+// returns 0 on success or a negative errno.
+int buffer::list::write_fd(int fd) const
+{
+  // There's no writev on Windows. WriteFileGather may be an option,
+  // but it has strict requirements in terms of buffer size and alignment.
+  auto p = std::cbegin(_buffers);
+  uint64_t left_pbrs = get_num_buffers();
+  while (left_pbrs) {
+    unsigned written = 0;
+    while (written < p->length()) {
+      // continue right after what has been written so far; restarting at
+      // the beginning of the node would duplicate data after a short write
+      int r = ::write(fd, p->c_str() + written, p->length() - written);
+      if (r < 0)
+        return -errno;
+
+      written += r;
+    }
+
+    left_pbrs--;
+    p++;
+  }
+
+  return 0;
+}
+
+// Windows variant: send the list over `fd` node by node with send().
+// returns 0 on success or the negated ceph_sock_errno().
+int buffer::list::send_fd(int fd) const
+{
+  // There's no writev on Windows. WriteFileGather may be an option,
+  // but it has strict requirements in terms of buffer size and alignment.
+  auto p = std::cbegin(_buffers);
+  uint64_t left_pbrs = get_num_buffers();
+  while (left_pbrs) {
+    unsigned written = 0;
+    while (written < p->length()) {
+      // continue right after what has been sent so far; restarting at the
+      // beginning of the node would duplicate data after a short send
+      int r = ::send(fd, p->c_str() + written, p->length() - written, 0);
+      if (r < 0)
+        return -ceph_sock_errno();
+
+      written += r;
+    }
+
+    left_pbrs--;
+    p++;
+  }
+
+  return 0;
+}
+
+// Windows variant: seek to `offset`, then write the whole list.
+// returns 0 on success or a negative errno.
+int buffer::list::write_fd(int fd, uint64_t offset) const
+{
+  // keep the full width of the result: squeezing it into an int breaks
+  // the comparison below for offsets of 2 GiB and beyond
+  const auto pos = ::lseek64(fd, offset, SEEK_SET);
+  if (pos < 0 || static_cast<uint64_t>(pos) != offset)
+    return -errno;
+
+  return write_fd(fd);
+}
+#endif
+
+/*
+ * describe every node of the list as an iovec, split into groups of at
+ * most IOV_MAX entries. each group records the byte offset (counted from
+ * the start of the list) at which it begins and the number of bytes it
+ * covers.
+ */
+buffer::list::iov_vec_t buffer::list::prepare_iovs() const
+{
+  size_t index = 0;
+  uint64_t off = 0;
+  // _num / IOV_MAX + 1 groups: when _num is an exact multiple of IOV_MAX
+  // (including 0) the last group is never filled by the loop below
+  iov_vec_t iovs{_num / IOV_MAX + 1};
+  auto it = iovs.begin();
+  for (auto& bp : _buffers) {
+    if (index == 0) {
+      // first entry of a group: set it up and size its iovec array
+      it->offset = off;
+      it->length = 0;
+      size_t nr_iov_created = std::distance(iovs.begin(), it);
+      it->iov.resize(
+	std::min(_num - IOV_MAX * nr_iov_created, (size_t)IOV_MAX));
+    }
+    it->iov[index].iov_base = (void*)bp.c_str();
+    it->iov[index].iov_len = bp.length();
+    off += bp.length();
+    it->length += bp.length();
+    if (++index == IOV_MAX) {
+      // continue with a new vector<iov> if we have more buf
+      ++it;
+      index = 0;
+    }
+  }
+  return iovs;
+}
+
+/*
+ * compute the crc32c over all non-empty nodes of the list, starting from
+ * the initial value `crc`.
+ *
+ * results are cached per raw (get_crc()/set_crc()), keyed by the
+ * [offset, offset + length) range of the node within its raw. a cached
+ * value is used directly when it was computed for the same initial value
+ * and is adjusted otherwise; only a cache miss reads the data.
+ * when buffer_track_crc is set, the hit/adjust/miss counts are added to
+ * the corresponding global counters.
+ */
+__u32 buffer::list::crc32c(__u32 crc) const
+{
+  int cache_misses = 0;
+  int cache_hits = 0;
+  int cache_adjusts = 0;
+
+  for (const auto& node : _buffers) {
+    if (node.length()) {
+      raw* const r = node._raw;
+      pair<size_t, size_t> ofs(node.offset(), node.offset() + node.length());
+      // on a cache hit: first = initial value used, second = resulting crc
+      pair<uint32_t, uint32_t> ccrc;
+      if (r->get_crc(ofs, &ccrc)) {
+	if (ccrc.first == crc) {
+	  // got it already
+	  crc = ccrc.second;
+	  cache_hits++;
+	} else {
+	  /* If we have cached crc32c(buf, v) for initial value v,
+	   * we can convert this to a different initial value v' by:
+	   * crc32c(buf, v') = crc32c(buf, v) ^ adjustment
+	   * where adjustment = crc32c(0*len(buf), v ^ v')
+	   *
+	   * http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
+	   * note, u for our crc32c implementation is 0
+	   */
+	  crc = ccrc.second ^ ceph_crc32c(ccrc.first ^ crc, NULL, node.length());
+	  cache_adjusts++;
+	}
+      } else {
+	cache_misses++;
+	uint32_t base = crc;
+	crc = ceph_crc32c(crc, (unsigned char*)node.c_str(), node.length());
+	// remember the result together with the initial value it belongs to
+	r->set_crc(ofs, make_pair(base, crc));
+      }
+    }
+  }
+
+  if (buffer_track_crc) {
+    if (cache_adjusts)
+      buffer_cached_crc_adjusted += cache_adjusts;
+    if (cache_hits)
+      buffer_cached_crc += cache_hits;
+    if (cache_misses)
+      buffer_missed_crc += cache_misses;
+  }
+
+  return crc;
+}
+
+// call invalidate_crc() on the raw of every node that has one.
+void buffer::list::invalidate_crc()
+{
+  for (const auto& node : _buffers) {
+    raw* const r = node._raw;
+    if (r == nullptr) {
+      continue;
+    }
+    r->invalidate_crc();
+  }
+}
+
+/**
+ * Write the raw bytes of every non-empty node to a C++ stream
+ */
+void buffer::list::write_stream(std::ostream &out) const
+{
+  for (const auto& node : _buffers) {
+    if (node.length() == 0) {
+      continue;
+    }
+    out.write(node.c_str(), node.length());
+  }
+}
+
+
+/*
+ * dump the contents to `out` in the style of `hexdump -C`: 16 bytes per
+ * row, a hexadecimal offset in front and a printable rendition behind.
+ * consecutive rows that consist of one and the same byte value are
+ * collapsed into a single "*" line. with trailing_newline set, the total
+ * length (in hex) and a newline are printed at the end.
+ *
+ * nothing is printed for an empty list. the format flags and the fill
+ * character of `out` are restored before returning.
+ */
+void buffer::list::hexdump(std::ostream &out, bool trailing_newline) const
+{
+  if (!length())
+    return;
+
+  std::ios_base::fmtflags original_flags = out.flags();
+
+  // do our best to match the output of hexdump -C, for better
+  // diff'ing!
+
+  out.setf(std::ios::right);
+  // the fill character is not part of flags(), remember it separately
+  const char original_fill = out.fill('0');
+
+  unsigned per = 16;
+  char last_row_char = '\0';
+  bool was_same = false, did_star = false;
+  for (unsigned o=0; o<length(); o += per) {
+    if (o == 0) {
+      last_row_char = (*this)[o];
+    }
+
+    // the last row is always printed, so only full inner rows are checked
+    if (o + per < length()) {
+      bool row_is_same = true;
+      for (unsigned i=0; i<per && o+i<length(); i++) {
+        char current_char = (*this)[o+i];
+        if (current_char != last_row_char) {
+          if (i == 0) {
+            // a row starting with a new value begins a new run
+            last_row_char = current_char;
+            was_same = false;
+            did_star = false;
+          } else {
+            row_is_same = false;
+          }
+        }
+      }
+      if (row_is_same) {
+        if (was_same) {
+          // repetition of the previous row: print "*" once, then skip
+          if (!did_star) {
+            out << "\n*";
+            did_star = true;
+          }
+          continue;
+        }
+        was_same = true;
+      } else {
+        was_same = false;
+        did_star = false;
+      }
+    }
+    if (o)
+      out << "\n";
+    out << std::hex << std::setw(8) << o << " ";
+
+    unsigned i;
+    for (i=0; i<per && o+i<length(); i++) {
+      if (i == 8)
+        out << ' ';
+      out << " " << std::setw(2) << ((unsigned)(*this)[o+i] & 0xff);
+    }
+    // pad a short last row so that the text column stays aligned
+    for (; i<per; i++) {
+      if (i == 8)
+        out << ' ';
+      out << "   ";
+    }
+
+    out << "  |";
+    for (i=0; i<per && o+i<length(); i++) {
+      char c = (*this)[o+i];
+      // the <cctype> predicates require a value representable as unsigned
+      // char; handing them a negative char is undefined behaviour
+      const unsigned char uc = static_cast<unsigned char>(c);
+      if (isupper(uc) || islower(uc) || isdigit(uc) || c == ' ' || ispunct(uc))
+        out << c;
+      else
+        out << '.';
+    }
+    out << '|' << std::dec;
+  }
+  if (trailing_newline) {
+    out << "\n" << std::hex << std::setw(8) << length();
+    out << "\n";
+  }
+
+  out.fill(original_fill);
+  out.flags(original_flags);
+}
+
+
+// build a list with one node created from create_static(l, c).
+buffer::list buffer::list::static_from_mem(char* c, size_t l) {
+  list wrapped;
+  auto node = ptr_node::create(create_static(l, c));
+  wrapped.push_back(std::move(node));
+  return wrapped;
+}
+
+// like static_from_mem(), with the length taken from strlen(c).
+buffer::list buffer::list::static_from_cstring(char* c) {
+  const size_t n = std::strlen(c);
+  return static_from_mem(c, n);
+}
+
+// like static_from_mem(), over the characters of `s`.
+buffer::list buffer::list::static_from_string(string& s) {
+  // C++14 string::data() on a non-const string still returns a pointer to
+  // const, hence the cast. (that buffer::list mostly does not work in a
+  // sane way with const is generally sad.)
+  char* const bytes = const_cast<char*>(s.data());
+  return static_from_mem(bytes, s.length());
+}
+
+// buffer::raw is not a standard layout type.
+#define BUF_OFFSETOF(type, field)					\
+  (reinterpret_cast<std::uintptr_t>(&(((type*)1024)->field)) - 1024u)
+
+// check whether `delete_this` lives inside the bptr_storage of its own raw
+// ("hypercombined"). if so run its destructor and return true, otherwise
+// return false and leave it alone.
+bool buffer::ptr_node::dispose_if_hypercombined(
+  buffer::ptr_node* const delete_this)
+{
+  // plain integer arithmetic, in case _raw is nullptr
+  const auto raw_addr = reinterpret_cast<std::uintptr_t>(delete_this->_raw);
+  const std::uintptr_t storage_addr =
+    raw_addr + BUF_OFFSETOF(buffer::raw, bptr_storage);
+
+  if (reinterpret_cast<std::uintptr_t>(delete_this) != storage_addr) {
+    return false;
+  }
+  ceph_assert_always("hypercombining is currently disabled" == nullptr);
+  delete_this->~ptr_node();
+  return true;
+}
+
+// create a ptr_node owning `r`.
+// FIXME: buffers are currently not hypercombined because of crashes
+// observed in the rados suite. once that is fixed, placement new will be
+// used to create the ptr_node on buffer::raw::bptr_storage.
+std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>
+buffer::ptr_node::create_hypercombined(ceph::unique_leakable_ptr<buffer::raw> r)
+{
+  using node_ptr_t =
+    std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>;
+  return node_ptr_t(new ptr_node(std::move(r)));
+}
+
+// return a heap allocated copy of `clone_this`.
+buffer::ptr_node* buffer::ptr_node::cloner::operator()(
+  const buffer::ptr_node& clone_this)
+{
+  auto* const copy = new ptr_node(clone_this);
+  return copy;
+}
+
+// print data address, length and reference count of a raw.
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::raw &r) {
+  out << "buffer::raw(" << (void*)r.get_data();
+  out << " len " << r.get_len();
+  out << " nref " << r.nref.load() << ")";
+  return out;
+}
+
+// print offset~length of a ptr, plus details of its raw when it has one.
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::ptr& bp) {
+  if (!bp.have_raw()) {
+    out << "buffer:ptr(" << bp.offset() << "~" << bp.length() << " no raw)";
+    return out;
+  }
+  out << "buffer::ptr(" << bp.offset() << "~" << bp.length();
+  out << " " << (void*)bp.c_str();
+  out << " in raw " << (void*)bp.raw_c_str();
+  out << " len " << bp.raw_length();
+  out << " nref " << bp.raw_nref() << ")";
+  return out;
+}
+
+// print the list length followed by one line per node.
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::list& bl) {
+  out << "buffer::list(len=" << bl.length() << ",\n";
+
+  for (const auto& node : bl.buffers()) {
+    out << "\t" << node;
+    const bool is_last = (&node == &bl.buffers().back());
+    if (!is_last) {
+      out << ",\n";   // separator between nodes, none after the last
+    }
+  }
+  out << "\n)";
+  return out;
+}
+
+// one MEMPOOL_DEFINE_OBJECT_FACTORY per raw buffer type, all with
+// buffer_meta as last argument.
+// NOTE(review): presumably this accounts these objects to the buffer_meta
+// mempool -- confirm against the macro definition.
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_malloc, buffer_raw_malloc,
+			      buffer_meta);
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_posix_aligned,
+			      buffer_raw_posix_aligned, buffer_meta);
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_claimed_char, buffer_raw_claimed_char,
+			      buffer_meta);
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_static, buffer_raw_static,
+			      buffer_meta);
+
+
+// push a new, still empty, page aligned buffer to the back of bl. its size
+// is the larger of min_alloc and `len` after shift_round_up() with
+// CEPH_PAGE_SHIFT.
+void ceph::buffer::list::page_aligned_appender::_refill(size_t len) {
+  const auto rounded_len =
+    shift_round_up(static_cast<unsigned>(len),
+                   static_cast<unsigned>(CEPH_PAGE_SHIFT));
+  const unsigned alloc = std::max(min_alloc, rounded_len);
+
+  auto fresh = ptr_node::create(buffer::create_page_aligned(alloc));
+  fresh->set_length(0);   // unused, so far.
+  bl.push_back(std::move(fresh));
+}
+
+namespace ceph::buffer {
+inline namespace v15_2_0 {
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+// error category of the buffer errors (ceph::buffer::errc). besides the
+// usual name/message it supplies the mapping to generic error conditions
+// and, via from_code(), to negative errno values.
+class buffer_error_category : public ceph::converting_category {
+public:
+  buffer_error_category(){}
+  // name of the category
+  const char* name() const noexcept override;
+  // message for ev without allocating; the buffer arguments are unused
+  const char* message(int ev, char*, std::size_t) const noexcept override;
+  std::string message(int ev) const override;
+  boost::system::error_condition default_error_condition(int ev) const noexcept
+    override;
+  // keep the overloads of the base class visible
+  using ceph::converting_category::equivalent;
+  bool equivalent(int ev, const boost::system::error_condition& c) const
+    noexcept override;
+  // negative errno value corresponding to ev
+  int from_code(int ev) const noexcept override;
+};
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
+
+// the category is called "buffer".
+const char* buffer_error_category::name() const noexcept {
+  const char* const category_name = "buffer";
+  return category_name;
+}
+
+const char*
+buffer_error_category::message(int ev, char*, std::size_t) const noexcept {
+  using ceph::buffer::errc;
+  if (ev == 0)
+    return "No error";
+
+  switch (static_cast<errc>(ev)) {
+  case errc::bad_alloc:
+    return "Bad allocation";
+
+  case errc::end_of_buffer:
+    return "End of buffer";
+
+  case errc::malformed_input:
+    return "Malformed input";
+  }
+
+  return "Unknown error";
+}
+
+// std::string flavour of the buffer-less overload.
+std::string buffer_error_category::message(int ev) const {
+  const char* const text = message(ev, nullptr, 0);
+  return text;
+}
+
+// bad_alloc maps to not_enough_memory, end_of_buffer and malformed_input
+// map to io_error; anything else stays a condition of this category.
+boost::system::error_condition
+buffer_error_category::default_error_condition(int ev)const noexcept {
+  using ceph::buffer::errc;
+  const auto code = static_cast<errc>(ev);
+  if (code == errc::bad_alloc) {
+    return boost::system::errc::not_enough_memory;
+  }
+  if (code == errc::end_of_buffer || code == errc::malformed_input) {
+    return boost::system::errc::io_error;
+  }
+  return { ev, *this };
+}
+
+bool buffer_error_category::equivalent(int ev, const boost::system::error_condition& c) const noexcept {
+  return default_error_condition(ev) == c;
+}
+
+int buffer_error_category::from_code(int ev) const noexcept {
+  using ceph::buffer::errc;
+  switch (static_cast<errc>(ev)) {
+  case errc::bad_alloc:
+    return -ENOMEM;
+
+  case errc::end_of_buffer:
+    return -EIO;
+
+  case errc::malformed_input:
+    return -EIO;
+  }
+  return -EDOM;
+}
+
+// accessor for the single buffer_error_category instance.
+const boost::system::error_category& buffer_category() noexcept {
+  static const buffer_error_category instance;
+  return instance;
+}
+}
+}
diff --git a/src/common/buffer_instrumentation.h b/src/common/buffer_instrumentation.h
new file mode 100644
index 000000000..fa1e84307
--- /dev/null
+++ b/src/common/buffer_instrumentation.h
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/buffer.h"
+#include "include/buffer_raw.h"
+
+namespace ceph::buffer_instrumentation {
+
+// this is nothing more than an intermediary for a class hierarchy which
+// can be placed between a user's custom raw and the `ceph::buffer::raw` to
+// detect whether a given `ceph::buffer::ptr` instance wraps a particular
+// raw's implementation (via `dynamic_cast` or `typeid`).
+//
+// users are supposed to define a marker type (e.g. `class my_marker{}`)
+// and derive their custom raw from `instrumented_raw<my_marker>`.
+template <class MarkerT>
+struct instrumented_raw : public ceph::buffer::raw {
+  // inherit every constructor of ceph::buffer::raw unchanged
+  using raw::raw;
+};
+
+// A `ceph::buffer::ptr` subclass that exposes the underlying raw so that
+// its dynamic type can be inspected.
+struct instrumented_bptr : public ceph::buffer::ptr {
+  // Returns the `_raw` member inherited from `ceph::buffer::ptr`.
+  const ceph::buffer::raw* get_raw() const {
+    return _raw;
+  }
+
+  // True when the wrapped raw is (or derives from)
+  // `instrumented_raw<MarkerT>`; false otherwise, including when
+  // get_raw() returns a null pointer (dynamic_cast of null is null).
+  template <class MarkerT>
+  bool is_raw_marked() const {
+    return dynamic_cast<const instrumented_raw<MarkerT>*>(get_raw()) != nullptr;
+  }
+};
+
+} // namespace ceph::buffer_instrumentation
diff --git a/src/common/buffer_seastar.cc b/src/common/buffer_seastar.cc
new file mode 100644
index 000000000..bc529c937
--- /dev/null
+++ b/src/common/buffer_seastar.cc
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <seastar/core/sharded.hh>
+#include <seastar/net/packet.hh>
+
+#include "include/buffer_raw.h"
+#include "buffer_seastar.h"
+
+using temporary_buffer = seastar::temporary_buffer<char>;
+
+namespace ceph::buffer {
+
+// A buffer::raw whose storage is a seastar temporary_buffer held through a
+// seastar::foreign_ptr.
+class raw_seastar_foreign_ptr : public raw {
+  seastar::foreign_ptr<temporary_buffer> ptr;
+ public:
+  // The base class is initialised (from buf's data pointer and size) before
+  // the member, so buf is still intact when it is read and only afterwards
+  // moved into `ptr`.
+  raw_seastar_foreign_ptr(temporary_buffer&& buf)
+    : raw(buf.get_write(), buf.size()), ptr(std::move(buf)) {}
+};
+
+// A buffer::raw whose storage is a seastar temporary_buffer owned directly
+// (no foreign_ptr indirection).
+class raw_seastar_local_ptr : public raw {
+  temporary_buffer buf;
+ public:
+  // Inside the initializer list the name `buf` in the argument expressions
+  // refers to the constructor parameter; the base class reads its data
+  // pointer and size before the parameter is moved into the member.
+  raw_seastar_local_ptr(temporary_buffer&& buf)
+    : raw(buf.get_write(), buf.size()), buf(std::move(buf)) {}
+};
+
+inline namespace v15_2_0 {
+
+// Takes ownership of `buf` and wraps it in a raw_seastar_foreign_ptr.
+ceph::unique_leakable_ptr<buffer::raw> create(temporary_buffer&& buf) {
+  return ceph::unique_leakable_ptr<buffer::raw>(
+    new raw_seastar_foreign_ptr(std::move(buf)));
+}
+
+// Takes ownership of `buf` and wraps it in a raw_seastar_local_ptr (the
+// variant that stores the temporary_buffer directly).
+ceph::unique_leakable_ptr<buffer::raw> create_local(temporary_buffer&& buf) {
+  return ceph::unique_leakable_ptr<buffer::raw>(
+    new raw_seastar_local_ptr(std::move(buf)));
+}
+
+} // inline namespace v15_2_0
+
+// buffer::ptr conversions
+
+// Lvalue conversion: builds a temporary_buffer over c_str() / _len whose
+// deleter is created from *this via seastar::make_object_deleter.
+// NOTE(review): presumably the deleter keeps its own ptr (and thereby the
+// underlying raw) alive for the lifetime of the temporary_buffer - confirm
+// against make_object_deleter.
+ptr::operator seastar::temporary_buffer<char>() &
+{
+  return {c_str(), _len, seastar::make_object_deleter(*this)};
+}
+
+// Rvalue conversion: same as the lvalue form, but *this is moved into the
+// deleter instead of being copied.
+ptr::operator seastar::temporary_buffer<char>() &&
+{
+  // read the data pointer and length into locals before *this is handed
+  // to the deleter via std::move
+  auto data = c_str();
+  auto length = _len;
+  return {data, length, seastar::make_object_deleter(std::move(*this))};
+}
+
+// buffer::list conversions
+
+// Rvalue conversion of a buffer list into a seastar packet: every ptr in
+// _buffers is moved into the packet as one fragment, then the list is
+// cleared.
+list::operator seastar::net::packet() &&
+{
+  // the packet is constructed with _num as its argument
+  // NOTE(review): presumably a fragment-count hint - confirm against the
+  // seastar::net::packet constructors
+  seastar::net::packet p(_num);
+  for (auto& ptr : _buffers) {
+    // append each ptr as a temporary_buffer
+    p = seastar::net::packet(std::move(p), std::move(ptr));
+  }
+  clear();
+  return p;
+}
+
+} // namespace ceph::buffer
+
+namespace {
+
+using ceph::buffer::raw;
+// A buffer::raw backed by a share()d view of an existing temporary_buffer;
+// the caller's buffer is left in place (it is taken by lvalue reference and
+// not moved from).
+class raw_seastar_local_shared_ptr : public raw {
+  temporary_buffer buf;
+public:
+  // The raw covers the whole of `buf` (its data pointer and full size).
+  raw_seastar_local_shared_ptr(temporary_buffer& buf)
+    : raw(buf.get_write(), buf.size()), buf(buf.share()) {}
+};
+}
+
+// Returns a buffer::ptr of length `len` backed by a shared view of the
+// iterator's underlying temporary_buffer (no bytes are copied here).
+//
+// NOTE(review): the raw is built over the whole of `buf` and only the
+// length is adjusted; this function neither reads nor advances the
+// iterator's current position. Confirm that callers expect the ptr to
+// start at the beginning of the buffer rather than at the cursor.
+buffer::ptr seastar_buffer_iterator::get_ptr(size_t len)
+{
+  buffer::ptr p{ceph::unique_leakable_ptr<buffer::raw>(
+    new raw_seastar_local_shared_ptr{buf})};
+  p.set_length(len);
+  return p;
+}
+
+// Returns a buffer::ptr holding a copy of the next `len` bytes and advances
+// the cursor past them. get_pos_add() throws buffer::end_of_buffer when
+// fewer than `len` bytes remain.
+buffer::ptr const_seastar_buffer_iterator::get_ptr(size_t len)
+{
+  return buffer::ptr{ buffer::copy(get_pos_add(len), len) };
+}
diff --git a/src/common/buffer_seastar.h b/src/common/buffer_seastar.h
new file mode 100644
index 000000000..70a7b9325
--- /dev/null
+++ b/src/common/buffer_seastar.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/temporary_buffer.hh>
+#include "include/buffer.h"
+#include "common/error_code.h"
+
+namespace details {
+
+// Minimal forward cursor over a contiguous character range [first, last).
+//
+// `is_const` selects whether the cursor hands out `const char*` or `char*`.
+template<bool is_const>
+class buffer_iterator_impl {
+public:
+  using pointer = std::conditional_t<is_const, const char*, char *>;
+  // first: start of the range; last: one past its final byte.
+  buffer_iterator_impl(pointer first, const char* last)
+    : pos(first), end_ptr(last)
+  {}
+  // Returns the current position and advances the cursor by `n` bytes.
+  //
+  // Throws buffer::end_of_buffer when fewer than `n` bytes remain. The
+  // bounds check happens *before* the cursor is moved, so no pointer beyond
+  // end_ptr is ever formed and a failed call leaves the cursor untouched.
+  pointer get_pos_add(size_t n) {
+    if (pos > end_ptr || n > static_cast<size_t>(end_ptr - pos)) {
+      throw buffer::end_of_buffer{};
+    }
+    auto r = pos;
+    pos += n;
+    return r;
+  }
+  // Current cursor position (not advanced).
+  pointer get() const {
+    return pos;
+  }
+protected:
+  pointer pos;          // next byte to hand out
+  const char* end_ptr;  // one past the last valid byte
+};
+} // namespace details
+
+// Mutable cursor over a seastar temporary_buffer. Only pointer,
+// get_pos_add() and get() from the (privately inherited) implementation are
+// re-exported, plus get_ptr().
+class seastar_buffer_iterator : details::buffer_iterator_impl<false> {
+  using parent = details::buffer_iterator_impl<false>;
+  using temporary_buffer = seastar::temporary_buffer<char>;
+public:
+  // Iterates from b's writable data pointer up to b.end(). `b` is kept by
+  // reference, so it must outlive this iterator.
+  seastar_buffer_iterator(temporary_buffer& b)
+    : parent(b.get_write(), b.end()), buf(b)
+  {}
+  using parent::pointer;
+  using parent::get_pos_add;
+  using parent::get;
+  // Defined in buffer_seastar.cc.
+  ceph::buffer::ptr get_ptr(size_t len);
+
+private:
+  // keep the reference to buf around, so it can be "shared" by get_ptr()
+  temporary_buffer& buf;
+};
+
+// Read-only cursor over a seastar temporary_buffer. Unlike
+// seastar_buffer_iterator it does not retain a reference to the buffer
+// object itself, only pointers into its data.
+class const_seastar_buffer_iterator : details::buffer_iterator_impl<true> {
+  using parent = details::buffer_iterator_impl<true>;
+  using temporary_buffer = seastar::temporary_buffer<char>;
+public:
+  // Iterates from b's data pointer up to b.end(); the data must stay valid
+  // for as long as this iterator is used.
+  const_seastar_buffer_iterator(temporary_buffer& b)
+    : parent(b.get_write(), b.end())
+  {}
+  using parent::pointer;
+  using parent::get_pos_add;
+  using parent::get;
+  // Defined in buffer_seastar.cc.
+  ceph::buffer::ptr get_ptr(size_t len);
+};
diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc
new file mode 100644
index 000000000..9b989fe72
--- /dev/null
+++ b/src/common/ceph_argparse.cc
@@ -0,0 +1,597 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include <stdarg.h>
+
+#include "auth/Auth.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/version.h"
+#include "include/str_list.h"
+
+/*
+ * Ceph argument parsing library
+ *
+ * We probably should eventually replace this with something standard like popt.
+ * Until we do that, though, this file is the place for argv parsing
+ * stuff to live.
+ */
+
+#undef dout
+#undef pdout
+#undef derr
+#undef generic_dout
+#undef dendl
+
+// Helper that defers the choice of strict_strto* parser to the type the
+// result is assigned to: converting the object to float, int or long long
+// invokes strict_strtof, strict_strtol or strict_strtoll respectively
+// (base 10 for the integer forms), passing `err` through to the parser.
+struct strict_str_convert {
+  const char *str;   // text to parse (not owned)
+  std::string *err;  // handed to the strict_strto* call (not owned)
+  strict_str_convert(const char *str,  std::string *err)
+    : str(str), err(err) {}
+
+  inline operator float() const
+  {
+    return strict_strtof(str, err);
+  }
+  inline operator int() const
+  {
+    return strict_strtol(str, 10, err);
+  }
+  inline operator long long() const
+  {
+    return  strict_strtoll(str, 10, err);
+  }
+};
+
+// Splits `argstr` on whitespace and appends every token to `args`.
+// Existing elements of `args` are kept; an empty or all-blank string
+// appends nothing.
+void string_to_vec(std::vector<std::string>& args, std::string argstr)
+{
+  std::istringstream tokens(argstr);
+  // extraction fails (ending the loop) once no further token is available
+  for (std::string word; tokens >> word; ) {
+    args.push_back(word);
+  }
+}
+
+// Splits `args` at the first standalone "--".
+//
+// Returns {options, arguments}: everything before the "--" and everything
+// after it. The "--" itself appears in neither half. When there is no
+// "--", all of `args` ends up in `options` and `arguments` is empty.
+std::pair<std::vector<const char*>, std::vector<const char*>>
+split_dashdash(const std::vector<const char*>& args) {
+  auto sep = args.begin();
+  while (sep != args.end() && strcmp(*sep, "--") != 0) {
+    ++sep;
+  }
+  std::vector<const char*> options(args.begin(), sep);
+  // step over the separator, if one was found
+  const auto rest = (sep == args.end()) ? sep : sep + 1;
+  std::vector<const char*> arguments(rest, args.end());
+  return {std::move(options), std::move(arguments)};
+}
+
+static std::mutex g_str_vec_lock;
+static std::vector<std::string> g_str_vec;
+
+// Empties the cached environment-argument strings (g_str_vec) under
+// g_str_vec_lock.
+void clear_g_str_vec()
+{
+  // scoped guard instead of manual lock()/unlock(): the mutex is released
+  // on every exit path
+  std::lock_guard guard{g_str_vec_lock};
+  g_str_vec.clear();
+}
+
+// Merges arguments taken from an environment variable into `args`.
+//
+// name: environment variable to read; a null pointer selects "CEPH_ARGS".
+//
+// The variable is split on spaces and cached in g_str_vec the first time it
+// is found set; later calls reuse the cache regardless of `name`. If the
+// cache is empty and the variable is unset, `args` is left untouched.
+//
+// Both the environment arguments and `args` are split at "--". The result
+// is: env options, original options, then - only if either side had
+// anything after "--" - a single "--" followed by the env arguments and the
+// original arguments. The pointers stored in `args` for the environment
+// part point into g_str_vec.
+void env_to_vec(std::vector<const char*>& args, const char *name)
+{
+  if (!name)
+    name = "CEPH_ARGS";
+
+  std::vector<const char*> env;
+  {
+    // RAII guard: released on the early return below and if splitting or
+    // push_back throws
+    std::lock_guard guard{g_str_vec_lock};
+    /*
+     * We can only populate str_vec once. Other threads could hold pointers
+     * into it, so clearing it out and replacing it is not currently safe.
+     */
+    if (g_str_vec.empty()) {
+      const char *p = getenv(name);
+      if (!p) {
+	return;
+      }
+      get_str_vec(p, " ", g_str_vec);
+    }
+
+    for (const auto& s : g_str_vec) {
+      env.push_back(s.c_str());
+    }
+  }
+  auto [env_options, env_arguments] = split_dashdash(env);
+
+  auto [options, arguments] = split_dashdash(args);
+  args.clear();
+  args.insert(args.end(), env_options.begin(), env_options.end());
+  args.insert(args.end(), options.begin(), options.end());
+  if (arguments.empty() && env_arguments.empty()) {
+    return;
+  }
+  args.push_back("--");
+  args.insert(args.end(), env_arguments.begin(), env_arguments.end());
+  args.insert(args.end(), arguments.begin(), arguments.end());
+}
+
+// Copies argv[1] .. argv[argc-1] into a vector, i.e. everything except the
+// program name. `argc` must be positive (asserted).
+std::vector<const char*> argv_to_vec(int argc, const char* const * argv)
+{
+  assert(argc > 0);
+  const char* const* const first = argv + 1;
+  const char* const* const last = argv + argc;
+  std::vector<const char*> rest(first, last);
+  return rest;
+}
+
+// Builds a C-style argv array from `argv0` followed by every element of
+// `args`.
+//
+// On return *argc is args.size() + 1 and *argv points to a malloc()ed array
+// holding those *argc pointers followed by a terminating null pointer, as
+// is conventional for argv (so (*argv)[*argc] == nullptr). Only the array
+// itself is allocated; the strings are not copied. The caller releases the
+// array with free().
+//
+// Throws std::bad_alloc if the allocation fails.
+void vec_to_argv(const char *argv0, std::vector<const char*>& args,
+                 int *argc, const char ***argv)
+{
+  // argv0 + every element of args + the trailing null pointer
+  const size_t slots = args.size() + 2;
+  *argv = static_cast<const char**>(malloc(sizeof(char*) * slots));
+  if (!*argv)
+    throw std::bad_alloc();
+  *argc = 1;
+  (*argv)[0] = argv0;
+
+  for (const char *arg : args)
+    (*argv)[(*argc)++] = arg;
+  (*argv)[*argc] = nullptr;
+}
+
+// Classifies the text that follows an option on the command line.
+//
+// *bool_numeric is set when the text consists only of digits, with at most
+// one '.' anywhere and an optional leading '-' (the '-' only counts when the
+// text is at least two characters long). An empty string is reported as
+// numeric.
+// *bool_option is set when the text starts with "--", or starts with '-'
+// and is not numeric.
+//
+// When `nextargstr` is null, neither output is written.
+void ceph_arg_value_type(const char * nextargstr, bool *bool_option, bool *bool_numeric)
+{
+  if (nextargstr == NULL) {
+    return;
+  }
+
+  const size_t len = strlen(nextargstr);
+  bool looks_like_option =
+    (len >= 2) && (nextargstr[0] == '-') && (nextargstr[1] == '-');
+
+  bool numeric = true;
+  bool seen_dot = false;
+  for (size_t idx = 0; idx < len; ++idx) {
+    const char ch = nextargstr[idx];
+    if (ch >= '0' && ch <= '9') {
+      continue;
+    }
+    // a leading '-' may introduce a negative number
+    if (idx == 0 && len >= 2 && ch == '-') {
+      continue;
+    }
+    // a single decimal point is tolerated
+    if (ch == '.' && !seen_dot) {
+      seen_dot = true;
+      continue;
+    }
+    numeric = false;
+    break;
+  }
+
+  // -<option>
+  if (nextargstr[0] == '-' && !numeric) {
+    looks_like_option = true;
+  }
+
+  *bool_option = looks_like_option;
+  *bool_numeric = numeric;
+}
+
+
+// Parses a list of addresses / address vectors from `s` and appends one
+// entity_addrvec_t per parsed element to `vec`.
+//
+// The input is first split on spaces and semicolons. Within each piece,
+// elements are parsed one after another, each optionally followed by a
+// comma; every element is tried as a single entity_addr_t (using `type`)
+// and, failing that, as an entity_addrvec_t.
+//
+// Returns false as soon as an element parses as neither; entries appended
+// before the failure remain in `vec`.
+bool parse_ip_port_vec(const char *s, std::vector<entity_addrvec_t>& vec, int type)
+{
+  // first split by [ ;], which are not valid for an addrvec
+  std::list<std::string> items;
+  get_str_list(s, " ;", items);
+
+  for (auto& item : items) {
+    const char *cur = item.c_str();
+    while (*cur) {
+      const char *end;
+
+      entity_addr_t addr;
+      if (addr.parse(cur, &end, type)) {
+	// a plain address becomes a one-element addrvec
+	vec.push_back(entity_addrvec_t(addr));
+      } else {
+	// not a plain address; it has to be an addrvec then
+	entity_addrvec_t av;
+	if (!av.parse(cur, &end)) {
+	  return false;
+	}
+	vec.push_back(av);
+      }
+
+      // resume after the parsed element and an optional separating comma
+      cur = end;
+      if (*cur == ',') {
+	++cur;
+      }
+    }
+  }
+  return true;
+}
+
+// The defaults for CephInitParameters: remembers the module type and sets
+// the entity name to that type with the id "admin".
+CephInitParameters::CephInitParameters(uint32_t module_type_)
+  : module_type(module_type_)
+{
+  name.set(module_type, "admin");
+}
+
+// Writes a normalised copy of the option string `input` to `output`.
+//
+// The first two characters are copied verbatim (so a leading "-" / "--"
+// survives). After that every '-' becomes '_' up to, but not including, the
+// first '='; the '=' and everything behind it are copied unchanged.
+//
+// `output` must have room for strlen(input) + 1 bytes; the result always has
+// the same length as the input.
+static void dashes_to_underscores(const char *input, char *output)
+{
+  const char *src = input;
+  char *dst = output;
+
+  // first two characters are copied as-is; stop early on a short string
+  for (int copied = 0; copied < 2; ++copied) {
+    *dst = *src++;
+    if (*dst++ == '\0')
+      return;
+  }
+
+  while (*src != '\0' && *src != '=') {
+    *dst++ = (*src == '-') ? '_' : *src;
+    ++src;
+  }
+
+  // copies either the untouched "=value" tail or just the terminating NUL
+  strcpy(dst, src);
+}
+
+/** Handles a standalone double dash, '--'.
+ *
+ * If *i is exactly "--", it is erased from args (i then refers to the
+ * element that followed it) and true is returned, telling the caller to
+ * stop looking for any other options and flags. Otherwise nothing is
+ * changed and false is returned. */
+bool ceph_argparse_double_dash(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i)
+{
+  if (strcmp(*i, "--") != 0) {
+    return false;
+  }
+  i = args.erase(i);
+  return true;
+}
+
+// Tests whether the argument at *i is one of the given flag spellings.
+//
+// The variadic tail is a list of `const char*` spellings terminated by a
+// null `char*`. Comparison is done on dash-normalised copies (see
+// dashes_to_underscores), so "--no-config-file" and "--no_config_file" are
+// the same flag. On a match the argument is erased from `args`, `i` is
+// updated to the following element and true is returned; otherwise nothing
+// is changed and false is returned.
+//
+// NOTE(review): the last named parameter is a reference, and va_start on a
+// reference parameter is formally undefined behaviour; fixing it would
+// change the signature, so it is left as is.
+bool ceph_argparse_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, ...)
+{
+  // heap-backed scratch space instead of a variable-length array: VLAs are
+  // not standard C++ and would put argv-sized allocations on the stack
+  std::vector<char> normalized(strlen(*i) + 1);
+  dashes_to_underscores(*i, normalized.data());
+  const char *first = normalized.data();
+
+  bool matched = false;
+  std::vector<char> candidate;
+  va_list ap;
+  va_start(ap, i);
+  for (const char *a = va_arg(ap, char*); a != NULL; a = va_arg(ap, char*)) {
+    candidate.assign(strlen(a) + 1, '\0');
+    dashes_to_underscores(a, candidate.data());
+    if (strcmp(candidate.data(), first) == 0) {
+      i = args.erase(i);
+      matched = true;
+      break;
+    }
+  }
+  va_end(ap);
+  return matched;
+}
+
+// Interprets `val` as a boolean literal.
+//
+// "true" / "1" store 1 in *ret, "false" / "0" store 0; both return true.
+// Any other text returns false and leaves *ret untouched. The comparison is
+// exact (case-sensitive).
+static bool check_bool_str(const char *val, int *ret)
+{
+  const bool truthy = !strcmp(val, "true") || !strcmp(val, "1");
+  const bool falsy = !strcmp(val, "false") || !strcmp(val, "0");
+  if (!truthy && !falsy) {
+    return false;
+  }
+  *ret = truthy ? 1 : 0;
+  return true;
+}
+
+// Core of ceph_argparse_binary_flag(): matches the argument at *i against
+// the null-terminated list of spellings in `ap` and extracts a boolean.
+//
+// Accepted forms (after dash normalisation of the option name):
+//   --flag=VALUE  VALUE must be true/1/false/0; anything else stores
+//                 -EINVAL in *ret and, if `oss` is non-null, writes a
+//                 diagnostic to it.
+//   --flag VALUE  consumed as a pair only when VALUE does not start with
+//                 '-' and is a valid boolean literal.
+//   --flag        stores 1.
+//
+// Returns true when the argument matched one of the spellings (the
+// consumed elements are erased from `args` and `i` is advanced), false when
+// none matched (nothing is modified).
+static bool va_ceph_argparse_binary_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream *oss, va_list ap)
+{
+  // heap-backed scratch space instead of a variable-length array: VLAs are
+  // not standard C++ and would put argv-sized allocations on the stack
+  std::vector<char> normalized(strlen(*i) + 1);
+  dashes_to_underscores(*i, normalized.data());
+  const char *first = normalized.data();
+
+  std::vector<char> candidate;
+  // does this argument match any of the possibilities?
+  while (1) {
+    const char *a = va_arg(ap, char*);
+    if (a == NULL)
+      return false;
+    const size_t strlen_a = strlen(a);
+    candidate.assign(strlen_a + 1, '\0');
+    dashes_to_underscores(a, candidate.data());
+    const char *a2 = candidate.data();
+    if (strncmp(a2, first, strlen(a2)) == 0) {
+      if (first[strlen_a] == '=') {
+	i = args.erase(i);
+	// `val` points into `normalized`, which outlives the erase above
+	const char *val = first + strlen_a + 1;
+	if (check_bool_str(val, ret)) {
+	  return true;
+	}
+	if (oss) {
+	  (*oss) << "Parse error parsing binary flag  " << a
+	         << ". Expected true or false, but got '" << val << "'\n";
+	}
+	*ret = -EINVAL;
+	return true;
+      }
+      else if (first[strlen_a] == '\0') {
+	auto next = i+1;
+	if (next != args.end() &&
+	    *next &&
+	    (*next)[0] != '-') {
+	  if (check_bool_str(*next, ret)) {
+	    // drop both the flag and its separate value
+	    i = args.erase(i);
+	    i = args.erase(i);
+	    return true;
+	  }
+	}
+	// bare flag: treated as "true"
+	i = args.erase(i);
+	*ret =  1;
+	return true;
+      }
+      // prefix matched but a different, longer option follows: keep looking
+    }
+  }
+}
+
+// Variadic front end for va_ceph_argparse_binary_flag(): the arguments
+// after `oss` are the accepted option spellings, terminated by a null
+// `char*`. See that function for the matching rules and the meaning of
+// *ret and the return value.
+bool ceph_argparse_binary_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream *oss, ...)
+{
+  va_list spellings;
+  va_start(spellings, oss);
+  const bool matched =
+    va_ceph_argparse_binary_flag(args, i, ret, oss, spellings);
+  va_end(spellings);
+  return matched;
+}
+
+// Core of the ceph_argparse_witharg() family: matches the argument at *i
+// against the null-terminated list of spellings in `ap` and extracts the
+// option's value as a string.
+//
+// Accepted forms (after dash normalisation of the option name):
+//   --opt=VALUE   VALUE is everything after the '='.
+//   --opt VALUE   VALUE is the following element of `args`.
+//
+// Returns
+//   1        matched; *ret holds the value and the consumed elements have
+//            been erased from `args`,
+//   0        no spelling matched; nothing was modified,
+//   -EINVAL  matched as the last element without a value; a message is
+//            written to `oss` and the option is erased.
+static int va_ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, va_list ap)
+{
+  // heap-backed scratch space instead of a variable-length array: VLAs are
+  // not standard C++ and would put argv-sized allocations on the stack
+  std::vector<char> normalized(strlen(*i) + 1);
+  dashes_to_underscores(*i, normalized.data());
+  const char *first = normalized.data();
+
+  std::vector<char> candidate;
+  // does this argument match any of the possibilities?
+  while (1) {
+    const char *a = va_arg(ap, char*);
+    if (a == NULL)
+      return 0;
+    const size_t strlen_a = strlen(a);
+    candidate.assign(strlen_a + 1, '\0');
+    dashes_to_underscores(a, candidate.data());
+    const char *a2 = candidate.data();
+    if (strncmp(a2, first, strlen(a2)) == 0) {
+      if (first[strlen_a] == '=') {
+	// the text after '=' is copied verbatim by dashes_to_underscores
+	*ret = first + strlen_a + 1;
+	i = args.erase(i);
+	return 1;
+      }
+      else if (first[strlen_a] == '\0') {
+	// find second part (or not)
+	if (i+1 == args.end()) {
+	  oss << "Option " << *i << " requires an argument." << std::endl;
+	  i = args.erase(i);
+	  return -EINVAL;
+	}
+	i = args.erase(i);
+	*ret = *i;
+	i = args.erase(i);
+	return 1;
+      }
+      // prefix matched but a different, longer option follows: keep looking
+    }
+  }
+}
+
+// Parses an option that takes a numeric value of type T (explicitly
+// instantiated below for int, long long and float).
+//
+// The arguments after `oss` are the accepted spellings, terminated by a
+// null `char*`. Returns false when the argument at *i is not this option.
+// Returns true when it is; in that case problems are reported by writing
+// to `oss`:
+//   - missing value: message from va_ceph_argparse_witharg, *ret untouched;
+//   - value looks like another option or is not numeric: *ret is set to
+//     EXIT_FAILURE and a message is written;
+//   - otherwise *ret receives the converted value and any conversion error
+//     text is written.
+//
+// NOTE(review): `oss` is a reference and va_start on a reference parameter
+// is formally undefined behaviour - confirm this is acceptable on the
+// supported compilers.
+template<class T>
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, T *ret,
+	std::ostream &oss, ...)
+{
+  std::string str;
+  va_list ap;
+  va_start(ap, oss);
+  const int r = va_ceph_argparse_witharg(args, i, &str, oss, ap);
+  va_end(ap);
+  if (r == 0) {
+    return false;   // not our option
+  }
+  if (r < 0) {
+    return true;    // ours, but the value is missing (already reported)
+  }
+
+  bool is_option = false;
+  bool is_numeric = true;
+  ceph_arg_value_type(str.c_str(), &is_option, &is_numeric);
+  if (is_option || !is_numeric) {
+    *ret = EXIT_FAILURE;
+    if (is_option) {
+      oss << "Missing option value";
+    } else {
+      oss << "The option value '" << str << "' is invalid";
+    }
+    return true;
+  }
+
+  std::string err;
+  // the conversion operator chosen by T picks the matching strict parser
+  T myret = strict_str_convert(str.c_str(), &err);
+  *ret = myret;
+  if (!err.empty()) {
+    oss << err;
+  }
+  return true;
+}
+
+template bool ceph_argparse_witharg<int>(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream &oss, ...);
+
+template bool ceph_argparse_witharg<long long>(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, long long *ret,
+	std::ostream &oss, ...);
+
+template bool ceph_argparse_witharg<float>(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, float *ret,
+	std::ostream &oss, ...);
+
+// String-valued option parser that reports problems to `oss`.
+//
+// The arguments after `oss` are the accepted spellings, terminated by a
+// null `char*`. Returns false only when the argument at *i is not this
+// option; a match with a missing value also returns true (the message is
+// written to `oss` and *ret is left untouched).
+//
+// NOTE(review): `oss` is a reference and va_start on a reference parameter
+// is formally undefined behaviour - confirm this is acceptable on the
+// supported compilers.
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, ...)
+{
+  va_list spellings;
+  va_start(spellings, oss);
+  const int outcome = va_ceph_argparse_witharg(args, i, ret, oss, spellings);
+  va_end(spellings);
+  // both "parsed" (1) and "matched but invalid" (<0) count as handled
+  return outcome != 0;
+}
+
+// String-valued option parser that reports problems on std::cerr and
+// terminates the process with _exit(1) when the option is present but has
+// no value.
+//
+// The arguments after `ret` are the accepted spellings, terminated by a
+// null `char*`. Returns true when the option matched (value in *ret),
+// false when the argument at *i is not this option.
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret, ...)
+{
+  va_list spellings;
+  va_start(spellings, ret);
+  const int outcome =
+    va_ceph_argparse_witharg(args, i, ret, std::cerr, spellings);
+  va_end(spellings);
+  if (outcome == 0) {
+    return false;
+  }
+  if (outcome < 0) {
+    _exit(1);
+  }
+  return true;
+}
+
+// Consumes the options that have to be known before the configuration is
+// loaded and returns the resulting CephInitParameters.
+//
+// Recognised options are erased from `args`; everything else is left in
+// place for later parsers. Scanning stops at a standalone "--", which is
+// deliberately kept.
+//
+//   --version / -v      print the version and _exit(0)
+//   --conf / -c FILE    stored in *conf_file_list
+//   --no-config-file    sets no_config_file in the result
+//   --cluster NAME      stored in *cluster
+//   -i ID               sets the id (not accepted for client modules)
+//   --id / --user ID    sets the id
+//   --name / -n T.ID    sets type and id; _exit(1) if it cannot be parsed
+//   --show_args         prints the original argument list to stdout
+CephInitParameters ceph_argparse_early_args
+	  (std::vector<const char*>& args, uint32_t module_type,
+	   std::string *cluster, std::string *conf_file_list)
+{
+  CephInitParameters iparams(module_type);
+  std::string val;
+
+  // snapshot for --show_args, taken before anything is erased
+  auto orig_args = args;
+
+  // no increment in the loop header: the matching helpers erase the current
+  // element and update `i` themselves; only the final else advances
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (strcmp(*i, "--") == 0) {
+      /* Normally we would use ceph_argparse_double_dash. However, in this
+       * function we *don't* want to remove the double dash, because later
+       * argument parses will still need to see it. */
+      break;
+    }
+    else if (ceph_argparse_flag(args, i, "--version", "-v", (char*)NULL)) {
+      std::cout << pretty_version_to_str() << std::endl;
+      _exit(0);
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--conf", "-c", (char*)NULL)) {
+      *conf_file_list = val;
+    }
+    else if (ceph_argparse_flag(args, i, "--no-config-file", (char*)NULL)) {
+      iparams.no_config_file = true;
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--cluster", (char*)NULL)) {
+      *cluster = val;
+    }
+    // "-i" is only tried for non-client module types
+    else if ((module_type != CEPH_ENTITY_TYPE_CLIENT) &&
+	     (ceph_argparse_witharg(args, i, &val, "-i", (char*)NULL))) {
+      iparams.name.set_id(val);
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--id", "--user", (char*)NULL)) {
+      iparams.name.set_id(val);
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--name", "-n", (char*)NULL)) {
+      if (!iparams.name.from_str(val)) {
+	std::cerr << "error parsing '" << val << "': expected string of the form TYPE.ID, "
+		  << "valid types are: " << EntityName::get_valid_types_as_str()
+		  << std::endl;
+	_exit(1);
+      }
+    }
+    else if (ceph_argparse_flag(args, i, "--show_args", (char*)NULL)) {
+      // print the arguments as originally passed, separated by spaces
+      std::cout << "args: ";
+      for (std::vector<const char *>::iterator ci = orig_args.begin(); ci != orig_args.end(); ++ci) {
+	if (ci != orig_args.begin())
+	  std::cout << " ";
+	std::cout << *ci;
+      }
+      std::cout << std::endl;
+    }
+    else {
+      // ignore
+      ++i;
+    }
+  }
+  return iparams;
+}
+
+// Prints the help text for the options common to all Ceph programs to
+// stdout. Servers additionally get the "-i" alias for --id and the
+// -d / -f / --debug_ms lines.
+static void generic_usage(bool is_server)
+{
+  const char *id_line = is_server
+    ? "  --id/-i ID        set ID portion of my name"
+    : "  --id ID           set ID portion of my name";
+
+  std::cout << "  --conf/-c FILE    read configuration from the given configuration file" << std::endl;
+  std::cout << id_line << std::endl;
+  std::cout << "  --name/-n TYPE.ID set name" << std::endl;
+  std::cout << "  --cluster NAME    set cluster name (default: ceph)" << std::endl;
+  std::cout << "  --setuser USER    set uid to user or uid (and gid to user's gid)" << std::endl;
+  std::cout << "  --setgroup GROUP  set gid to group or gid" << std::endl;
+  std::cout << "  --version         show version and quit" << std::endl;
+  std::cout << std::endl;
+
+  if (is_server) {
+    std::cout << "  -d                run in foreground, log to stderr" << std::endl;
+    std::cout << "  -f                run in foreground, log to usual location" << std::endl;
+    std::cout << std::endl;
+    std::cout << "  --debug_ms N      set message debug level (e.g. 1)" << std::endl;
+  }
+
+  std::cout.flush();
+}
+
+// Decides whether the usage text should be shown: true when no arguments
+// were given at all, or when any argument is exactly "-h" or "--help".
+bool ceph_argparse_need_usage(const std::vector<const char*>& args)
+{
+  if (args.empty()) {
+    return true;
+  }
+  return std::any_of(args.begin(), args.end(), [](const char* arg) {
+    return strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0;
+  });
+}
+
+// Prints the common option help in its server (daemon) variant.
+void generic_server_usage()
+{
+  generic_usage(true);
+}
+
+// Prints the common option help in its client variant.
+void generic_client_usage()
+{
+  generic_usage(false);
+}
diff --git a/src/common/ceph_argparse.h b/src/common/ceph_argparse.h
new file mode 100644
index 000000000..d63a2bdd7
--- /dev/null
+++ b/src/common/ceph_argparse.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ARGPARSE_H
+#define CEPH_ARGPARSE_H
+
+/*
+ * Ceph argument parsing library
+ *
+ * We probably should eventually replace this with something standard like popt.
+ * Until we do that, though, this file is the place for argv parsing
+ * stuff to live.
+ */
+
+#include <string>
+#include <vector>
+
+#include "common/entity_name.h"
+#include "include/encoding.h"
+
+/////////////////////// Types ///////////////////////
+// Parameters gathered from the early command-line options (see
+// ceph_argparse_early_args) that are needed to initialise a Ceph process.
+class CephInitParameters
+{
+public:
+  // Defined in ceph_argparse.cc; sets the default entity name for the
+  // given module type.
+  explicit CephInitParameters(uint32_t module_type_);
+
+  uint32_t module_type;         // entity type passed to the constructor
+  EntityName name;              // entity name (type plus id)
+  bool no_config_file = false;  // set by --no-config-file
+
+  // Versioned encoding (ENCODE_START(1, 1, ...)); fields are written in
+  // declaration order.
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(module_type, bl);
+    encode(name, bl);
+    encode(no_config_file, bl);
+    ENCODE_FINISH(bl);
+  }
+  // Counterpart of encode(); reads the fields in the same order.
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(module_type, bl);
+    decode(name, bl);
+    decode(no_config_file, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(CephInitParameters)
+
+/////////////////////// Functions ///////////////////////
+extern void string_to_vec(std::vector<std::string>& args, std::string argstr);
+extern void clear_g_str_vec();
+extern void env_to_vec(std::vector<const char*>& args, const char *name=nullptr);
+extern std::vector<const char*> argv_to_vec(int argc, const char* const * argv);
+extern void vec_to_argv(const char *argv0, std::vector<const char*>& args,
+			int *argc, const char ***argv);
+
+extern bool parse_ip_port_vec(const char *s, std::vector<entity_addrvec_t>& vec,
+			      int type=0);
+bool ceph_argparse_double_dash(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i);
+bool ceph_argparse_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, ...);
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, ...);
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret, ...);
+template<class T>
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, T *ret,
+	std::ostream &oss, ...);
+bool ceph_argparse_binary_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream *oss, ...);
+extern CephInitParameters ceph_argparse_early_args
+	    (std::vector<const char*>& args, uint32_t module_type,
+	     std::string *cluster, std::string *conf_file_list);
+extern bool ceph_argparse_need_usage(const std::vector<const char*>& args);
+extern void generic_server_usage();
+extern void generic_client_usage();
+
+#endif
diff --git a/src/common/ceph_atomic.h b/src/common/ceph_atomic.h
new file mode 100644
index 000000000..4a96ed97e
--- /dev/null
+++ b/src/common/ceph_atomic.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <atomic>
+
+// What and why
+// ============
+//
+// ceph::atomic – thin wrapper to differentiate behavior of atomics.
+//
+// Not all users of the common code truly need costly atomic operations to
+// synchronize data between CPUs and threads. Some, like crimson-osd,
+// stick to a shared-nothing approach. Enforcing the use of atomics in
+// such cases is wasteful – on x86 any locked instruction actually works
+// like a full memory barrier, stalling execution till the CPU's store and
+// load buffers are drained.
+
+#if defined(WITH_SEASTAR) && !defined(WITH_BLUESTORE)
+
+#include <type_traits>
+
+namespace ceph {
+  // Drop-in stand-in for std::atomic<T> that performs plain, unsynchronised
+  // reads and writes. It mirrors the subset of the std::atomic interface
+  // implemented below; every memory_order argument is accepted and ignored.
+  template <class T>
+  class dummy_atomic {
+    T value;
+
+  public:
+    dummy_atomic() = default;
+    dummy_atomic(const dummy_atomic&) = delete;
+    // intentionally implicit, like std::atomic's converting constructor
+    dummy_atomic(T value) : value(std::move(value)) {
+    }
+    bool is_lock_free() const noexcept {
+      return true;
+    }
+    // The memory order defaults to seq_cst as in std::atomic::store, so
+    // that `a.store(x)` compiles against either implementation.
+    void store(T desired,
+	       std::memory_order = std::memory_order_seq_cst) noexcept {
+      value = std::move(desired);
+    }
+    T load(std::memory_order = std::memory_order_seq_cst) const noexcept {
+      return value;
+    }
+    // Returns the stored value by copy (not a reference to *this), matching
+    // std::atomic's assignment operator.
+    T operator=(T desired) noexcept {
+      value = std::move(desired);
+      return value;
+    }
+    operator T() const noexcept {
+      return value;
+    }
+
+    // We need to differentiate with SFINAE as std::atomic offers beefier
+    // interface for integral types.
+
+    template<class TT=T>
+    std::enable_if_t<!std::is_enum_v<TT> && std::is_integral_v<TT>, TT>  operator++() {
+      return ++value;
+    }
+    template<class TT=T>
+    std::enable_if_t<!std::is_enum_v<TT> && std::is_integral_v<TT>, TT> operator++(int) {
+      return value++;
+    }
+    template<class TT=T>
+    std::enable_if_t<!std::is_enum_v<TT> && std::is_integral_v<TT>, TT> operator--() {
+      return --value;
+    }
+    template<class TT=T>
+    std::enable_if_t<!std::is_enum_v<TT> && std::is_integral_v<TT>, TT> operator--(int) {
+      return value--;
+    }
+    // `b` is read through its conversion to T; a plain T operand reaches
+    // these operators via the implicit converting constructor.
+    template<class TT=T>
+    std::enable_if_t<!std::is_enum_v<TT> && std::is_integral_v<TT>, TT> operator+=(const dummy_atomic& b) {
+      value += b;
+      return value;
+    }
+    template<class TT=T>
+    std::enable_if_t<!std::is_enum_v<TT> && std::is_integral_v<TT>, TT> operator-=(const dummy_atomic& b) {
+      value -= b;
+      return value;
+    }
+
+    static constexpr bool is_always_lock_free = true;
+  };
+
+  template <class T> using atomic = dummy_atomic<T>;
+} // namespace ceph
+
+#else  // WITH_SEASTAR
+
+namespace ceph {
+  template <class T> using atomic = ::std::atomic<T>;
+} // namespace ceph
+
+#endif	// WITH_SEASTAR
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
new file mode 100644
index 000000000..d26f24511
--- /dev/null
+++ b/src/common/ceph_context.cc
@@ -0,0 +1,1059 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/ceph_context.h"
+
+#include <mutex>
+#include <iostream>
+
+#include <pthread.h>
+
+#include <boost/algorithm/string.hpp>
+
+#include "include/common_fwd.h"
+#include "include/mempool.h"
+#include "include/stringify.h"
+#include "common/admin_socket.h"
+#include "common/code_environment.h"
+#include "common/ceph_mutex.h"
+#include "common/debug.h"
+#include "common/config.h"
+#include "common/ceph_crypto.h"
+#include "common/hostname.h"
+#include "common/HeartbeatMap.h"
+#include "common/errno.h"
+#include "common/Graylog.h"
+#ifdef CEPH_DEBUG_MUTEX
+#include "common/lockdep.h"
+#endif
+
+#include "log/Log.h"
+
+#include "auth/Crypto.h"
+#include "include/str_list.h"
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "common/PluginRegistry.h"
+#include "common/valgrind.h"
+#include "include/spinlock.h"
+#if !(defined(WITH_SEASTAR) && !defined(WITH_ALIEN))
+#include "mon/MonMap.h"
+#endif
+
+// for CINIT_FLAGS
+#include "common/common_init.h"
+
+#include <iostream>
+#include <pthread.h>
+
+using namespace std::literals;
+
+using ceph::bufferlist;
+using ceph::HeartbeatMap;
+
+
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+namespace crimson::common {
+CephContext::CephContext()  // crimson variant: binds to crimson::common's local config and perf collection
+  : _conf{crimson::common::local_conf()},
+    _perf_counters_collection{crimson::common::local_perf_coll()},
+    _crypto_random{std::make_unique<CryptoRandom>()}  // this context owns its CryptoRandom
+{}
+
+// Defined in the .cc file because CryptoRandom is an incomplete type in the header.
+CephContext::~CephContext()
+{}
+
+uint32_t CephContext::get_module_type() const
+{
+  return CEPH_ENTITY_TYPE_OSD;  // hard-coded: this variant always reports the OSD entity type
+}
+
+CryptoRandom* CephContext::random() const
+{
+  return _crypto_random.get();  // non-owning pointer; the context keeps ownership
+}
+
+CephContext* CephContext::get()
+{
+  ++nref;  // take one reference; balanced by put()
+  return this;
+}
+
+void CephContext::put()
+{
+  if (--nref != 0) {
+    return;  // other references remain
+  }
+  delete this;  // last reference dropped
+}
+
+PerfCountersCollectionImpl* CephContext::get_perfcounters_collection()
+{
+  return _perf_counters_collection.get_perf_collection();  // forwards to the wrapped collection object
+}
+
+}
+#else  // WITH_SEASTAR
+namespace {
+
+#ifdef CEPH_DEBUG_MUTEX
+class LockdepObs : public md_config_obs_t {
+public:
+  explicit LockdepObs(CephContext *cct)
+    : m_cct(cct), m_registered(false), lock(ceph::make_mutex("lock_dep_obs")) {
+  }
+  ~LockdepObs() override {
+    if (m_registered) {
+      lockdep_unregister_ceph_context(m_cct);  // drop a registration that is still active
+    }
+  }
+  // Only the "lockdep" option is observed.
+  const char** get_tracked_conf_keys() const override {
+    static const char *tracked[] = {"lockdep", NULL};
+    return tracked;
+  }
+  // Bring the lockdep registration of m_cct in line with the "lockdep" option.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    std::unique_lock guard(lock);
+    const bool want = conf->lockdep;
+    if (want && !m_registered) {
+      lockdep_register_ceph_context(m_cct);
+    } else if (!want && m_registered) {
+      lockdep_unregister_ceph_context(m_cct);
+    }
+    m_registered = want;  // no-op when the state already matched
+  }
+private:
+  CephContext *m_cct;
+  bool m_registered;
+  ceph::mutex lock;
+};
+#endif // CEPH_DEBUG_MUTEX
+
+class MempoolObs : public md_config_obs_t,
+		  public AdminSocketHook {
+  CephContext *cct;
+  ceph::mutex lock;
+
+public:
+  // Subscribes to config changes and registers the "dump_mempools" command.
+  explicit MempoolObs(CephContext *cct)
+    : cct(cct), lock(ceph::make_mutex("mem_pool_obs")) {
+    cct->_conf.add_observer(this);
+    auto *asok = cct->get_admin_socket();
+    int r = asok->register_command("dump_mempools", this, "get mempool stats");
+    ceph_assert(r == 0);
+  }
+  // Undoes both registrations made by the constructor.
+  ~MempoolObs() override {
+    cct->_conf.remove_observer(this);
+    cct->get_admin_socket()->unregister_commands(this);
+  }
+
+  // md_config_obs_t
+  const char** get_tracked_conf_keys() const override {
+    static const char *tracked[] = {
+      "mempool_debug",
+      NULL
+    };
+    return tracked;
+  }
+  // Pushes the current mempool_debug setting into the mempool subsystem.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    std::unique_lock guard(lock);
+    if (changed.find("mempool_debug") != changed.end()) {
+      mempool::set_debug_mode(cct->_conf->mempool_debug);
+    }
+  }
+
+  // AdminSocketHook
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+	   const bufferlist& inbl,
+	   ceph::Formatter *f,
+	   std::ostream& errss,
+	   bufferlist& out) override {
+    if (command != "dump_mempools") {
+      return -ENOSYS;  // not a command this hook serves
+    }
+    f->open_object_section("mempools");
+    mempool::dump(f);
+    f->close_section();
+    return 0;
+  }
+};
+
+} // anonymous namespace
+
+namespace ceph::common {
+class CephContextServiceThread : public Thread
+{
+public:
+  explicit CephContextServiceThread(CephContext *cct)
+    : _reopen_logs(false), _exit_thread(false), _cct(cct)
+  {
+  }
+
+  ~CephContextServiceThread() override {}
+  // Thread body: wake up periodically (or on notify) and do housekeeping.
+  void *entry() override
+  {
+    for (;;) {
+      std::unique_lock guard(_lock);
+      if (_exit_thread) {
+        break;
+      }
+      // sleep for one heartbeat interval, or indefinitely when it is 0
+      if (_cct->_conf->heartbeat_interval) {
+        auto period = ceph::make_timespan(_cct->_conf->heartbeat_interval);
+        _cond.wait_for(guard, period);
+      } else {
+        _cond.wait(guard);
+      }
+      if (_exit_thread) {
+        break;
+      }
+      // a log reopen may have been requested while we slept
+      if (_reopen_logs) {
+        _cct->_log->reopen_log_file();
+        _reopen_logs = false;
+      }
+      _cct->_heartbeat_map->check_touch_file();
+
+      // refresh the perf counters
+      _cct->_refresh_perf_values();
+    }
+    return nullptr;
+  }
+  // Ask the thread to reopen the log file on its next wakeup.
+  void reopen_logs()
+  {
+    std::lock_guard guard(_lock);
+    _reopen_logs = true;
+    _cond.notify_all();
+  }
+  // Ask the thread to leave its loop; the caller still has to join it.
+  void exit_thread()
+  {
+    std::lock_guard guard(_lock);
+    _exit_thread = true;
+    _cond.notify_all();
+  }
+
+private:
+  ceph::mutex _lock = ceph::make_mutex("CephContextServiceThread::_lock");
+  ceph::condition_variable _cond;
+  bool _reopen_logs;
+  bool _exit_thread;
+  CephContext *_cct;
+};
+}
+
+/**
+ * observe logging config changes
+ *
+ * The logging subsystem sits below most of the ceph code, including
+ * the config subsystem, to keep it simple and self-contained.  Feed
+ * logging-related config changes to the log.
+ */
+class LogObs : public md_config_obs_t {
+  ceph::logging::Log *log;
+  ceph::mutex lock;
+  // `lock` serializes handle_conf_change() calls.
+public:
+  explicit LogObs(ceph::logging::Log *l)
+    : log(l), lock(ceph::make_mutex("log_obs")) {
+  }
+  // NULL-terminated list of the options this observer reacts to.
+  const char** get_tracked_conf_keys() const override {
+    static const char *KEYS[] = {
+      "log_file",
+      "log_max_new",
+      "log_max_recent",
+      "log_to_file",
+      "log_to_syslog",
+      "err_to_syslog",
+      "log_stderr_prefix",
+      "log_to_stderr",
+      "err_to_stderr",
+      "log_to_graylog",
+      "err_to_graylog",
+      "log_graylog_host",
+      "log_graylog_port",
+      "log_to_journald",
+      "err_to_journald",
+      "log_coarse_timestamps",
+      "fsid",
+      "host",
+      NULL
+    };
+    return KEYS;
+  }
+  // Apply every changed logging option to the Log object.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    std::unique_lock locker(lock);
+    // stderr
+    if (changed.count("log_to_stderr") || changed.count("err_to_stderr")) {
+      int l = conf->log_to_stderr ? 99 : (conf->err_to_stderr ? -1 : -2);
+      log->set_stderr_level(l, l);
+    }
+
+    // syslog (err_to_syslog feeds the level too, so react to either option)
+    if (changed.count("log_to_syslog") || changed.count("err_to_syslog")) {
+      int l = conf->log_to_syslog ? 99 : (conf->err_to_syslog ? -1 : -2);
+      log->set_syslog_level(l, l);
+    }
+
+    // file
+    if (changed.count("log_file") ||
+	changed.count("log_to_file")) {
+      if (conf->log_to_file) {
+	log->set_log_file(conf->log_file);
+      } else {
+	log->set_log_file({});
+      }
+      log->reopen_log_file();
+    }
+
+    if (changed.count("log_stderr_prefix")) {
+      log->set_log_stderr_prefix(conf.get_val<std::string>("log_stderr_prefix"));
+    }
+
+    if (changed.count("log_max_new")) {
+      // queue limit for new entries
+      log->set_max_new(conf->log_max_new);
+    }
+
+    if (changed.count("log_max_recent")) {
+      log->set_max_recent(conf->log_max_recent);
+    }
+
+    // graylog
+    if (changed.count("log_to_graylog") || changed.count("err_to_graylog")) {
+      int l = conf->log_to_graylog ? 99 : (conf->err_to_graylog ? -1 : -2);
+      log->set_graylog_level(l, l);
+      // run the graylog backend only while at least one option wants it
+      if (conf->log_to_graylog || conf->err_to_graylog) {
+	log->start_graylog(conf->host, conf.get_val<uuid_d>("fsid"));
+      } else {
+	log->stop_graylog();
+      }
+    }
+
+    if (log->graylog() && (changed.count("log_graylog_host") || changed.count("log_graylog_port"))) {
+      log->graylog()->set_destination(conf->log_graylog_host, conf->log_graylog_port);
+    }
+
+    // journald
+    if (changed.count("log_to_journald") || changed.count("err_to_journald")) {
+      int l = conf.get_val<bool>("log_to_journald") ? 99 : (conf.get_val<bool>("err_to_journald") ? -1 : -2);
+      log->set_journald_level(l, l);
+      // -2 means neither option is set
+      if (l > -2) {
+        log->start_journald_logger();
+      } else {
+        log->stop_journald_logger();
+      }
+    }
+
+    if (changed.find("log_coarse_timestamps") != changed.end()) {
+      log->set_coarse_timestamps(conf.get_val<bool>("log_coarse_timestamps"));
+    }
+
+    // metadata
+    if (log->graylog() && changed.count("host")) {
+      log->graylog()->set_hostname(conf->host);
+    }
+
+    if (log->graylog() && changed.count("fsid")) {
+      log->graylog()->set_fsid(conf.get_val<uuid_d>("fsid"));
+    }
+  }
+};
+
+
+namespace ceph::common {
+// cct config watcher
+class CephContextObs : public md_config_obs_t {
+  CephContext *cct;
+
+public:
+  explicit CephContextObs(CephContext *cct) : cct(cct) {}
+  // NULL-terminated list of the options this observer is notified about.
+  const char** get_tracked_conf_keys() const override {
+    static const char *tracked[] = {
+      "enable_experimental_unrecoverable_data_corrupting_features",
+      "crush_location",
+      "container_image",  // just so we don't hear complaints about it!
+      NULL
+    };
+    return tracked;
+  }
+  // Rebuild the experimental-feature set and/or refresh the crush location.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    if (changed.count(
+	  "enable_experimental_unrecoverable_data_corrupting_features")) {
+      std::lock_guard lg(cct->_feature_lock);
+      auto& features = cct->_experimental_features;
+      features.clear();
+      // re-parse the option value into individual feature names
+      auto remember = [&features] (auto feature) {
+        features.emplace(std::string{feature});
+      };
+      for_each_substr(conf->enable_experimental_unrecoverable_data_corrupting_features,
+          ";,= \t", remember);
+      // warn about enabled features unless CEPH_DEV is set in the environment
+      if (getenv("CEPH_DEV") == NULL && !features.empty()) {
+        if (features.count("*")) {
+          lderr(cct) << "WARNING: all dangerous and experimental features are enabled." << dendl;
+        } else {
+          lderr(cct) << "WARNING: the following dangerous and experimental features are enabled: "
+            << features << dendl;
+        }
+      }
+    }
+
+    // crush_location is handled independently of the feature list
+    if (changed.count("crush_location")) {
+      cct->crush_location.update_from_conf();
+    }
+  }
+};
+// perfcounter hooks
+
+class CephContextHook : public AdminSocketHook {
+  CephContext *m_cct;
+
+public:
+  explicit CephContextHook(CephContext *cct) : m_cct(cct) {}
+  // Forward an admin socket command to CephContext::do_command().
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+	   const bufferlist& inbl,
+	   Formatter *f,
+	   std::ostream& errss,
+	   bufferlist& out) override {
+    try {
+      return m_cct->do_command(command, cmdmap, f, errss, &out);
+    } catch (const bad_cmd_get&) {
+      return -EINVAL;  // malformed command argument
+    }
+  }
+};
+
+
+bool CephContext::check_experimental_feature_enabled(const std::string& feat)
+{
+  std::stringstream text;  // receives the warning or the how-to-enable notice
+  const bool on = check_experimental_feature_enabled(feat, &text);
+  lderr(this) << text.str() << dendl;  // logged whether or not the feature is on
+  return on;
+}
+
+bool CephContext::check_experimental_feature_enabled(const std::string& feat,
+						     std::ostream *message)
+{
+  std::unique_lock<ceph::spinlock> lg(_feature_lock);
+  // enabled when listed by name or when the "*" wildcard is present
+  bool enabled = (_experimental_features.count(feat) ||
+		  _experimental_features.count("*"));
+  // either way, explain the situation on *message (must not be null)
+  if (enabled) {
+    (*message) << "WARNING: experimental feature '" << feat << "' is enabled\n";
+    (*message) << "Please be aware that this feature is experimental, untested,\n";
+    (*message) << "unsupported, and may result in data corruption, data loss,\n";
+    (*message) << "and/or irreparable damage to your cluster.  Do not use\n";
+    (*message) << "feature with important data.\n";
+  } else {
+    (*message) << "*** experimental feature '" << feat << "' is not enabled ***\n";
+    (*message) << "This feature is marked as experimental, which means it\n";
+    (*message) << " - is untested\n";
+    (*message) << " - is unsupported\n";
+    (*message) << " - may corrupt your data\n";
+    (*message) << " - may break your cluster in an unrecoverable fashion\n";
+    (*message) << "To enable this feature, add this to your ceph.conf:\n";
+    (*message) << "  enable experimental unrecoverable data corrupting features = " << feat << "\n";
+  }
+  return enabled;
+}
+
+int CephContext::do_command(std::string_view command, const cmdmap_t& cmdmap,
+			    Formatter *f,
+			    std::ostream& ss,
+			    bufferlist *out)
+{
+  try {
+    return _do_command(command, cmdmap, f, ss, out);
+  } catch (const bad_cmd_get& bad_arg) {
+    ss << bad_arg.what();  // report the argument error text to the caller
+    return -EINVAL;
+  }
+}
+
+#pragma GCC push_options
+#pragma GCC optimize ("O0")  // compile the helper below unoptimized
+static void leak_some_memory() {  // backs the "leak_some_memory" admin command
+  volatile char *foo = new char[1234];  // deliberately never freed
+  (void)foo;
+}
+#pragma GCC pop_options
+
+// Execute one built-in admin socket command; structured output goes to `f`,
+// error text to `ss`.  Returns 0 or a negative errno; may throw bad_cmd_get.
+int CephContext::_do_command(
+  std::string_view command, const cmdmap_t& cmdmap,
+  Formatter *f,
+  std::ostream& ss,
+  bufferlist *out)
+{
+  int r = 0;
+  lgeneric_dout(this, 1) << "do_command '" << command << "' '" << cmdmap << "'"
+			 << dendl;
+  ceph_assert_always(!(command == "assert" && _conf->debug_asok_assert_abort));
+  if (command == "abort" || command == "assert") {
+    if (_conf->debug_asok_assert_abort) {
+      ceph_abort();  // only "abort" gets here; "assert" already fired above
+    } else {
+      return -EPERM;  // both commands are refused unless the debug option is set
+    }
+  }
+  if (command == "leak_some_memory") {
+    leak_some_memory();
+  }
+  else if (command == "perfcounters_dump" || command == "1" ||
+      command == "perf dump") {
+    std::string logger, counter;
+    cmd_getval(cmdmap, "logger", logger);
+    cmd_getval(cmdmap, "counter", counter);
+    _perf_counters_collection->dump_formatted(f, false, false, logger, counter);
+  }
+  else if (command == "perfcounters_schema" || command == "2" ||
+    command == "perf schema") {
+    _perf_counters_collection->dump_formatted(f, true, false);
+  }
+  else if (command == "counter dump") {
+    _perf_counters_collection->dump_formatted(f, false, true);
+  }
+  else if (command == "counter schema") {
+    _perf_counters_collection->dump_formatted(f, true, true);
+  }
+  else if (command == "perf histogram dump") {
+    std::string logger, counter;
+    cmd_getval(cmdmap, "logger", logger);
+    cmd_getval(cmdmap, "counter", counter);
+    _perf_counters_collection->dump_formatted_histograms(f, false, logger,
+                                                         counter);
+  }
+  else if (command == "perf histogram schema") {
+    _perf_counters_collection->dump_formatted_histograms(f, true);
+  }
+  else if (command == "perf reset") {
+    std::string var;
+    std::string section(command);
+    f->open_object_section(section.c_str());
+    if (!cmd_getval(cmdmap, "var", var)) {
+      f->dump_string("error", "syntax error: 'perf reset <var>'");
+    } else {
+     if(!_perf_counters_collection->reset(var))
+        f->dump_stream("error") << "Not find: " << var;
+     else
+       f->dump_string("success", std::string(command) + ' ' + var);
+    }
+    f->close_section();
+  }
+  else {
+    std::string section(command);
+    boost::replace_all(section, " ", "_");  // section name is the command with '_' for ' '
+    f->open_object_section(section.c_str());
+    if (command == "config show") {
+      _conf.show_config(f);
+    }
+    else if (command == "config unset") {
+      std::string var;
+      if (!(cmd_getval(cmdmap, "var", var))) {
+	r = -EINVAL;
+      } else {
+        r = _conf.rm_val(var.c_str());
+        if (r < 0 && r != -ENOENT) {
+          ss << "error unsetting '" << var << "': "
+	     << cpp_strerror(r);
+        } else {
+          _conf.apply_changes(&ss);
+	  r = 0;  // -ENOENT is treated as success
+        }
+      }
+    }
+    else if (command == "config set") {
+      std::string var;
+      std::vector<std::string> val;
+
+      if (!(cmd_getval(cmdmap, "var", var)) ||
+          !(cmd_getval(cmdmap, "val", val))) {
+	r = -EINVAL;
+      } else {
+	// val may be multiple words
+	auto valstr = str_join(val, " ");
+        r = _conf.set_val(var.c_str(), valstr.c_str());
+        if (r < 0) {
+          ss << "error setting '" << var << "' to '" << valstr << "': "
+	     << cpp_strerror(r);
+        } else {
+	  std::stringstream applied;  // kept apart from `ss`: goes into the formatter
+          _conf.apply_changes(&applied);
+	  f->dump_string("success", applied.str());
+        }
+      }
+    } else if (command == "config get") {
+      std::string var;
+      if (!cmd_getval(cmdmap, "var", var)) {
+	r = -EINVAL;
+      } else {
+	char buf[4096];
+	// FIPS zeroization audit 20191115: this memset is not security related.
+	memset(buf, 0, sizeof(buf));
+	char *tmp = buf;
+	r = _conf.get_val(var.c_str(), &tmp, sizeof(buf));
+	if (r < 0) {
+	  ss << "error getting '" << var << "': " << cpp_strerror(r);
+	} else {
+	  f->dump_string(var.c_str(), buf);
+	}
+      }
+    } else if (command == "config help") {
+      std::string var;
+      if (cmd_getval(cmdmap, "var", var)) {
+        // Output a single one
+        std::string key = ConfFile::normalize_key_name(var);
+	auto schema = _conf.get_schema(key);
+        if (!schema) {
+          ss << "Setting not found: '" << key << "'";
+	  r = -ENOENT;
+        } else {
+          f->dump_object("option", *schema);
+        }
+      } else {
+        // Output all
+        f->open_array_section("options");
+        for (const auto &option : ceph_options) {
+          f->dump_object("option", option);
+        }
+        f->close_section();
+      }
+    } else if (command == "config diff") {
+      f->open_object_section("diff");
+      _conf.diff(f);
+      f->close_section(); // diff
+    } else if (command == "config diff get") {
+      std::string setting;
+      cmd_getval(cmdmap, "var", setting);  // the option whose diff was requested
+      f->open_object_section("diff");
+      _conf.diff(f, setting);
+      f->close_section(); // diff
+    }
+    else if (command == "injectargs") {
+      std::vector<std::string> argsvec;
+      cmd_getval(cmdmap, "injected_args", argsvec);
+      if (!argsvec.empty()) {
+	auto args = joinify<std::string>(argsvec.begin(), argsvec.end(), " ");
+	r = _conf.injectargs(args, &ss);
+      }
+    }
+    else if (command == "log flush") {
+      _log->flush();
+    }
+    else if (command == "log dump") {
+      _log->dump_recent();
+    }
+    else if (command == "log reopen") {
+      _log->reopen_log_file();
+    }
+    else {
+      ceph_abort_msg("registered under wrong command?");
+    }
+    f->close_section();
+  }
+  lgeneric_dout(this, 1) << "do_command '" << command << "' '" << cmdmap
+		         << "' result is " << out->length() << " bytes" << dendl;
+  return r;
+}
+
+CephContext::CephContext(uint32_t module_type_,  // convenience overload
+                         enum code_environment_t code_env,
+                         int init_flags_)
+  : CephContext(module_type_, create_options{code_env, init_flags_, nullptr})  // nullptr: no custom log factory
+{}
+
+CephContext::CephContext(uint32_t module_type_,
+			 const create_options& options)
+  : nref(1),
+    _conf{options.code_env == CODE_ENVIRONMENT_DAEMON},
+    _log(NULL),
+    _module_type(module_type_),
+    _init_flags(options.init_flags),
+    _set_uid(0),
+    _set_gid(0),
+    _set_uid_string(),
+    _set_gid_string(),
+    _crypto_inited(0),
+    _service_thread(NULL),
+    _log_obs(NULL),
+    _admin_socket(NULL),
+    _perf_counters_collection(NULL),
+    _perf_counters_conf_obs(NULL),
+    _heartbeat_map(NULL),
+    _crypto_none(NULL),
+    _crypto_aes(NULL),
+    _plugin_registry(NULL),
+#ifdef CEPH_DEBUG_MUTEX
+    _lockdep_obs(NULL),
+#endif
+    crush_location(this)
+{
+  if (options.create_log) {
+    _log = options.create_log(&_conf->subsys);  // caller-supplied log factory
+  } else {
+    _log = new ceph::logging::Log(&_conf->subsys);
+  }
+  // config observers; each one is removed again in the destructor
+  _log_obs = new LogObs(_log);
+  _conf.add_observer(_log_obs);
+
+  _cct_obs = new CephContextObs(this);
+  _conf.add_observer(_cct_obs);
+#ifdef CEPH_DEBUG_MUTEX
+  _lockdep_obs = new LockdepObs(this);
+  _conf.add_observer(_lockdep_obs);
+#endif
+  _perf_counters_collection = new PerfCountersCollection(this);
+
+  _admin_socket = new AdminSocket(this);
+  _heartbeat_map = new HeartbeatMap(this);
+
+  _plugin_registry = new PluginRegistry(this);
+  // built-in admin socket commands; the hook forwards them to do_command()
+  _admin_hook = new CephContextHook(this);
+  _admin_socket->register_command("assert", _admin_hook, "");
+  _admin_socket->register_command("abort", _admin_hook, "");
+  _admin_socket->register_command("leak_some_memory", _admin_hook, "");
+  _admin_socket->register_command("perfcounters_dump", _admin_hook, "");
+  _admin_socket->register_command("1", _admin_hook, "");
+  _admin_socket->register_command("perf dump name=logger,type=CephString,req=false name=counter,type=CephString,req=false", _admin_hook, "dump non-labeled counters and their values");
+  _admin_socket->register_command("perfcounters_schema", _admin_hook, "");
+  _admin_socket->register_command("perf histogram dump name=logger,type=CephString,req=false name=counter,type=CephString,req=false", _admin_hook, "dump perf histogram values");
+  _admin_socket->register_command("2", _admin_hook, "");
+  _admin_socket->register_command("perf schema", _admin_hook, "dump non-labeled counters schemas");
+  _admin_socket->register_command("counter dump", _admin_hook, "dump all labeled and non-labeled counters and their values");
+  _admin_socket->register_command("counter schema", _admin_hook, "dump all labeled and non-labeled counters schemas");
+  _admin_socket->register_command("perf histogram schema", _admin_hook, "dump perf histogram schema");
+  _admin_socket->register_command("perf reset name=var,type=CephString", _admin_hook, "perf reset <name>: perf reset all or one perfcounter name");
+  _admin_socket->register_command("config show", _admin_hook, "dump current config settings");
+  _admin_socket->register_command("config help name=var,type=CephString,req=false", _admin_hook, "get config setting schema and descriptions");
+  _admin_socket->register_command("config set name=var,type=CephString name=val,type=CephString,n=N",  _admin_hook, "config set <field> <val> [<val> ...]: set a config variable");
+  _admin_socket->register_command("config unset name=var,type=CephString",  _admin_hook, "config unset <field>: unset a config variable");
+  _admin_socket->register_command("config get name=var,type=CephString", _admin_hook, "config get <field>: get the config value");
+  _admin_socket->register_command(
+      "config diff", _admin_hook,
+      "dump diff of current config and default config");
+  _admin_socket->register_command(
+      "config diff get name=var,type=CephString", _admin_hook,
+      "dump diff get <field>: dump diff of current and default config setting <field>");
+  _admin_socket->register_command("injectargs name=injected_args,type=CephString,n=N", _admin_hook, "inject configuration arguments into running daemon");
+  _admin_socket->register_command("log flush", _admin_hook, "flush log entries to log file");
+  _admin_socket->register_command("log dump", _admin_hook, "dump recent log entries to log file");
+  _admin_socket->register_command("log reopen", _admin_hook, "reopen log file");
+
+  _crypto_none = CryptoHandler::create(CEPH_CRYPTO_NONE);
+  _crypto_aes = CryptoHandler::create(CEPH_CRYPTO_AES);
+  _crypto_random.reset(new CryptoRandom());
+
+  lookup_or_create_singleton_object<MempoolObs>("mempool_obs", false, this);
+}
+
+CephContext::~CephContext()
+{
+  associated_objs.clear();
+  join_service_thread();  // stop the service thread before members are torn down
+  // normally cleared already by join_service_thread() via _disable_perf_counter()
+  if (_cct_perf) {
+    _perf_counters_collection->remove(_cct_perf);
+    delete _cct_perf;
+    _cct_perf = NULL;
+  }
+
+  delete _plugin_registry;
+  // the hook has to be unregistered before it and the socket are deleted
+  _admin_socket->unregister_commands(_admin_hook);
+  delete _admin_hook;
+  delete _admin_socket;
+
+  delete _heartbeat_map;
+
+  delete _perf_counters_collection;
+  _perf_counters_collection = NULL;
+
+  delete _perf_counters_conf_obs;
+  _perf_counters_conf_obs = NULL;
+  // detach each config observer before deleting it
+  _conf.remove_observer(_log_obs);
+  delete _log_obs;
+  _log_obs = NULL;
+
+  _conf.remove_observer(_cct_obs);
+  delete _cct_obs;
+  _cct_obs = NULL;
+#ifdef CEPH_DEBUG_MUTEX
+  _conf.remove_observer(_lockdep_obs);
+  delete _lockdep_obs;
+  _lockdep_obs = NULL;
+#endif
+  _log->stop();
+  delete _log;
+  _log = NULL;
+
+  delete _crypto_none;
+  delete _crypto_aes;
+  if (_crypto_inited > 0) {
+    ceph_assert(_crypto_inited == 1);  // or else someone explicitly did
+				  // init but not shutdown
+    shutdown_crypto();
+  }
+}
+
+void CephContext::put() {  // drop one reference; the last one destroys the context
+  if (--nref == 0) {
+    ANNOTATE_HAPPENS_AFTER(&nref);
+    ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(&nref);
+    if (g_ceph_context == this)  // do not leave the global pointing at a deleted object
+      g_ceph_context = nullptr;
+    delete this;
+  } else {
+    ANNOTATE_HAPPENS_BEFORE(&nref);
+  }
+}
+
+void CephContext::init_crypto()
+{
+  if (_crypto_inited++ == 0) {  // counted: only the first call initializes
+    TOPNSPC::crypto::init();
+  }
+}
+
+void CephContext::shutdown_crypto()
+{
+  if (--_crypto_inited == 0) {  // counted: only the call that reaches zero shuts down
+    TOPNSPC::crypto::shutdown(g_code_env == CODE_ENVIRONMENT_LIBRARY);
+  }
+}
+
+void CephContext::start_service_thread()
+{
+  {
+    std::lock_guard lg(_service_thread_lock);
+    if (_service_thread) {
+      return;  // already running: nothing below is repeated
+    }
+    _service_thread = new CephContextServiceThread(this);
+    _service_thread->create("service");
+  }
+
+  if (!(get_init_flags() & CINIT_FLAG_NO_CCT_PERF_COUNTERS))
+    _enable_perf_counter();
+
+  // make logs flush on_exit()
+  if (_conf->log_flush_on_exit)
+    _log->set_flush_on_exit();
+
+  // Trigger callbacks on any config observers that were waiting for
+  // it to become safe to start threads.
+  _conf.set_safe_to_start_threads();
+  _conf.call_all_observers();
+
+  // start admin socket, but only when a socket path is configured
+  if (_conf->admin_socket.length())
+    _admin_socket->init(_conf->admin_socket);
+}
+
+void CephContext::reopen_logs()
+{
+  std::lock_guard lg(_service_thread_lock);
+  if (_service_thread)  // no-op when the service thread is not running
+    _service_thread->reopen_logs();
+}
+
+void CephContext::join_service_thread()
+{
+  std::unique_lock<ceph::spinlock> lg(_service_thread_lock);
+
+  CephContextServiceThread *thread = _service_thread;
+  if (!thread) {
+    return;  // never started, or already joined
+  }
+  _service_thread = NULL;  // detach under the lock
+
+  lg.unlock();  // release the spinlock before the blocking join below
+
+  thread->exit_thread();
+  thread->join();
+  delete thread;
+
+  if (!(get_init_flags() & CINIT_FLAG_NO_CCT_PERF_COUNTERS))
+    _disable_perf_counter();
+}
+
+uint32_t CephContext::get_module_type() const
+{
+  return _module_type;  // the value passed to the constructor
+}
+
+void CephContext::set_init_flags(int flags)
+{
+  _init_flags = flags;  // replaces the whole flag word; nothing is OR-ed in
+}
+
+int CephContext::get_init_flags() const
+{
+  return _init_flags;  // CINIT_FLAG_* bits; see common/common_init.h
+}
+
+PerfCountersCollection *CephContext::get_perfcounters_collection()
+{
+  return _perf_counters_collection;  // non-owning; deleted by the destructor
+}
+
+void CephContext::_enable_perf_counter()
+{
+  assert(!_cct_perf);
+  PerfCountersBuilder cct_plb(this, "cct", l_cct_first, l_cct_last);
+  cct_plb.add_u64(l_cct_total_workers, "total_workers", "Total workers");
+  cct_plb.add_u64(l_cct_unhealthy_workers, "unhealthy_workers", "Unhealthy workers");
+  _cct_perf = cct_plb.create_perf_counters();
+  _perf_counters_collection->add(_cct_perf);
+  // two counters (bytes, items) per mempool; names and descriptions are members
+  assert(_mempool_perf_names.empty());
+  assert(_mempool_perf_descriptions.empty());
+  _mempool_perf_names.reserve(mempool::num_pools * 2);
+  _mempool_perf_descriptions.reserve(mempool::num_pools * 2);
+  for (unsigned pool = 0; pool < mempool::num_pools; ++pool) {
+    std::string name = mempool::get_pool_name(mempool::pool_index_t(pool));
+    _mempool_perf_names.push_back(name + "_bytes"s);
+    _mempool_perf_descriptions.push_back(
+      "mempool "s + name + " total bytes");
+    _mempool_perf_names.push_back(name + "_items"s);
+    _mempool_perf_descriptions.push_back(
+      "mempool "s + name + " total items"s);
+  }
+  // counter ids start right after l_mempool_first, in the order built above
+  PerfCountersBuilder pool_plb(this, "mempool", l_mempool_first,
+			  l_mempool_first + 1 + 2*mempool::num_pools);
+  unsigned idx = l_mempool_first + 1;
+  for (unsigned pool = 0; pool < mempool::num_pools; ++pool) {
+    pool_plb.add_u64(idx++, _mempool_perf_names[pool*2].c_str(),
+		 _mempool_perf_descriptions[pool*2].c_str());
+    pool_plb.add_u64(idx++, _mempool_perf_names[pool*2+1].c_str(),
+		 _mempool_perf_descriptions[pool*2+1].c_str());
+  }
+  _mempool_perf = pool_plb.create_perf_counters();
+  _perf_counters_collection->add(_mempool_perf);
+}
+
+void CephContext::_disable_perf_counter()
+{
+  if (!_cct_perf) {
+    return;  // counters were never enabled, or are already disabled
+  }
+  _perf_counters_collection->remove(_cct_perf);
+  delete _cct_perf;
+  _cct_perf = nullptr;
+  // the mempool counters are created together with _cct_perf
+  _perf_counters_collection->remove(_mempool_perf);
+  delete _mempool_perf;
+  _mempool_perf = nullptr;
+  _mempool_perf_names.clear();
+  _mempool_perf_descriptions.clear();
+}
+
+void CephContext::_refresh_perf_values()  // called from the service thread loop
+{
+  if (_cct_perf) {
+    _cct_perf->set(l_cct_total_workers, _heartbeat_map->get_total_workers());
+    _cct_perf->set(l_cct_unhealthy_workers, _heartbeat_map->get_unhealthy_workers());
+  }
+  unsigned l = l_mempool_first + 1;  // same id layout as _enable_perf_counter()
+  for (unsigned i = 0; _mempool_perf && i < mempool::num_pools; ++i) {  // skipped while counters are disabled
+    mempool::pool_t& p = mempool::get_pool(mempool::pool_index_t(i));
+    _mempool_perf->set(l++, p.allocated_bytes());
+    _mempool_perf->set(l++, p.allocated_items());
+  }
+}
+
+AdminSocket *CephContext::get_admin_socket()
+{
+  return _admin_socket;  // non-owning; created in the constructor, deleted in the destructor
+}
+
+CryptoHandler *CephContext::get_crypto_handler(int type)
+{
+  if (type == CEPH_CRYPTO_NONE) {
+    return _crypto_none;
+  }
+  if (type == CEPH_CRYPTO_AES) {
+    return _crypto_aes;
+  }
+  // no handler exists for any other type
+  return NULL;
+}
+
+void CephContext::notify_pre_fork()
+{
+  {
+    // let every registered watcher prepare for the fork
+    std::lock_guard lg(_fork_watchers_lock);
+    for (auto &&watcher : _fork_watchers) {
+      watcher->handle_pre_fork();
+    }
+  }
+  {
+    // note: we don't hold a lock here, but we assume we are idle at
+    // fork time, which happens during process init and startup.
+    for (auto it = associated_objs.begin(); it != associated_objs.end(); ) {
+      if (associated_objs_drop_on_fork.count(it->first.first)) {
+	it = associated_objs.erase(it);
+      } else {
+	++it;
+      }
+    }
+    associated_objs_drop_on_fork.clear();
+  }
+}
+
+void CephContext::notify_post_fork()
+{
+  ceph::spin_unlock(&_fork_watchers_lock);  // NOTE(review): notify_pre_fork() releases this lock itself; confirm this unlock is intended
+  for (auto &&t : _fork_watchers)  // the watcher list is walked without taking the lock
+    t->handle_post_fork();
+}
+
+void CephContext::set_mon_addrs(const MonMap& mm) {
+  // collect the public address vector of every monitor in the map
+  std::vector<entity_addrvec_t> addrs;
+  for (const auto& entry : mm.mon_info) {
+    addrs.push_back(entry.second.public_addrs);
+  }
+  set_mon_addrs(addrs);  // hand over to the vector overload
+}
+}
+#endif	// WITH_SEASTAR
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
new file mode 100644
index 000000000..f18776478
--- /dev/null
+++ b/src/common/ceph_context.h
@@ -0,0 +1,424 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CEPHCONTEXT_H
+#define CEPH_CEPHCONTEXT_H
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <string_view>
+#include <typeinfo>
+#include <typeindex>
+
+#include <boost/intrusive_ptr.hpp>
+
+#include "include/any.h"
+#include "include/common_fwd.h"
+#include "include/compat.h"
+
+#include "common/cmdparse.h"
+#include "common/code_environment.h"
+#include "msg/msg_types.h"
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/perf_counters_collection.h"
+#else
+#include "common/config_proxy.h"
+#include "include/spinlock.h"
+#include "common/perf_counters_collection.h"
+#endif
+
+
+#include "crush/CrushLocation.h"
+
+class AdminSocket;
+class CryptoHandler;
+class CryptoRandom;
+class MonMap;
+
+namespace ceph::common {
+  class CephContextServiceThread;
+  class CephContextObs;
+  class CephContextHook;
+}
+
+namespace ceph {
+  class PluginRegistry;
+  class HeartbeatMap;
+  namespace logging {
+    class Log;
+    class SubsystemMap;
+  }
+}
+
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+namespace crimson::common {
+class CephContext {  // variant built when WITH_SEASTAR is defined and WITH_ALIEN is not
+public:
+  CephContext();
+  CephContext(uint32_t,  // all three arguments are ignored; delegates to CephContext()
+	      code_environment_t=CODE_ENVIRONMENT_UTILITY,
+	      int = 0)
+    : CephContext{}
+  {}
+  CephContext(CephContext&&) = default;
+  ~CephContext();
+
+  uint32_t get_module_type() const;
+  bool check_experimental_feature_enabled(const std::string& feature) {
+    // everything crimson is experimental, so `feature` is never inspected
+    return true;
+  }
+  ceph::PluginRegistry* get_plugin_registry() {
+    return _plugin_registry;
+  }
+  CryptoRandom* random() const;
+  PerfCountersCollectionImpl* get_perfcounters_collection();
+  crimson::common::ConfigProxy& _conf;  // reference members: bound by a constructor not shown here
+  crimson::common::PerfCountersCollection& _perf_counters_collection;
+  CephContext* get();
+  void put();
+private:
+  std::unique_ptr<CryptoRandom> _crypto_random;
+  unsigned nref;  // plain unsigned; presumably the count behind get()/put() - confirm
+  ceph::PluginRegistry* _plugin_registry;
+};
+}
+#else
+#ifdef __cplusplus
+namespace ceph::common {
+#endif
+/* A CephContext represents the context held by a single library user.
+ * There can be multiple CephContexts in the same process.
+ *
+ * For daemons and utility programs, there will be only one CephContext.  The
+ * CephContext contains the configuration, the dout object, and anything else
+ * that you might want to pass to libcommon with every function call.
+ */
+class CephContext {
+public:
+  CephContext(uint32_t module_type_,
+              enum code_environment_t code_env=CODE_ENVIRONMENT_UTILITY,
+              int init_flags_ = 0);
+  struct create_options {
+    enum code_environment_t code_env=CODE_ENVIRONMENT_UTILITY;
+    int init_flags = 0;
+    std::function<ceph::logging::Log* (const ceph::logging::SubsystemMap *)> create_log;
+  };
+  CephContext(uint32_t module_type_,
+	      const create_options& options);
+  CephContext(const CephContext&) = delete;
+  CephContext& operator =(const CephContext&) = delete;
+  CephContext(CephContext&&) = delete;
+  CephContext& operator =(CephContext&&) = delete;
+
+  bool _finished = false;
+  ~CephContext();
+
+  // ref count!
+private:
+  std::atomic<unsigned> nref;
+public:
+  CephContext *get() {
+    ++nref;  // nref is std::atomic<unsigned>, so this increment is atomic
+    return this;  // returned so the call can be chained or stored
+  }
+  void put();
+
+  ConfigProxy _conf;
+  ceph::logging::Log *_log;
+
+  /* init ceph::crypto */
+  void init_crypto();
+
+  /// shutdown crypto (should match init_crypto calls)
+  void shutdown_crypto();
+
+  /* Start the Ceph Context's service thread */
+  void start_service_thread();
+
+  /* Reopen the log files */
+  void reopen_logs();
+
+  /* Get the module type (client, mon, osd, mds, etc.) */
+  uint32_t get_module_type() const;
+
+  // this is here only for testing purposes!
+  void _set_module_type(uint32_t t) {
+    _module_type = t;
+  }
+
+  void set_init_flags(int flags);
+  int get_init_flags() const;
+
+  /* Get the PerfCountersCollection of this CephContext */
+  PerfCountersCollection *get_perfcounters_collection();
+
+  ceph::HeartbeatMap *get_heartbeat_map() {
+    return _heartbeat_map;
+  }
+
+  /**
+   * Get the admin socket associated with this CephContext.
+   *
+   * Currently there is always an admin socket object,
+   * so this will never return NULL.
+   *
+   * @return the admin socket
+   */
+  AdminSocket *get_admin_socket();
+
+  /**
+   * process an admin socket command
+   */
+  int do_command(std::string_view command, const cmdmap_t& cmdmap,
+		 Formatter *f,
+		 std::ostream& errss,
+		 ceph::bufferlist *out);
+  int _do_command(std::string_view command, const cmdmap_t& cmdmap,
+		  Formatter *f,
+		  std::ostream& errss,
+		  ceph::bufferlist *out);
+
+  static constexpr std::size_t largest_singleton = 8 * 72;
+
+  template<typename T, typename... Args>
+  T& lookup_or_create_singleton_object(std::string_view name,
+				       bool drop_on_fork,
+				       Args&&... args) {
+    static_assert(sizeof(T) <= largest_singleton,
+		  "Please increase largest singleton.");
+    std::lock_guard lg(associated_objs_lock);
+    std::type_index type = typeid(T);
+    // Entries are keyed by (name, type): look up first, construct only on a miss.
+    auto i = associated_objs.find(std::make_pair(name, type));
+    if (i == associated_objs.cend()) {
+      if (drop_on_fork) {
+	associated_objs_drop_on_fork.insert(std::string(name));  // consulted by notify_pre_fork()
+      }
+      i = associated_objs.emplace_hint(
+	i,
+	std::piecewise_construct,
+	std::forward_as_tuple(name, type),
+	std::forward_as_tuple(std::in_place_type<T>,
+			      std::forward<Args>(args)...));  // args are used only on creation
+    }
+    return ceph::any_cast<T&>(i->second);  // reference to the stored (existing or new) object
+  }
+
+  /**
+   * get a crypto handler
+   */
+  CryptoHandler *get_crypto_handler(int type);
+
+  CryptoRandom* random() const { return _crypto_random.get(); }
+
+  /// check if experimental feature is enable, and emit appropriate warnings
+  bool check_experimental_feature_enabled(const std::string& feature);
+  bool check_experimental_feature_enabled(const std::string& feature,
+					  std::ostream *message);
+
+  ceph::PluginRegistry *get_plugin_registry() {
+    return _plugin_registry;
+  }
+
+  void set_uid_gid(uid_t u, gid_t g) {
+    _set_uid = u;
+    _set_gid = g;
+  }
+  uid_t get_set_uid() const {
+    return _set_uid;
+  }
+  gid_t get_set_gid() const {
+    return _set_gid;
+  }
+
+  void set_uid_gid_strings(const std::string &u, const std::string &g) {
+    _set_uid_string = u;
+    _set_gid_string = g;
+  }
+  std::string get_set_uid_string() const {
+    return _set_uid_string;
+  }
+  std::string get_set_gid_string() const {
+    return _set_gid_string;
+  }
+
+  class ForkWatcher {  // callback interface driven by notify_pre_fork()/notify_post_fork()
+   public:
+    virtual ~ForkWatcher() {}
+    virtual void handle_pre_fork() = 0;   // invoked from notify_pre_fork()
+    virtual void handle_post_fork() = 0;  // invoked from notify_post_fork()
+  };
+
+  void register_fork_watcher(ForkWatcher *w) {
+    std::lock_guard lg(_fork_watchers_lock);  // serialises changes to the watcher list
+    _fork_watchers.push_back(w);  // only the raw pointer is stored; this class declares no removal call
+  }
+
+  void notify_pre_fork();
+  void notify_post_fork();
+
+  /**
+   * update CephContext with a copy of the passed in MonMap mon addrs
+   *
+   * @param mm MonMap to extract and update mon addrs
+   */
+  void set_mon_addrs(const MonMap& mm);
+  void set_mon_addrs(const std::vector<entity_addrvec_t>& in) {
+    using addr_list = std::vector<entity_addrvec_t>;  // a private copy of `in` gets published
+    atomic_store_explicit(&_mon_addrs, std::make_shared<addr_list>(in), std::memory_order_relaxed);
+  }
+  std::shared_ptr<std::vector<entity_addrvec_t>> get_mon_addrs() const {
+    // Atomic snapshot of whichever list pointer is currently stored.
+    return atomic_load_explicit(&_mon_addrs, std::memory_order_relaxed);
+  }
+
+private:
+
+
+  /* Stop and join the Ceph Context's service thread */
+  void join_service_thread();
+
+  uint32_t _module_type;
+
+  int _init_flags;
+
+  uid_t _set_uid; ///< uid to drop privs to
+  gid_t _set_gid; ///< gid to drop privs to
+  std::string _set_uid_string;
+  std::string _set_gid_string;
+
+  int _crypto_inited;
+
+  std::shared_ptr<std::vector<entity_addrvec_t>> _mon_addrs;
+
+  /* libcommon service thread.
+   * SIGHUP wakes this thread, which then reopens logfiles */
+  friend class CephContextServiceThread;
+  CephContextServiceThread *_service_thread;
+
+  using md_config_obs_t = ceph::md_config_obs_impl<ConfigProxy>;
+
+  md_config_obs_t *_log_obs;
+
+  /* The admin socket associated with this context */
+  AdminSocket *_admin_socket;
+
+  /* lock which protects service thread creation, destruction, etc. */
+  ceph::spinlock _service_thread_lock;
+
+  /* The collection of profiling loggers associated with this context */
+  PerfCountersCollection *_perf_counters_collection;
+
+  md_config_obs_t *_perf_counters_conf_obs;
+
+  CephContextHook *_admin_hook;
+
+  ceph::HeartbeatMap *_heartbeat_map;
+
+  ceph::spinlock associated_objs_lock;
+
+  struct associated_objs_cmp {
+    using is_transparent = std::true_type;  // lets lookups pass a key type other than the stored one
+    template<typename T, typename U>
+    bool operator ()(const std::pair<T, std::type_index>& lhs,
+		     const std::pair<U, std::type_index>& rhs) const noexcept {
+      if (lhs.first < rhs.first) return true;  // the name decides first
+      return lhs.first == rhs.first && lhs.second < rhs.second;  // equal names: order by type
+    }
+  };
+
+  std::map<std::pair<std::string, std::type_index>,
+	   ceph::immobile_any<largest_singleton>,
+	   associated_objs_cmp> associated_objs;
+  std::set<std::string> associated_objs_drop_on_fork;
+
+  ceph::spinlock _fork_watchers_lock;
+  std::vector<ForkWatcher*> _fork_watchers;
+
+  // crypto
+  CryptoHandler *_crypto_none;
+  CryptoHandler *_crypto_aes;
+  std::unique_ptr<CryptoRandom> _crypto_random;
+
+  // experimental
+  CephContextObs *_cct_obs;
+  ceph::spinlock _feature_lock;
+  std::set<std::string> _experimental_features;
+
+  ceph::PluginRegistry* _plugin_registry;
+#ifdef CEPH_DEBUG_MUTEX
+  md_config_obs_t *_lockdep_obs;
+#endif
+public:
+  TOPNSPC::crush::CrushLocation crush_location;
+private:
+
+  enum {
+    l_cct_first,
+    l_cct_total_workers,
+    l_cct_unhealthy_workers,
+    l_cct_last
+  };
+  enum {
+    l_mempool_first = 873222,
+    l_mempool_bytes,
+    l_mempool_items,
+    l_mempool_last
+  };
+  PerfCounters *_cct_perf = nullptr;
+  PerfCounters* _mempool_perf = nullptr;
+  std::vector<std::string> _mempool_perf_names, _mempool_perf_descriptions;
+
+  /**
+   * Enable the performance counters.
+   */
+  void _enable_perf_counter();
+
+  /**
+   * Disable the performance counter.
+   */
+  void _disable_perf_counter();
+
+  /**
+   * Refresh perf counter values.
+   */
+  void _refresh_perf_values();
+
+  friend class CephContextObs;
+};
+#ifdef __cplusplus
+}
+#endif
+#endif	// defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+
+#if !(defined(WITH_SEASTAR) && !defined(WITH_ALIEN)) && defined(__cplusplus)
+namespace ceph::common {  // ref-count hooks, presumably for boost::intrusive_ptr<CephContext> - confirm
+inline void intrusive_ptr_add_ref(CephContext* cct)
+{
+  cct->get();  // take one reference via CephContext::get()
+}
+
+inline void intrusive_ptr_release(CephContext* cct)
+{
+  cct->put();  // give one reference back via CephContext::put()
+}
+}
+#endif // !(defined(WITH_SEASTAR) && !defined(WITH_ALIEN)) && defined(__cplusplus)
+#endif
diff --git a/src/common/ceph_crypto.cc b/src/common/ceph_crypto.cc
new file mode 100644
index 000000000..18e655b93
--- /dev/null
+++ b/src/common/ceph_crypto.cc
@@ -0,0 +1,239 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010-2011 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <vector>
+
+#include "common/ceph_context.h"
+#include "common/ceph_mutex.h"
+#include "common/config.h"
+#include "ceph_crypto.h"
+
+#include <openssl/evp.h>
+
+#if OPENSSL_VERSION_NUMBER < 0x10100000L
+#  include <openssl/conf.h>
+#  include <openssl/engine.h>
+#  include <openssl/err.h>
+#endif /* OPENSSL_VERSION_NUMBER < 0x10100000L */
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+
+namespace TOPNSPC::crypto::ssl {
+
+#if OPENSSL_VERSION_NUMBER < 0x10100000L
+static std::atomic_uint32_t crypto_refs;
+
+
+static auto ssl_mutexes = ceph::make_lock_container<ceph::shared_mutex>(
+  static_cast<size_t>(std::max(CRYPTO_num_locks(), 0)),  // a negative count is clamped to zero
+  [](const size_t i) {
+    return ceph::make_shared_mutex(
+      std::string("ssl-mutex-") + std::to_string(i));  // names run ssl-mutex-0, ssl-mutex-1, ...
+  });
+
+static struct {
+  // Thread ids of init() callers. We could use e.g. unordered_set instead at
+  // the price of providing a std::hash<...> specialization; however, we can
+  // live with duplicates quite well and the benefit is not worth the effort.
+  std::vector<CRYPTO_THREADID> tids;
+  ceph::mutex lock = ceph::make_mutex("crypto::ssl::init_records::lock");  // held while touching tids
+} init_records;
+
+static void
+ssl_locking_callback(
+  const int mode,
+  const int mutex_num,
+  [[maybe_unused]] const char *file,
+  [[maybe_unused]] const int line)
+{
+  if (mutex_num < 0 || static_cast<size_t>(mutex_num) >= ssl_mutexes.size()) {
+    ceph_assert_always("openssl passed wrong mutex index" == nullptr);  // comparison is always false
+  }
+  // mode is a bit mask: READ/WRITE picks shared vs exclusive, LOCK/UNLOCK the action.
+  if (mode & CRYPTO_READ) {
+    if (mode & CRYPTO_LOCK) {
+      ssl_mutexes[mutex_num].lock_shared();
+    } else if (mode & CRYPTO_UNLOCK) {
+      ssl_mutexes[mutex_num].unlock_shared();
+    }
+  } else if (mode & CRYPTO_WRITE) {
+    if (mode & CRYPTO_LOCK) {
+      ssl_mutexes[mutex_num].lock();
+    } else if (mode & CRYPTO_UNLOCK) {
+      ssl_mutexes[mutex_num].unlock();
+    }
+  }  // a mode with neither READ nor WRITE set is ignored
+}
+
+static unsigned long
+ssl_get_thread_id(void)
+{
+  static_assert(sizeof(unsigned long) >= sizeof(pthread_t));
+  /* pthread_t may be any data type, so a simple cast to unsigned long
+   * can raise a warning/error, depending on the platform.
+   * Here memcpy is used as an anything-to-anything cast. */
+  unsigned long ret = 0;  // zero-filled first, so bytes beyond sizeof(pthread_t) stay 0
+  pthread_t t = pthread_self();
+  memcpy(&ret, &t, sizeof(pthread_t));
+  return ret;
+}
+#endif /* OPENSSL_VERSION_NUMBER < 0x10100000L */
+
+static void init() {  // does nothing unless built against OpenSSL < 1.1.0
+#if OPENSSL_VERSION_NUMBER < 0x10100000L
+  if (++crypto_refs == 1) {  // only the first caller performs the one-time setup
+    // according to
+    // https://wiki.openssl.org/index.php/Library_Initialization#libcrypto_Initialization
+    OpenSSL_add_all_algorithms();
+    ERR_load_crypto_strings();
+
+    // initialize locking callbacks, needed for thread safety.
+    // http://www.openssl.org/support/faq.html#PROG1
+    CRYPTO_set_locking_callback(&ssl_locking_callback);
+    CRYPTO_set_id_callback(&ssl_get_thread_id);
+
+    OPENSSL_config(nullptr);
+  }
+
+  // we need to record IDs of all threads calling the initialization in
+  // order to *manually* free per-thread memory OpenSSL *automagically*
+  // allocated in ERR_get_state().
+  // XXX: this solution/nasty hack is IMPERFECT. A leak will appear when
+  // a client init()ializes the crypto subsystem with one thread and then
+  // uses it from another one in a way that results in ERR_get_state().
+  // XXX: for a discussion of other approaches please refer to:
+  // https://www.mail-archive.com/openssl-users@openssl.org/msg59070.html
+  {
+    std::lock_guard l(init_records.lock);
+    CRYPTO_THREADID tmp;
+    CRYPTO_THREADID_current(&tmp);
+    init_records.tids.emplace_back(std::move(tmp));  // recorded on every call, so duplicates occur
+  }
+#endif /* OPENSSL_VERSION_NUMBER < 0x10100000L */
+}
+
+static void shutdown() {  // does nothing unless built against OpenSSL < 1.1.0
+#if OPENSSL_VERSION_NUMBER < 0x10100000L
+  if (--crypto_refs != 0) {  // only the call that drops the count to zero tears down
+    return;
+  }
+
+  // drop error queue for each thread that called the init() function to
+  // satisfy valgrind.
+  {
+    std::lock_guard l(init_records.lock);
+    // NOTE(review): tids is not cleared below, so entries persist across init/shutdown cycles.
+    // NOTE: in OpenSSL 1.0.2g the signature is:
+    //    void ERR_remove_thread_state(const CRYPTO_THREADID *tid);
+    // but in 1.1.0j it has been changed to
+    //    void ERR_remove_thread_state(void *);
+    // We rely on the OPENSSL_VERSION_NUMBER check to preserve
+    // const-correctness without failing builds on modern envs.
+    for (const auto& tid : init_records.tids) {
+      ERR_remove_thread_state(&tid);
+    }
+  }
+
+  // Shutdown according to
+  // https://wiki.openssl.org/index.php/Library_Initialization#Cleanup
+  // http://stackoverflow.com/questions/29845527/how-to-properly-uninitialize-openssl
+  //
+  // The call to CONF_modules_free() was introduced after a valgrind run.
+  CRYPTO_set_locking_callback(nullptr);
+  CRYPTO_set_id_callback(nullptr);
+  ENGINE_cleanup();
+  CONF_modules_free();
+  CONF_modules_unload(1);
+  ERR_free_strings();
+  EVP_cleanup();
+  CRYPTO_cleanup_all_ex_data();
+
+  // NOTE: don't clear ssl_mutexes as we should be ready for init-deinit-init
+  // sequence.
+#endif /* OPENSSL_VERSION_NUMBER < 0x10100000L */
+}
+
+void zeroize_for_security(void* const s, const size_t n) {
+  OPENSSL_cleanse(s, n);  // hands the n bytes at s to OpenSSL's cleanse routine
+}
+
+} // namespace TOPNSPC::crypto::ssl
+
+
+namespace TOPNSPC::crypto {
+void init() {
+  ssl::init();  // public entry point; all work happens in ssl::init()
+}
+
+void shutdown([[maybe_unused]] const bool shared) {
+  ssl::shutdown();  // `shared` is accepted but has no effect on what is done here
+}
+
+void zeroize_for_security(void* const s, const size_t n) {
+  ssl::zeroize_for_security(s, n);  // public entry point; forwards both arguments unchanged
+}
+
+ssl::OpenSSLDigest::OpenSSLDigest(const EVP_MD * _type)
+  : mpContext(EVP_MD_CTX_create())  // NOTE(review): the result is not checked for nullptr here
+  , mpType(_type) {
+  this->Restart();  // initialises the fresh context with _type
+}
+
+ssl::OpenSSLDigest::~OpenSSLDigest() {
+  EVP_MD_CTX_destroy(mpContext);  // releases the context created in the constructor
+  if (mpType_FIPS) {  // non-null only after SetFlags() stored an EVP_MD_fetch() result
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+    EVP_MD_free(mpType_FIPS);
+#endif  // OPENSSL_VERSION_NUMBER >= 0x30000000L
+  }
+}
+
+void ssl::OpenSSLDigest::Restart() {
+  // Re-initialise the context for a fresh digest.  The explicitly
+  // fetched MD (mpType_FIPS) wins over the constructor-supplied one
+  // whenever it has been set.
+  const EVP_MD *selected = mpType_FIPS ? mpType_FIPS : mpType;
+  EVP_DigestInit_ex(mpContext, selected, NULL);
+}
+
+void ssl::OpenSSLDigest::SetFlags(int flags) {
+  if (flags == EVP_MD_CTX_FLAG_NON_FIPS_ALLOW && OpenSSL_version_num() >= 0x30000000L && mpType == EVP_md5() && !mpType_FIPS) {
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+    mpType_FIPS = EVP_MD_fetch(NULL, "MD5", "fips=no");  // NOTE(review): the fetch result is not checked
+#endif  // OPENSSL_VERSION_NUMBER >= 0x30000000L (compile-time gate; the runtime gate is in the condition)
+  } else {
+    EVP_MD_CTX_set_flags(mpContext, flags);  // every other case: apply the flags to the context
+  }
+  this->Restart();  // the context is re-initialised on every call
+}
+
+void ssl::OpenSSLDigest::Update(const unsigned char *input, size_t length) {
+  if (length == 0) {
+    return;  // empty chunk: OpenSSL is not called at all
+  }
+  EVP_DigestUpdate(mpContext, input, length);
+}
+
+void ssl::OpenSSLDigest::Final(unsigned char *digest) {
+  unsigned int produced;  // length reported back by OpenSSL; not used afterwards
+  EVP_DigestFinal_ex(mpContext, digest, &produced);
+}
+
+}
+
+#pragma clang diagnostic pop
+#pragma GCC diagnostic pop
diff --git a/src/common/ceph_crypto.h b/src/common/ceph_crypto.h
new file mode 100644
index 000000000..bcdc0044c
--- /dev/null
+++ b/src/common/ceph_crypto.h
@@ -0,0 +1,215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef CEPH_CRYPTO_H
+#define CEPH_CRYPTO_H
+
+#include "acconfig.h"
+#include <stdexcept>
+
+#include "include/common_fwd.h"
+#include "include/buffer.h"
+#include "include/types.h"
+
+#define CEPH_CRYPTO_MD5_DIGESTSIZE 16
+#define CEPH_CRYPTO_HMACSHA1_DIGESTSIZE 20
+#define CEPH_CRYPTO_SHA1_DIGESTSIZE 20
+#define CEPH_CRYPTO_HMACSHA256_DIGESTSIZE 32
+#define CEPH_CRYPTO_SHA256_DIGESTSIZE 32
+#define CEPH_CRYPTO_SHA512_DIGESTSIZE 64
+
+#include <openssl/evp.h>
+#include <openssl/ossl_typ.h>
+#include <openssl/hmac.h>
+
+#include "include/ceph_assert.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+
+extern "C" {
+  const EVP_MD *EVP_md5(void);
+  const EVP_MD *EVP_sha1(void);
+  const EVP_MD *EVP_sha256(void);
+  const EVP_MD *EVP_sha512(void);
+}
+
+namespace TOPNSPC::crypto {
+  void assert_init();
+  void init();
+  void shutdown(bool shared=true);
+
+  void zeroize_for_security(void *s, size_t n);
+
+  class DigestException : public std::runtime_error  // thrown by the HMAC wrappers below on OpenSSL failure
+  {
+    public:
+      DigestException(const char* what_arg) : runtime_error(what_arg)  // what_arg becomes what()
+	{}
+  };
+
+  namespace ssl {
+    class OpenSSLDigest {  // incremental digest wrapper around an OpenSSL EVP_MD_CTX
+      private:
+	EVP_MD_CTX *mpContext;  // NOTE(review): raw pointer, yet copy operations are not deleted - confirm nothing copies
+	const EVP_MD *mpType;  // algorithm passed to the constructor
+        EVP_MD *mpType_FIPS = nullptr;  // alternative algorithm object; see SetFlags()
+      public:
+	OpenSSLDigest (const EVP_MD *_type);
+	~OpenSSLDigest ();
+	void Restart();  // start a new digest computation
+	void SetFlags(int flags);
+	void Update (const unsigned char *input, size_t length);  // feed `length` bytes from `input`
+	void Final (unsigned char *digest);  // write the result into `digest`
+    };
+
+    class MD5 : public OpenSSLDigest {  // OpenSSLDigest preset to EVP_md5()
+      public:
+	static constexpr size_t digest_size = CEPH_CRYPTO_MD5_DIGESTSIZE;  // 16 bytes
+	MD5 () : OpenSSLDigest(EVP_md5()) { }
+    };
+
+    class SHA1 : public OpenSSLDigest {  // OpenSSLDigest preset to EVP_sha1()
+      public:
+        static constexpr size_t digest_size = CEPH_CRYPTO_SHA1_DIGESTSIZE;  // 20 bytes
+        SHA1 () : OpenSSLDigest(EVP_sha1()) { }
+    };
+
+    class SHA256 : public OpenSSLDigest {  // OpenSSLDigest preset to EVP_sha256()
+      public:
+        static constexpr size_t digest_size = CEPH_CRYPTO_SHA256_DIGESTSIZE;  // 32 bytes
+        SHA256 () : OpenSSLDigest(EVP_sha256()) { }
+    };
+
+    class SHA512 : public OpenSSLDigest {  // OpenSSLDigest preset to EVP_sha512()
+      public:
+        static constexpr size_t digest_size = CEPH_CRYPTO_SHA512_DIGESTSIZE;  // 64 bytes
+        SHA512 () : OpenSSLDigest(EVP_sha512()) { }
+    };
+
+
+# if OPENSSL_VERSION_NUMBER < 0x10100000L
+  class HMAC {  // HMAC wrapper for OpenSSL < 1.1.0, holding the HMAC_CTX by value
+  private:
+    HMAC_CTX mContext;
+    const EVP_MD *mpType;  // kept so Restart() can re-initialise with the same digest
+
+  public:
+    HMAC (const EVP_MD *type, const unsigned char *key, size_t length)
+      : mpType(type) {
+      // the strict FIPS zeroization doesn't seem to be necessary here;
+      // it is done just in case.
+      ::TOPNSPC::crypto::zeroize_for_security(&mContext, sizeof(mContext));
+      const auto r = HMAC_Init_ex(&mContext, key, length, mpType, nullptr);
+      if (r != 1) {
+	  throw DigestException("HMAC_Init_ex() failed");
+      }
+    }
+    ~HMAC () {
+      HMAC_CTX_cleanup(&mContext);
+    }
+
+    void Restart () {  // re-initialise without passing a key again (nullptr, 0)
+      const auto r = HMAC_Init_ex(&mContext, nullptr, 0, mpType, nullptr);
+      if (r != 1) {
+	throw DigestException("HMAC_Init_ex() failed");
+      }
+    }
+    void Update (const unsigned char *input, size_t length) {
+      if (length) {  // zero-length input never reaches OpenSSL
+        const auto r = HMAC_Update(&mContext, input, length);
+	if (r != 1) {
+	  throw DigestException("HMAC_Update() failed");
+	}
+      }
+    }
+    void Final (unsigned char *digest) {
+      unsigned int s;  // length reported by OpenSSL; not used afterwards
+      const auto r = HMAC_Final(&mContext, digest, &s);
+      if (r != 1) {
+	throw DigestException("HMAC_Final() failed");
+      }
+    }
+  };
+# else
+  class HMAC {  // HMAC wrapper for OpenSSL >= 1.1.0, owning a heap-allocated HMAC_CTX
+  private:
+    HMAC_CTX *mpContext;  // allocated in the constructor, freed in the destructor
+
+  public:
+    HMAC (const EVP_MD *type, const unsigned char *key, size_t length)
+      : mpContext(HMAC_CTX_new()) {  // throws DigestException on any failure
+      if (mpContext == nullptr) {
+	throw DigestException("HMAC_CTX_new() failed");
+      }
+      if (HMAC_Init_ex(mpContext, key, length, type, nullptr) != 1) {
+	HMAC_CTX_free(mpContext);  // ~HMAC() does not run when the constructor throws
+	throw DigestException("HMAC_Init_ex() failed");
+      }
+    }
+    ~HMAC () {
+      HMAC_CTX_free(mpContext);
+    }
+
+    void Restart () {  // re-initialise with the digest already bound to the context
+      const EVP_MD * const type = HMAC_CTX_get_md(mpContext);
+      if (HMAC_Init_ex(mpContext, nullptr, 0, type, nullptr) != 1) {
+	throw DigestException("HMAC_Init_ex() failed");
+      }
+    }
+    void Update (const unsigned char *input, size_t length) {
+      if (length) {  // zero-length input never reaches OpenSSL
+	if (HMAC_Update(mpContext, input, length) != 1) {
+	  throw DigestException("HMAC_Update() failed");
+	}
+      }
+    }
+    void Final (unsigned char *digest) {  // writes the MAC into digest
+      unsigned int s;  // length reported by OpenSSL; not used afterwards
+      if (HMAC_Final(mpContext, digest, &s) != 1) {
+	throw DigestException("HMAC_Final() failed");
+      }
+    }
+  };
+# endif // OPENSSL_VERSION_NUMBER < 0x10100000L
+
+  struct HMACSHA1 : public HMAC {  // HMAC preset to EVP_sha1()
+    HMACSHA1 (const unsigned char *key, size_t length)
+      : HMAC(EVP_sha1(), key, length) {  // key and length are forwarded unchanged
+    }
+  };
+
+  struct HMACSHA256 : public HMAC {  // HMAC preset to EVP_sha256()
+    HMACSHA256 (const unsigned char *key, size_t length)
+      : HMAC(EVP_sha256(), key, length) {  // key and length are forwarded unchanged
+    }
+  };
+}
+
+
+  using ssl::SHA256;
+  using ssl::MD5;
+  using ssl::SHA1;
+  using ssl::SHA512;
+
+  using ssl::HMACSHA256;
+  using ssl::HMACSHA1;
+
+template<class Digest>
+auto digest(const ceph::buffer::list& bl)
+{
+  Digest hasher;  // fed with every buffer segment of bl, in order
+  for (auto& segment : bl.buffers()) {
+    hasher.Update(reinterpret_cast<const unsigned char *>(segment.c_str()), segment.length());
+  }
+  unsigned char raw[Digest::digest_size];
+  hasher.Final(raw);
+  return sha_digest_t<Digest::digest_size>{raw};
+}
+}
+
+#pragma clang diagnostic pop
+#pragma GCC diagnostic pop
+
+#endif
diff --git a/src/common/ceph_frag.cc b/src/common/ceph_frag.cc
new file mode 100644
index 000000000..444b910c2
--- /dev/null
+++ b/src/common/ceph_frag.cc
@@ -0,0 +1,21 @@
+/*
+ * Ceph 'frag' type
+ */
+#include "include/types.h"
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+	/*
+	 * Three-way compare: the frag values decide first, and only when
+	 * they tie are the bit counts consulted.  Returns -1, 0 or 1.
+	 */
+	const unsigned value_a = ceph_frag_value(a);
+	const unsigned value_b = ceph_frag_value(b);
+	if (value_a != value_b)
+		return value_a < value_b ? -1 : 1;
+	const unsigned bits_a = ceph_frag_bits(a);
+	const unsigned bits_b = ceph_frag_bits(b);
+	if (bits_a != bits_b)
+		return bits_a < bits_b ? -1 : 1;
+	return 0;
+}
diff --git a/src/common/ceph_fs.cc b/src/common/ceph_fs.cc
new file mode 100644
index 000000000..380b401df
--- /dev/null
+++ b/src/common/ceph_fs.cc
@@ -0,0 +1,91 @@
+/*
+ * ceph_fs.cc - Some Ceph functions that are shared between kernel space and
+ * user space.
+ *
+ */
+
+/*
+ * Some non-inline ceph helpers
+ */
+#include "include/types.h"
+
+int ceph_flags_to_mode(int flags)
+{
+	/* CEPH_FILE_MODE_PIN is zero, so -1 (not 0) serves as the error value */
+	int mode = -1;
+
+	if ((flags & CEPH_O_DIRECTORY) == CEPH_O_DIRECTORY)
+		return CEPH_FILE_MODE_PIN;	/* directory: access bits are not examined */
+
+	switch (flags & O_ACCMODE) {	/* NOTE(review): system mask vs CEPH_O_* labels; assumes both encodings agree */
+	case CEPH_O_WRONLY:
+		mode = CEPH_FILE_MODE_WR;
+		break;
+	case CEPH_O_RDONLY:
+		mode = CEPH_FILE_MODE_RD;
+		break;
+	case CEPH_O_RDWR:
+	case O_ACCMODE: /* this is what the VFS does */
+		mode = CEPH_FILE_MODE_RDWR;
+		break;
+	}
+
+	if (flags & CEPH_O_LAZY)
+		mode |= CEPH_FILE_MODE_LAZY;	/* a mode of -1 stays -1: all its bits are already set */
+
+	return mode;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+	/* Capability sets contributed by each file-mode bit. */
+	const int rd_caps = CEPH_CAP_FILE_SHARED |
+			    CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+	const int wr_caps = CEPH_CAP_FILE_EXCL |
+			    CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			    CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			    CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	const int lazy_caps = CEPH_CAP_FILE_LAZYIO;
+
+	/* PIN is always included; add the set for every mode bit present. */
+	return CEPH_CAP_PIN |
+	       ((mode & CEPH_FILE_MODE_RD) ? rd_caps : 0) |
+	       ((mode & CEPH_FILE_MODE_WR) ? wr_caps : 0) |
+	       ((mode & CEPH_FILE_MODE_LAZY) ? lazy_caps : 0);
+}
+
+int ceph_flags_sys2wire(int flags)
+{
+       int wire_flags = 0;  /* result, assembled from CEPH_O_* bits */
+
+       switch (flags & O_ACCMODE) {  /* translate the access mode first */
+       case O_RDONLY:
+               wire_flags |= CEPH_O_RDONLY;
+               break;
+       case O_WRONLY:
+               wire_flags |= CEPH_O_WRONLY;
+               break;
+       case O_RDWR:
+               wire_flags |= CEPH_O_RDWR;
+               break;
+       }
+       flags &= ~O_ACCMODE;  /* access mode handled; only flag bits remain */
+
+#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
+
+       ceph_sys2wire(O_CREAT);
+       ceph_sys2wire(O_EXCL);
+       ceph_sys2wire(O_TRUNC);
+
+       #ifndef _WIN32
+       ceph_sys2wire(O_DIRECTORY);
+       ceph_sys2wire(O_NOFOLLOW);
+       // In some cases, FILE_FLAG_BACKUP_SEMANTICS may be used instead
+       // of O_DIRECTORY. We may need some workarounds in order to handle
+       // the fact that those flags are not available on Windows.
+       #endif
+
+#undef ceph_sys2wire
+
+       return wire_flags;  /* bits still set in flags here are not carried over */
+}
diff --git a/src/common/ceph_hash.cc b/src/common/ceph_hash.cc
new file mode 100644
index 000000000..061926d27
--- /dev/null
+++ b/src/common/ceph_hash.cc
@@ -0,0 +1,128 @@
+
+#include "include/types.h"
+
+/*
+ * Robert Jenkins' hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.  mix() scrambles its three arguments in place.
+ */
+#define mix(a, b, c)						\
+	do {							\
+		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+	} while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+	const unsigned char *k = (const unsigned char *)str;	/* bytes are read as unsigned */
+	__u32 a, b, c;  /* the internal state */
+	__u32 len;      /* how many key bytes still need mixing */
+
+	/* Set up the internal state */
+	len = length;
+	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+	b = a;
+	c = 0;               /* variable initialization of internal state */
+
+	/* handle most of the key: 12 bytes per round, assembled little-endian */
+	while (len >= 12) {
+		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+			 ((__u32)k[3] << 24));
+		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+			 ((__u32)k[7] << 24));
+		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+			 ((__u32)k[11] << 24));
+		mix(a, b, c);
+		k = k + 12;
+		len = len - 12;
+	}
+
+	/* handle the last (at most 11) bytes; the total length is added to c */
+	c = c + length;
+	switch (len) {            /* all the case statements fall through */
+	case 11:
+		c = c + ((__u32)k[10] << 24);
+	case 10:
+		c = c + ((__u32)k[9] << 16);
+	case 9:
+		c = c + ((__u32)k[8] << 8);
+		/* the first byte of c is reserved for the length */
+	case 8:
+		b = b + ((__u32)k[7] << 24);
+	case 7:
+		b = b + ((__u32)k[6] << 16);
+	case 6:
+		b = b + ((__u32)k[5] << 8);
+	case 5:
+		b = b + k[4];
+	case 4:
+		a = a + ((__u32)k[3] << 24);
+	case 3:
+		a = a + ((__u32)k[2] << 16);
+	case 2:
+		a = a + ((__u32)k[1] << 8);
+	case 1:
+		a = a + k[0];
+		/* case 0: nothing left to add */
+	}
+	mix(a, b, c);	/* final round, run even when no tail bytes were left */
+
+	return c;	/* the hash is the third state word */
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+	/* Per byte: add (byte << 4) + (byte >> 4) to the running value, then scale by 11. */
+	unsigned acc = 0;
+	for (const char *p = str, *const end = str + length; p != end; ++p) {
+		const unsigned char byte = *p;
+		acc = (acc + (byte << 4) + (byte >> 4)) * 11;
+	}
+	return acc;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+	/* Route to the implementation selected by the hash id.  An id */
+	/* that is not recognised produces -1 converted to unsigned. */
+	if (type == CEPH_STR_HASH_LINUX)
+		return ceph_str_hash_linux(s, len);
+	if (type == CEPH_STR_HASH_RJENKINS)
+		return ceph_str_hash_rjenkins(s, len);
+
+	return -1;
+}
+
+const char *ceph_str_hash_name(int type)
+{
+	/* Label for a hash id; ids not handled here are */
+	/* reported as "unknown". */
+	if (type == CEPH_STR_HASH_LINUX)
+		return "linux";
+	if (type == CEPH_STR_HASH_RJENKINS)
+		return "rjenkins";
+
+	return "unknown";
+}
+
+bool ceph_str_hash_valid(int type)
+{
+        /*
+         * Only the two hash ids handled in this file count as valid;
+         * every other value is rejected.
+         */
+        const bool is_linux = (type == CEPH_STR_HASH_LINUX);
+        const bool is_rjenkins = (type == CEPH_STR_HASH_RJENKINS);
+        return is_linux || is_rjenkins;
+}
diff --git a/src/common/ceph_json.cc b/src/common/ceph_json.cc
new file mode 100644
index 000000000..b63d73e5a
--- /dev/null
+++ b/src/common/ceph_json.cc
@@ -0,0 +1,995 @@
+#include "common/ceph_json.h"
+#include "include/utime.h"
+
+// for testing DELETE ME
+#include <fstream>
+#include <include/types.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "json_spirit/json_spirit_writer_template.h"
+
+using namespace json_spirit;
+
+using std::ifstream;
+using std::pair;
+using std::ostream;
+using std::string;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+#define dout_subsys ceph_subsys_rgw
+
+/* Shared fallback instance handed out by JSONFormattable::operator[] when
+ * the requested key or index does not exist. */
+static JSONFormattable default_formattable;
+
+/* Emit v under `name`: quoted values go through the string encoder,
+ * unquoted ones are written verbatim without quotes. */
+void encode_json(const char *name, const JSONObj::data_val& v, Formatter *f)
+{
+  if (!v.quoted) {
+    f->dump_format_unquoted(name, "%s", v.str.c_str());
+    return;
+  }
+  encode_json(name, v.str, f);
+}
+
+// Nothing to set up: both iterators are default-constructed.
+JSONObjIter::JSONObjIter() = default;
+
+// The iterator owns nothing, so there is nothing to release.
+JSONObjIter::~JSONObjIter() = default;
+
+// Record the range [_cur, _last) that this iterator walks.
+void JSONObjIter::set(const JSONObjIter::map_iter_t &_cur, const JSONObjIter::map_iter_t &_last)
+{
+  this->cur = _cur;
+  this->last = _last;
+}
+
+void JSONObjIter::operator++()
+{
+  // Advancing is a no-op once the end of the range has been reached.
+  if (cur == last) {
+    return;
+  }
+  ++cur;
+}
+
+JSONObj *JSONObjIter::operator*()
+{
+  // No end-of-range check is performed here.
+  JSONObj *current = (*cur).second;
+  return current;
+}
+
+// does not work, FIXME
+// Writes "<name>: <value>" for the object to the stream.
+ostream& operator<<(ostream &out, const JSONObj &obj) {
+   return out << obj.name << ": " << obj.val;
+}
+
+JSONObj::~JSONObj()
+{
+  // Every child pointer stored in `children` is deleted with its parent.
+  for (auto& entry : children) {
+    delete entry.second;
+  }
+}
+
+
+// Register obj as a child under key `el`; duplicate keys are kept.
+void JSONObj::add_child(string el, JSONObj *obj)
+{
+  children.emplace(el, obj);
+}
+
+// Copy the attribute called `name` into attr. Returns false, leaving attr
+// untouched, when no such attribute exists.
+bool JSONObj::get_attr(string name, data_val& attr)
+{
+  auto found = attr_map.find(name);
+  const bool present = (found != attr_map.end());
+  if (present) {
+    attr = found->second;
+  }
+  return present;
+}
+
+// Return an iterator over exactly the children whose key equals `name`.
+// When there is no such child the returned iterator is left unset, as
+// before.
+JSONObjIter JSONObj::find(const string& name)
+{
+  JSONObjIter iter;
+  // multimap::find() may return any of several equivalent elements, so
+  // pairing it with upper_bound() is not guaranteed to cover all matches;
+  // equal_range() yields the full [first, last) span of equal keys.
+  auto range = children.equal_range(name);
+  if (range.first != range.second) {
+    iter.set(range.first, range.second);
+  }
+  return iter;
+}
+
+// Return an iterator spanning every child of this object.
+JSONObjIter JSONObj::find_first()
+{
+  JSONObjIter all;
+  all.set(children.begin(), children.end());
+  return all;
+}
+
+// Return an iterator starting at a child keyed `name` and running to the
+// end of ALL children (not only those with that key). If nothing matches,
+// the range is empty.
+JSONObjIter JSONObj::find_first(const string& name)
+{
+  JSONObjIter iter;
+  iter.set(children.find(name), children.end());
+  return iter;
+}
+
+// Return a child keyed `name`, or NULL when there is none.
+JSONObj *JSONObj::find_obj(const string& name)
+{
+  JSONObjIter matches = find(name);
+  if (!matches.end()) {
+    return *matches;
+  }
+  return NULL;
+}
+
+// Copy the value of the child keyed `key` into *dest. Returns false, leaving
+// *dest untouched, when there is no such child.
+bool JSONObj::get_data(const string& key, data_val *dest)
+{
+  JSONObj *child = find_obj(key);
+  if (child) {
+    *dest = child->get_data_val();
+    return true;
+  }
+  return false;
+}
+
+/* Builds the child tree for a JSON Spirit value v:
+ *  - JSON object: one child JSONObj per member, keyed by the member name;
+ *  - JSON array:  one child JSONObj per element, keyed by the child's own
+ *                 name (initialized from an empty string);
+ *  - any other value type: nothing is added.
+ * Each child is heap-allocated here and handed to add_child().
+ */
+void JSONObj::handle_value(Value v)
+{
+  if (v.type() == obj_type) {
+    // Bind by reference: init() takes its own copy of every member value,
+    // so copying the whole container here would only duplicate work.
+    const Object& members = v.get_obj();
+    for (const Pair& member : members) {
+      JSONObj *child = new JSONObj;
+      child->init(this, member.value_, member.name_);
+      add_child(member.name_, child);
+    }
+  } else if (v.type() == array_type) {
+    const Array& elements = v.get_array();
+    for (const Value& element : elements) {
+      JSONObj *child = new JSONObj;
+      child->init(this, element, string());
+      add_child(child->get_name(), child);
+    }
+  }
+}
+
+// Initialize this node from JSON value v: record parent and name, build the
+// children, then cache the node's own textual value.
+void JSONObj::init(JSONObj *p, Value v, string n)
+{
+  name = n;
+  parent = p;
+  data = v;
+
+  handle_value(v);
+
+  // Strings keep their text and are flagged as quoted; every other value is
+  // stored as its serialized JSON text, unquoted.
+  const bool is_string = (v.type() == str_type);
+  if (is_string) {
+    val.set(v.get_str(), true);
+  } else {
+    val.set(json_spirit::write_string(v), false);
+  }
+  attr_map.emplace(name, val);
+}
+
+// Parent node as passed to init(); NULL for a node constructed without one.
+JSONObj *JSONObj::get_parent()
+{
+  return parent;
+}
+
+// True when the underlying JSON value is an object.
+bool JSONObj::is_object()
+{
+  return (data.type() == obj_type);
+}
+
+// True when the underlying JSON value is an array.
+bool JSONObj::is_array()
+{
+  return (data.type() == array_type);
+}
+
+// Return each element of a JSON array serialized back to JSON text.
+// A non-array node yields an empty vector.
+vector<string> JSONObj::get_array_elements()
+{
+  vector<string> elements;
+  if (data.type() != array_type) {
+    return elements;
+  }
+
+  const Array& items = data.get_array();
+  for (const Value& item : items) {
+    elements.push_back(write(item, raw_utf8));
+  }
+  return elements;
+}
+
+// Start with an empty buffer and the success flag set.
+JSONParser::JSONParser()
+  : buf_len(0),
+    success(true)
+{
+}
+
+// No parser-specific cleanup; the JSONObj base destructor frees children.
+JSONParser::~JSONParser() = default;
+
+
+
+// Append a chunk of input to the internal buffer; nothing is parsed yet.
+void JSONParser::handle_data(const char *s, int len)
+{
+  // check for problems with null termination FIXME
+  json_buffer.append(s, len);
+  buf_len = buf_len + len;
+}
+
+// parse a supplied JSON fragment
+//
+// Parses the first `len` bytes of buf_. Returns true on success and false
+// (with the failure flag set) when buf_ is null, len is negative, the text
+// does not parse, or a non-string scalar does not re-serialize to exactly
+// `len` characters.
+bool JSONParser::parse(const char *buf_, int len)
+{
+  // A negative length would be converted to a huge size_t by the
+  // std::string constructor below, so reject it along with a null buffer.
+  if (!buf_ || len < 0) {
+    set_failure();
+    return false;
+  }
+
+  string json_string(buf_, len);
+  success = read(json_string, data);
+  if (!success) {
+    set_failure();
+    return success;
+  }
+
+  handle_value(data);
+  if (data.type() == obj_type || data.type() == array_type) {
+    return success;
+  }
+
+  // Top-level scalar: cache its textual value on the parser node itself.
+  if (data.type() == str_type) {
+    val.set(data.get_str(), true);
+    return success;
+  }
+
+  const std::string& s = json_spirit::write_string(data);
+  if (s.size() == (uint64_t)len) { /* Check if entire string is read */
+    val.set(s, false);
+  } else {
+    set_failure();
+  }
+  return success;
+}
+
+// parse the internal json_buffer up to len
+bool JSONParser::parse(int len)
+{
+  string prefix = json_buffer.substr(0, len);
+  success = read(prefix, data);
+  if (!success) {
+    set_failure();
+    return success;
+  }
+
+  handle_value(data);
+  return success;
+}
+
+// parse the complete internal json_buffer
+bool JSONParser::parse()
+{
+  success = read(json_buffer, data);
+  if (!success) {
+    set_failure();
+    return success;
+  }
+
+  handle_value(data);
+  return success;
+}
+
+// parse a supplied ifstream, for testing. DELETE ME
+// Opens file_name and parses its content as JSON.
+bool JSONParser::parse(const char *file_name)
+{
+  ifstream is(file_name);
+  success = read(is, data);
+  if (!success) {
+    set_failure();
+    return success;
+  }
+
+  handle_value(data);
+  return success;
+}
+
+
+/*
+ * Decode obj's data string as a base-10 long.
+ *
+ * Trailing whitespace is accepted. JSONDecoder::err is thrown when no
+ * digits could be consumed, when the value is out of range, or when any
+ * non-whitespace character follows the number.
+ */
+void decode_json_obj(long& val, JSONObj *obj)
+{
+  string s = obj->get_data();
+  const char *start = s.c_str();
+  char *p;
+
+  errno = 0;
+  val = strtol(start, &p, 10);
+
+  /* Check for various possible errors */
+
+  if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) ||
+      (errno != 0 && val == 0)) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  if (p == start) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  while (*p != '\0') {
+    /* isspace() is undefined for negative char values, hence the cast */
+    if (!isspace(static_cast<unsigned char>(*p))) {
+      throw JSONDecoder::err("failed to parse number");
+    }
+    p++;
+  }
+}
+
+/*
+ * Decode obj's data string as a base-10 unsigned long.
+ *
+ * Trailing whitespace is accepted. JSONDecoder::err is thrown when no
+ * digits could be consumed, when the value is out of range, or when any
+ * non-whitespace character follows the number.
+ *
+ * NOTE(review): strtoul() itself accepts a leading '-' and negates the
+ * result; that input is not rejected here.
+ */
+void decode_json_obj(unsigned long& val, JSONObj *obj)
+{
+  string s = obj->get_data();
+  const char *start = s.c_str();
+  char *p;
+
+  errno = 0;
+  val = strtoul(start, &p, 10);
+
+  /* Check for various possible errors */
+
+  if ((errno == ERANGE && val == ULONG_MAX) ||
+      (errno != 0 && val == 0)) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  if (p == start) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  while (*p != '\0') {
+    /* isspace() is undefined for negative char values, hence the cast */
+    if (!isspace(static_cast<unsigned char>(*p))) {
+      throw JSONDecoder::err("failed to parse number");
+    }
+    p++;
+  }
+}
+
+/*
+ * Decode obj's data string as a base-10 long long.
+ *
+ * Trailing whitespace is accepted. JSONDecoder::err is thrown when no
+ * digits could be consumed, when the value is out of range, or when any
+ * non-whitespace character follows the number.
+ */
+void decode_json_obj(long long& val, JSONObj *obj)
+{
+  string s = obj->get_data();
+  const char *start = s.c_str();
+  char *p;
+
+  errno = 0;
+  val = strtoll(start, &p, 10);
+
+  /* Check for various possible errors */
+
+  if ((errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) ||
+      (errno != 0 && val == 0)) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  if (p == start) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  while (*p != '\0') {
+    /* isspace() is undefined for negative char values, hence the cast */
+    if (!isspace(static_cast<unsigned char>(*p))) {
+      throw JSONDecoder::err("failed to parse number");
+    }
+    p++;
+  }
+}
+
+/*
+ * Decode obj's data string as a base-10 unsigned long long.
+ *
+ * Trailing whitespace is accepted. JSONDecoder::err is thrown when no
+ * digits could be consumed, when the value is out of range, or when any
+ * non-whitespace character follows the number.
+ *
+ * NOTE(review): strtoull() itself accepts a leading '-' and negates the
+ * result; that input is not rejected here.
+ */
+void decode_json_obj(unsigned long long& val, JSONObj *obj)
+{
+  string s = obj->get_data();
+  const char *start = s.c_str();
+  char *p;
+
+  errno = 0;
+  val = strtoull(start, &p, 10);
+
+  /* Check for various possible errors */
+
+  if ((errno == ERANGE && val == ULLONG_MAX) ||
+      (errno != 0 && val == 0)) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  if (p == start) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  while (*p != '\0') {
+    /* isspace() is undefined for negative char values, hence the cast */
+    if (!isspace(static_cast<unsigned char>(*p))) {
+      throw JSONDecoder::err("failed to parse number");
+    }
+    p++;
+  }
+}
+
+// Decode an int: parse as long, then narrow, range-checking on platforms
+// where long is wider than int.
+void decode_json_obj(int& val, JSONObj *obj)
+{
+  long wide;
+  decode_json_obj(wide, obj);
+#if LONG_MAX > INT_MAX
+  if (wide < INT_MIN || wide > INT_MAX) {
+    throw JSONDecoder::err("integer out of range");
+  }
+#endif
+
+  val = static_cast<int>(wide);
+}
+
+// Decode an unsigned: parse as unsigned long, then narrow, range-checking on
+// platforms where unsigned long is wider than unsigned.
+void decode_json_obj(unsigned& val, JSONObj *obj)
+{
+  unsigned long wide;
+  decode_json_obj(wide, obj);
+#if ULONG_MAX > UINT_MAX
+  if (wide > UINT_MAX) {
+    throw JSONDecoder::err("unsigned integer out of range");
+  }
+#endif
+
+  val = static_cast<unsigned>(wide);
+}
+
+// Decode a bool: "true"/"false" (case-insensitive) are taken literally;
+// anything else is decoded as an int, with nonzero meaning true.
+void decode_json_obj(bool& val, JSONObj *obj)
+{
+  const string text = obj->get_data();
+  const char *raw = text.c_str();
+
+  if (strcasecmp(raw, "true") == 0) {
+    val = true;
+  } else if (strcasecmp(raw, "false") == 0) {
+    val = false;
+  } else {
+    int as_int;
+    decode_json_obj(as_int, obj);
+    val = (as_int != 0);
+  }
+}
+
+// Decode a bufferlist from the base64 text held by obj.
+// A base64 decoding failure is rethrown as JSONDecoder::err.
+void decode_json_obj(bufferlist& val, JSONObj *obj)
+{
+  const string encoded = obj->get_data();
+
+  bufferlist raw;
+  raw.append(encoded.c_str(), encoded.size());
+  try {
+    val.decode_base64(raw);
+  } catch (ceph::buffer::error&) {
+   throw JSONDecoder::err("failed to decode base64");
+  }
+}
+
+// Decode a utime_t from the date string held by obj; throws
+// JSONDecoder::err when utime_t::parse_date() reports failure.
+void decode_json_obj(utime_t& val, JSONObj *obj)
+{
+  string s = obj->get_data();
+  uint64_t epoch;
+  uint64_t nsec;
+  if (utime_t::parse_date(s, &epoch, &nsec) != 0) {
+    throw JSONDecoder::err("failed to decode utime_t");
+  }
+  val = utime_t(epoch, nsec);
+}
+
+// Decode a ceph::real_time from the date string held by obj; throws
+// JSONDecoder::err when utime_t::parse_date() reports failure.
+void decode_json_obj(ceph::real_time& val, JSONObj *obj)
+{
+  const std::string& s = obj->get_data();
+  uint64_t epoch;
+  uint64_t nsec;
+  if (utime_t::parse_date(s, &epoch, &nsec) != 0) {
+    throw JSONDecoder::err("failed to decode real_time");
+  }
+
+  using namespace std::chrono;
+  val = real_time{seconds(epoch) + nanoseconds(nsec)};
+}
+
+// Decode a ceph::coarse_real_time from the date string held by obj; throws
+// JSONDecoder::err when utime_t::parse_date() reports failure.
+void decode_json_obj(ceph::coarse_real_time& val, JSONObj *obj)
+{
+  const std::string& s = obj->get_data();
+  uint64_t epoch;
+  uint64_t nsec;
+  if (utime_t::parse_date(s, &epoch, &nsec) != 0) {
+    throw JSONDecoder::err("failed to decode coarse_real_time");
+  }
+
+  using namespace std::chrono;
+  val = coarse_real_time{seconds(epoch) + nanoseconds(nsec)};
+}
+
+// Decode a ceph_dir_layout. All four fields are mandatory and are read as
+// unsigned before being stored into the layout members.
+void decode_json_obj(ceph_dir_layout& i, JSONObj *obj){
+
+    auto read_field = [obj](const char *key) {
+      unsigned tmp;
+      JSONDecoder::decode_json(key, tmp, obj, true);
+      return tmp;
+    };
+
+    i.dl_dir_hash = read_field("dir_hash");
+    i.dl_unused1 = read_field("unused1");
+    i.dl_unused2 = read_field("unused2");
+    i.dl_unused3 = read_field("unused3");
+}
+
+/*
+ * encode_json overloads for strings, bool and the built-in integer types.
+ * Each forwards `val` under `name` to the matching Formatter dump_* call.
+ */
+
+// string_view -> dump_string
+void encode_json(const char *name, std::string_view val, Formatter *f)
+{
+  f->dump_string(name, val);
+}
+
+// std::string -> dump_string
+void encode_json(const char *name, const string& val, Formatter *f)
+{
+  f->dump_string(name, val);
+}
+
+// C string -> dump_string
+void encode_json(const char *name, const char *val, Formatter *f)
+{
+  f->dump_string(name, val);
+}
+
+// bool -> dump_bool
+void encode_json(const char *name, bool val, Formatter *f)
+{
+  f->dump_bool(name, val);
+}
+
+// signed integers -> dump_int
+void encode_json(const char *name, int val, Formatter *f)
+{
+  f->dump_int(name, val);
+}
+
+void encode_json(const char *name, long val, Formatter *f)
+{
+  f->dump_int(name, val);
+}
+
+// unsigned integers -> dump_unsigned
+void encode_json(const char *name, unsigned val, Formatter *f)
+{
+  f->dump_unsigned(name, val);
+}
+
+void encode_json(const char *name, unsigned long val, Formatter *f)
+{
+  f->dump_unsigned(name, val);
+}
+
+void encode_json(const char *name, unsigned long long val, Formatter *f)
+{
+  f->dump_unsigned(name, val);
+}
+
+// long long -> dump_int
+void encode_json(const char *name, long long val, Formatter *f)
+{
+  f->dump_int(name, val);
+}
+
+// Write a utime_t via utime_t::gmtime() into the formatter's stream for `name`.
+void encode_json(const char *name, const utime_t& val, Formatter *f)
+{
+  val.gmtime(f->dump_stream(name));
+}
+
+// real_time is converted to utime_t and encoded by the overload above.
+void encode_json(const char *name, const ceph::real_time& val, Formatter *f)
+{
+  encode_json(name, utime_t{val}, f);
+}
+
+// coarse_real_time is converted to utime_t and encoded the same way.
+void encode_json(const char *name, const ceph::coarse_real_time& val, Formatter *f)
+{
+  encode_json(name, utime_t{val}, f);
+}
+
+// Encode a bufferlist as a base64 string under `name`.
+void encode_json(const char *name, const bufferlist& bl, Formatter *f)
+{
+  /* encode_base64 is invoked on a mutable copy, as bl is const */
+  bufferlist copy = bl;
+  bufferlist encoded;
+  copy.encode_base64(encoded);
+
+  string text(encoded.c_str(), encoded.length());
+  encode_json(name, text, f);
+}
+
+
+
+/* JSONFormattable */
+
+// Look up an object field by name; a miss yields the shared default instance.
+const JSONFormattable& JSONFormattable::operator[](const string& name) const
+{
+  auto found = obj.find(name);
+  if (found != obj.end()) {
+    return found->second;
+  }
+  return default_formattable;
+}
+
+// Look up an array element; an out-of-range index yields the shared default
+// instance.
+const JSONFormattable& JSONFormattable::operator[](size_t index) const
+{
+  if (index < arr.size()) {
+    return arr[index];
+  }
+  return default_formattable;
+}
+
+// Mutable lookup of an object field by name. A miss returns the shared
+// default instance by non-const reference, so writes through the result
+// would modify that shared instance.
+JSONFormattable& JSONFormattable::operator[](const string& name)
+{
+  auto found = obj.find(name);
+  if (found != obj.end()) {
+    return found->second;
+  }
+  return default_formattable;
+}
+
+// Mutable lookup of an array element. An out-of-range index returns the
+// shared default instance by non-const reference.
+JSONFormattable& JSONFormattable::operator[](size_t index)
+{
+  if (index < arr.size()) {
+    return arr[index];
+  }
+  return default_formattable;
+}
+
+// True when this node has an object field called `name`.
+bool JSONFormattable::exists(const string& name) const
+{
+  return obj.find(name) != obj.end();
+}
+
+// True when `index` addresses an existing array element.
+bool JSONFormattable::exists(size_t index) const
+{
+  return arr.size() > index;
+}
+
+// Copy the value of field `name` into *val. Returns false, leaving *val
+// untouched, when the field does not exist.
+bool JSONFormattable::find(const string& name, string *val) const
+{
+  auto found = obj.find(name);
+  if (found != obj.end()) {
+    *val = found->second.val();
+    return true;
+  }
+  return false;
+}
+
+// Numeric views of the stored value string, converted with atoi/atol/atoll.
+// These C conversions report no errors; no validation is done here.
+int JSONFormattable::val_int() const {
+  return atoi(value.str.c_str());
+}
+
+long JSONFormattable::val_long() const {
+  return atol(value.str.c_str());
+}
+
+long long JSONFormattable::val_long_long() const {
+  return atoll(value.str.c_str());
+}
+
+// True when the stored value is "true", "on", "yes" or "1", compared
+// case-insensitively; false for anything else.
+bool JSONFormattable::val_bool() const {
+  if (boost::iequals(value.str, "true")) {
+    return true;
+  }
+  if (boost::iequals(value.str, "on")) {
+    return true;
+  }
+  if (boost::iequals(value.str, "yes")) {
+    return true;
+  }
+  return boost::iequals(value.str, "1");
+}
+
+// def(): return this node's value, or def_val when the node is FMT_NONE.
+string JSONFormattable::def(const string& def_val) const {
+  if (type != FMT_NONE) {
+    return val();
+  }
+  return def_val;
+}
+
+int JSONFormattable::def(int def_val) const {
+  if (type != FMT_NONE) {
+    return val_int();
+  }
+  return def_val;
+}
+
+bool JSONFormattable::def(bool def_val) const {
+  if (type != FMT_NONE) {
+    return val_bool();
+  }
+  return def_val;
+}
+
+// Typed field getters: look up `name` with operator[] and apply def(), so
+// def_val is returned when the selected node is FMT_NONE.
+string JSONFormattable::get(const string& name, const string& def_val) const
+{
+  return (*this)[name].def(def_val);
+}
+
+int JSONFormattable::get_int(const string& name, int def_val) const
+{
+  return (*this)[name].def(def_val);
+}
+
+bool JSONFormattable::get_bool(const string& name, bool def_val) const
+{
+  return (*this)[name].def(def_val);
+}
+
+/* One step of a parsed path: either an object field name or an array
+ * position ("[N]"), the latter optionally marked as an append ("[]"). */
+struct field_entity {
+  bool is_obj{false}; /* either obj field or array entity */
+  string name; /* if obj */
+  int index{0}; /* if array */
+  bool append{false};
+
+  field_entity() = default;
+  explicit field_entity(const string& n)
+    : is_obj(true), name(n) {}
+  explicit field_entity(int i)
+    : is_obj(false), index(i) {}
+};
+
+/*
+ * Split one path token such as "name", "name[3]", "[2][]" into entities
+ * appended to *result: a field entity for a leading name and one array
+ * entity per "[...]" group (an empty group means append).
+ *
+ * Returns 0 on success, -EINVAL when a '[' has no matching ']' or when
+ * text follows the last ']'.
+ */
+static int parse_entity(const string& s, vector<field_entity> *result)
+{
+  size_t pos = 0;
+
+  while (pos < s.size()) {
+    const size_t lbracket = s.find('[', pos);
+    if (lbracket == string::npos) {
+      /* no brackets left: only valid if none were seen at all */
+      if (pos != 0) {
+        return -EINVAL;
+      }
+      result->push_back(field_entity(s));
+      return 0;
+    }
+
+    if (lbracket > pos) {
+      result->push_back(field_entity(s.substr(pos, lbracket - pos)));
+    }
+
+    const size_t rbracket = s.find(']', lbracket + 1);
+    if (rbracket == string::npos) {
+      return -EINVAL;
+    }
+
+    const string index_str = s.substr(lbracket + 1, rbracket - lbracket - 1);
+    pos = rbracket + 1;
+
+    if (index_str.empty()) {
+      field_entity appender;
+      appender.append = true;
+      result->push_back(appender);
+    } else {
+      result->push_back(field_entity(atoi(index_str.c_str())));
+    }
+  }
+  return 0;
+}
+
+// True when val converts to a double via boost::lexical_cast.
+static bool is_numeric(const string& val)
+{
+  bool numeric = true;
+  try {
+    boost::lexical_cast<double>(val);
+  } catch (const boost::bad_lexical_cast&) {
+    numeric = false;
+  }
+  return numeric;
+}
+
+/*
+ * Set the node addressed by the path `name` to `val`.
+ *
+ * The path is split on '.' (with '\\' as escape and '"' as quote character),
+ * and each token is further split by parse_entity() into field / array
+ * steps. Missing nodes along the path are created. If `val` parses as JSON
+ * the target node is decoded from it; otherwise it becomes a plain value,
+ * quoted unless is_numeric(val).
+ *
+ * Returns 0 on success, or a negative errno (-EINVAL) on a malformed path
+ * or when a step does not match the type of the node it is applied to.
+ */
+int JSONFormattable::set(const string& name, const string& val)
+{
+  boost::escaped_list_separator<char> els('\\', '.', '"');
+  boost::tokenizer<boost::escaped_list_separator<char> > tok(name, els);
+
+  JSONFormattable *f = this;
+
+  JSONParser jp;
+
+  bool is_valid_json = jp.parse(val.c_str(), val.size());
+
+  for (const auto& i : tok) {
+    vector<field_entity> v;
+    int ret = parse_entity(i, &v);
+    if (ret < 0) {
+      return ret;
+    }
+    for (const auto& vi : v) {
+      /* an untyped node takes the type implied by the step applied to it */
+      if (f->type == FMT_NONE) {
+        if (vi.is_obj) {
+          f->type = FMT_OBJ;
+        } else {
+          f->type = FMT_ARRAY;
+        }
+      }
+
+      if (f->type == FMT_OBJ) {
+        if (!vi.is_obj) {
+          return -EINVAL;
+        }
+        /* operator[] on the map creates the field if it is missing */
+        f = &f->obj[vi.name];
+      } else if (f->type == FMT_ARRAY) {
+        if (vi.is_obj) {
+          return -EINVAL;
+        }
+        int index = vi.index;
+        if (vi.append) {
+          index = f->arr.size();
+        } else if (index < 0) {
+          /* negative indices count back from the end of the array */
+          index = f->arr.size() + index;
+          if (index < 0) {
+            return -EINVAL; /* out of bounds */
+          }
+        }
+        /* grow the array so that `index` becomes addressable */
+        if ((size_t)index >= f->arr.size()) {
+          f->arr.resize(index + 1);
+        }
+        f = &f->arr[index];
+      }
+      /* NOTE(review): a FMT_VALUE node matches neither branch, so the step
+       * is skipped and f stays where it is - confirm this is intended */
+    }
+  }
+
+  if (is_valid_json) {
+    f->decode_json(&jp);
+  } else {
+    f->type = FMT_VALUE;
+    f->value.set(val, !is_numeric(val));
+  }
+
+  return 0;
+}
+
+/*
+ * Remove the node addressed by the path `name`.
+ *
+ * The path syntax is the same as for set(). A path that does not resolve
+ * (missing field, index outside the array) is not an error: nothing is
+ * removed and 0 is returned. An empty path resets this node entirely.
+ *
+ * Returns 0 on success, or a negative errno (-EINVAL) on a malformed path
+ * or when a step does not match the type of the node it is applied to.
+ */
+int JSONFormattable::erase(const string& name)
+{
+  boost::escaped_list_separator<char> els('\\', '.', '"');
+  boost::tokenizer<boost::escaped_list_separator<char> > tok(name, els);
+
+  JSONFormattable *f = this;
+  JSONFormattable *parent = nullptr;
+  field_entity last_entity;
+
+  for (auto& i : tok) {
+    vector<field_entity> v;
+    int ret = parse_entity(i, &v);
+    if (ret < 0) {
+      return ret;
+    }
+    for (const auto& vi : v) {
+      /* note: a FMT_NONE or FMT_VALUE node met on the way is retyped to
+       * the container type implied by the step, even though we only erase */
+      if (f->type == FMT_NONE ||
+          f->type == FMT_VALUE) {
+        if (vi.is_obj) {
+          f->type = FMT_OBJ;
+        } else {
+          f->type = FMT_ARRAY;
+        }
+      }
+
+      /* remember the container of the node this step descends into */
+      parent = f;
+
+      if (f->type == FMT_OBJ) {
+        if (!vi.is_obj) {
+          return -EINVAL;
+        }
+        auto iter = f->obj.find(vi.name);
+        if (iter == f->obj.end()) {
+          return 0; /* nothing to erase */
+        }
+        f = &iter->second;
+      } else if (f->type == FMT_ARRAY) {
+        if (vi.is_obj) {
+          return -EINVAL;
+        }
+        int index = vi.index;
+        if (index < 0) {
+          /* negative indices count back from the end of the array */
+          index = f->arr.size() + index;
+          if (index < 0) { /* out of bounds, nothing to remove */
+            return 0;
+          }
+        }
+        if ((size_t)index >= f->arr.size()) {
+          return 0; /* index beyond array boundaries */
+        }
+        f = &f->arr[index];
+      }
+      last_entity = vi;
+    }
+  }
+
+  if (!parent) {
+    *this = JSONFormattable(); /* erase everything */
+  } else {
+    /* remove the final step's target from its container */
+    if (last_entity.is_obj) {
+      parent->obj.erase(last_entity.name);
+    } else {
+      int index = (last_entity.index >= 0 ? last_entity.index : parent->arr.size() + last_entity.index);
+      if (index < 0 || (size_t)index >= parent->arr.size()) {
+        return 0;
+      }
+      parent->arr.erase(parent->arr.begin() + index);
+    }
+  }
+
+  return 0;
+}
+
+// Copy every object field of `parent` that this node does not already have.
+// Fields already present here are left unchanged.
+void JSONFormattable::derive_from(const JSONFormattable& parent)
+{
+  for (auto& inherited : parent.obj) {
+    if (obj.count(inherited.first) != 0) {
+      continue;
+    }
+    obj[inherited.first] = inherited.second;
+  }
+}
+
+// Free-function adapter: forwards to JSONFormattable::encode_json().
+void encode_json(const char *name, const JSONFormattable& v, Formatter *f)
+{
+  v.encode_json(name, f);
+}
+
+/*
+ * Emit this node under `name`, according to its type:
+ *  - FMT_VALUE: the stored value;
+ *  - FMT_ARRAY: the element array;
+ *  - FMT_OBJ:   an object section holding every field;
+ *  - FMT_NONE:  nothing is written.
+ */
+void JSONFormattable::encode_json(const char *name, Formatter *f) const
+{
+  switch (type) {
+    case JSONFormattable::FMT_VALUE:
+      ::encode_json(name, value, f);
+      break;
+    case JSONFormattable::FMT_ARRAY:
+      ::encode_json(name, arr, f);
+      break;
+    case JSONFormattable::FMT_OBJ:
+      f->open_object_section(name);
+      /* iterate by reference: copying each entry would duplicate the
+       * key and the whole subtree stored under it */
+      for (const auto& field : obj) {
+        ::encode_json(field.first.c_str(), field.second, f);
+      }
+      f->close_section();
+      break;
+    case JSONFormattable::FMT_NONE:
+      break;
+  }
+}
+
+// Store a scalar under the section currently being encoded: appended when
+// that section is an array, otherwise saved as field `name` of the section
+// (which is marked as an object). Always returns false.
+bool JSONFormattable::handle_value(std::string_view name, std::string_view s, bool quoted) {
+  JSONFormattable *target = nullptr;
+  if (!cur_enc->is_array()) {
+    cur_enc->set_type(JSONFormattable::FMT_OBJ);
+    target = &cur_enc->obj[string{name}];
+  } else {
+    cur_enc->arr.push_back(JSONFormattable());
+    target = &cur_enc->arr.back();
+  }
+
+  target->set_type(JSONFormattable::FMT_VALUE);
+  target->value.set(s, quoted);
+
+  return false;
+}
+
+// Open a nested section while encoding and make it the current target.
+// Inside an array a new element is appended; inside an object the section
+// becomes field `name`, except at the top level where this node itself is
+// the container. The section is typed as array or object per
+// section_is_array. `ns` is not used. Always returns false.
+bool JSONFormattable::handle_open_section(std::string_view name,
+                                          const char *ns,
+                                          bool section_is_array) {
+  if (cur_enc->is_array()) {
+    cur_enc->arr.push_back(JSONFormattable());
+    cur_enc = &cur_enc->arr.back();
+  } else if (enc_stack.size() > 1) {
+      /* only open a new section if already nested,
+       * otherwise root is the container
+       */
+    cur_enc = &cur_enc->obj[string{name}];
+  }
+  /* pushed even when cur_enc did not change, so that each open has a
+   * matching entry for handle_close_section() to pop */
+  enc_stack.push_back(cur_enc);
+
+  if (section_is_array) {
+    cur_enc->set_type(JSONFormattable::FMT_ARRAY);
+  } else {
+    cur_enc->set_type(JSONFormattable::FMT_OBJ);
+  }
+
+  return false; /* continue processing */
+}
+
+// Close the current section: pop it and make the enclosing one current.
+// The bottom stack entry is never popped. Always returns false.
+bool JSONFormattable::handle_close_section() {
+  if (enc_stack.size() > 1) {
+    enc_stack.pop_back();
+    cur_enc = enc_stack.back();
+  }
+  return false; /* continue processing */
+}
+
+
diff --git a/src/common/ceph_json.h b/src/common/ceph_json.h
new file mode 100644
index 000000000..08e8d9e46
--- /dev/null
+++ b/src/common/ceph_json.h
@@ -0,0 +1,933 @@
+#ifndef CEPH_JSON_H
+#define CEPH_JSON_H
+
+#include <stdexcept>
+#include <typeindex>
+#include <include/types.h>
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <include/ceph_fs.h>
+#include "common/ceph_time.h"
+
+#include "json_spirit/json_spirit.h"
+
+#include "Formatter.h"
+
+
+
+class JSONObj;
+
+// Cursor over a range of a JSONObj's children.
+class JSONObjIter {
+  // JSONObj stores its children in a std::multimap, so the iterator type
+  // must be the multimap's; std::map and std::multimap iterators are not
+  // guaranteed to be the same type.
+  typedef std::multimap<std::string, JSONObj *>::iterator map_iter_t;
+  map_iter_t cur;   // current position
+  map_iter_t last;  // end of the range (one past the final element)
+
+public:
+  JSONObjIter();
+  ~JSONObjIter();
+  // Point the cursor at the range [_cur, _end).
+  void set(const JSONObjIter::map_iter_t &_cur, const JSONObjIter::map_iter_t &_end);
+
+  // Advance by one element; does nothing once end() is true.
+  void operator++();
+  // Child at the current position; performs no end-of-range check.
+  JSONObj *operator*();
+
+  // True once the cursor has reached the end of its range.
+  bool end() const {
+    return (cur == last);
+  }
+};
+
+// A node of a parsed JSON document: wraps one json_spirit value together
+// with its name, its textual value and its child nodes.
+class JSONObj
+{
+  JSONObj *parent;
+public:
+  // Textual value of a node plus whether it was a JSON string (quoted).
+  struct data_val {
+    std::string str;
+    bool quoted{false};
+
+    void set(std::string_view s, bool q) {
+      str = s;
+      quoted = q;
+    }
+  };
+protected:
+  std::string name; // corresponds to obj_type in XMLObj
+  json_spirit::Value data;   // the underlying parsed value
+  struct data_val val;       // textual form of `data`, set by init()
+  bool data_quoted{false};
+  // child nodes by name; the pointers are deleted in ~JSONObj()
+  std::multimap<std::string, JSONObj *> children;
+  std::map<std::string, data_val> attr_map;
+  // creates one child per member/element of an object/array value
+  void handle_value(json_spirit::Value v);
+
+public:
+
+  JSONObj() : parent(NULL){}
+
+  virtual ~JSONObj();
+
+  // Initialize this node from value v with parent p and name n.
+  void init(JSONObj *p, json_spirit::Value v, std::string n);
+
+  std::string& get_name() { return name; }
+  data_val& get_data_val() { return val; }
+  const std::string& get_data() { return val.str; }
+  // Copy the value of child `key` into *dest; false if there is no such child.
+  bool get_data(const std::string& key, data_val *dest);
+  JSONObj *get_parent();
+  void add_child(std::string el, JSONObj *child);
+  bool get_attr(std::string name, data_val& attr);
+  // Iterator over the children keyed `name`.
+  JSONObjIter find(const std::string& name);
+  // Iterator over all children.
+  JSONObjIter find_first();
+  // Iterator from a child keyed `name` to the end of all children.
+  JSONObjIter find_first(const std::string& name);
+  // A child keyed `name`, or NULL.
+  JSONObj *find_obj(const std::string& name);
+
+  friend std::ostream& operator<<(std::ostream &out,
+                                  const JSONObj &obj); // does not work, FIXME
+
+  bool is_array();
+  bool is_object();
+  // Elements of an array node, each serialized to JSON text.
+  std::vector<std::string> get_array_elements();
+};
+
+// Stream a data_val, wrapped in double quotes when it is flagged as quoted.
+inline std::ostream& operator<<(std::ostream &out, const JSONObj::data_val& dv) {
+  const char *quote = "";
+  if (dv.quoted) {
+    quote = "\"";
+  }
+  return out << quote << dv.str << quote;
+}
+
+// Root JSONObj that parses JSON text, either supplied directly or
+// accumulated chunk by chunk through handle_data().
+class JSONParser : public JSONObj
+{
+  int buf_len;              // total length passed to handle_data() so far
+  std::string json_buffer;  // text accumulated by handle_data()
+  bool success;             // result of the most recent parse
+public:
+  JSONParser();
+  ~JSONParser() override;
+  // Append a chunk of input to the internal buffer.
+  void handle_data(const char *s, int len);
+
+  // Parse the first len bytes of buf_.
+  bool parse(const char *buf_, int len);
+  // Parse the first len bytes of the internal buffer.
+  bool parse(int len);
+  // Parse the whole internal buffer.
+  bool parse();
+  // Parse the content of the named file.
+  bool parse(const char *file_name);
+
+  const char *get_json() { return json_buffer.c_str(); }
+  void set_failure() { success = false; }
+};
+
+void encode_json(const char *name, const JSONObj::data_val& v, ceph::Formatter *f);
+
+// Parses a buffer into a JSONParser tree and provides static helpers for
+// decoding named fields of a JSONObj into C++ values.
+class JSONDecoder {
+public:
+  // Exception type thrown on any decoding failure.
+  struct err : std::runtime_error {
+    using runtime_error::runtime_error;
+  };
+
+  JSONParser parser;
+
+  // Parse the content of bl; throws err when it is not valid JSON.
+  // NOTE(review): the failure path also prints to std::cout - this looks
+  // like leftover debugging output; confirm before relying on it.
+  JSONDecoder(ceph::buffer::list& bl) {
+    if (!parser.parse(bl.c_str(), bl.length())) {
+      std::cout << "JSONDecoder::err()" << std::endl;
+      throw JSONDecoder::err("failed to parse JSON input");
+    }
+  }
+
+  // Decode field `name` of obj into val. Returns false when the field is
+  // absent, or throws err if it is absent and `mandatory` is set.
+  template<class T>
+  static bool decode_json(const char *name, T& val, JSONObj *obj, bool mandatory = false);
+
+  // Same, but fills `container` by calling cb once per child of the field.
+  template<class C>
+  static bool decode_json(const char *name, C& container, void (*cb)(C&, JSONObj *obj), JSONObj *obj, bool mandatory = false);
+
+  // Decode field `name` into val, using default_val when it is absent.
+  template<class T>
+  static void decode_json(const char *name, T& val, const T& default_val, JSONObj *obj);
+
+  // Optional variants: val is left empty when the field is absent.
+  template<class T>
+  static bool decode_json(const char *name, boost::optional<T>& val, JSONObj *obj, bool mandatory = false);
+
+  template<class T>
+  static bool decode_json(const char *name, std::optional<T>& val, JSONObj *obj, bool mandatory = false);
+
+};
+
+// Generic fallback: any type providing decode_json(JSONObj *) decodes itself.
+template<class T>
+void decode_json_obj(T& val, JSONObj *obj)
+{
+  val.decode_json(obj);
+}
+
+// Strings take the node's textual value as is.
+inline void decode_json_obj(std::string& val, JSONObj *obj)
+{
+  val = obj->get_data();
+}
+
+// data_val keeps both the text and its quoted flag.
+static inline void decode_json_obj(JSONObj::data_val& val, JSONObj *obj)
+{
+  val = obj->get_data_val();
+}
+
+void decode_json_obj(unsigned long long& val, JSONObj *obj);
+void decode_json_obj(long long& val, JSONObj *obj);
+void decode_json_obj(unsigned long& val, JSONObj *obj);
+void decode_json_obj(long& val, JSONObj *obj);
+void decode_json_obj(unsigned& val, JSONObj *obj);
+void decode_json_obj(int& val, JSONObj *obj);
+void decode_json_obj(bool& val, JSONObj *obj);
+void decode_json_obj(ceph::buffer::list& val, JSONObj *obj);
+class utime_t;
+void decode_json_obj(utime_t& val, JSONObj *obj);
+void decode_json_obj(ceph_dir_layout& i, JSONObj *obj);
+
+void decode_json_obj(ceph::real_time& val, JSONObj *obj);
+void decode_json_obj(ceph::coarse_real_time& val, JSONObj *obj);
+
+// Replace the contents of l with one decoded T per child of obj.
+template<class T>
+void decode_json_obj(std::list<T>& l, JSONObj *obj)
+{
+  l.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    T val;
+    decode_json_obj(val, *iter);
+    l.push_back(val);
+  }
+}
+
+// Replace the contents of l with one decoded T per child of obj.
+template<class T>
+void decode_json_obj(std::deque<T>& l, JSONObj *obj)
+{
+  l.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    T val;
+    decode_json_obj(val, *iter);
+    l.push_back(val);
+  }
+}
+
+// Replace the contents of l with the decoded children of obj.
+template<class T>
+void decode_json_obj(std::set<T>& l, JSONObj *obj)
+{
+  l.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    T val;
+    decode_json_obj(val, *iter);
+    l.insert(val);
+  }
+}
+
+// Replace the contents of l with the decoded children of obj.
+template<class T, class Compare, class Alloc>
+void decode_json_obj(boost::container::flat_set<T, Compare, Alloc>& l, JSONObj *obj)
+{
+  l.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    T val;
+    decode_json_obj(val, *iter);
+    l.insert(val);
+  }
+}
+
+// Replace the contents of l with one decoded T per child of obj.
+template<class T>
+void decode_json_obj(std::vector<T>& l, JSONObj *obj)
+{
+  l.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    T val;
+    decode_json_obj(val, *iter);
+    l.push_back(val);
+  }
+}
+
+// Rebuild m from the children of obj; each child supplies a "key" and a
+// "val" field. A later duplicate key overwrites the earlier entry.
+template<class K, class V, class C = std::less<K> >
+void decode_json_obj(std::map<K, V, C>& m, JSONObj *obj)
+{
+  m.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    JSONObj *entry = *iter;
+    K key;
+    V val;
+    JSONDecoder::decode_json("key", key, entry);
+    JSONDecoder::decode_json("val", val, entry);
+    m[key] = val;
+  }
+}
+
+// Rebuild m from the children of obj; each child supplies a "key" and a
+// "val" field. A later duplicate key overwrites the earlier entry.
+template<class K, class V, class C = std::less<K> >
+void decode_json_obj(boost::container::flat_map<K, V, C>& m, JSONObj *obj)
+{
+  m.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    JSONObj *entry = *iter;
+    K key;
+    V val;
+    JSONDecoder::decode_json("key", key, entry);
+    JSONDecoder::decode_json("val", val, entry);
+    m[key] = val;
+  }
+}
+
+// Rebuild m from the children of obj; each child supplies a "key" and a
+// "val" field. Duplicate keys are all kept.
+template<class K, class V>
+void decode_json_obj(std::multimap<K, V>& m, JSONObj *obj)
+{
+  m.clear();
+
+  JSONObjIter iter = obj->find_first();
+
+  for (; !iter.end(); ++iter) {
+    K key;
+    V val;
+    JSONObj *o = *iter;
+    JSONDecoder::decode_json("key", key, o);
+    JSONDecoder::decode_json("val", val, o);
+    // qualified: an unqualified make_pair only resolves through ADL (or a
+    // using-directive elsewhere), which fails for e.g. built-in K and V
+    m.insert(std::make_pair(key, val));
+  }
+}
+
+// Rebuild m from the children of obj; each child supplies a "key" and a
+// "val" field. A later duplicate key overwrites the earlier entry.
+template<class K, class V>
+void decode_json_obj(boost::container::flat_map<K, V>& m, JSONObj *obj)
+{
+  m.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    JSONObj *entry = *iter;
+    K key;
+    V val;
+    JSONDecoder::decode_json("key", key, entry);
+    JSONDecoder::decode_json("val", val, entry);
+    m[key] = val;
+  }
+}
+// Clear `container`, then let cb add to it once for every child of obj.
+template<class C>
+void decode_json_obj(C& container, void (*cb)(C&, JSONObj *obj), JSONObj *obj)
+{
+  container.clear();
+
+  for (JSONObjIter iter = obj->find_first(); !iter.end(); ++iter) {
+    cb(container, *iter);
+  }
+}
+
+// Decode field `name` of obj into val.
+// Returns true when the field was found and decoded. When it is absent:
+// throws err if `mandatory`, otherwise resets val to T() (for
+// default-constructible T) and returns false. A decoding error is rethrown
+// with the field name prepended.
+template<class T>
+bool JSONDecoder::decode_json(const char *name, T& val, JSONObj *obj, bool mandatory)
+{
+  JSONObjIter iter = obj->find_first(name);
+  if (!iter.end()) {
+    try {
+      decode_json_obj(val, *iter);
+    } catch (const err& e) {
+      throw err(std::string(name) + ": " + e.what());
+    }
+    return true;
+  }
+
+  if (mandatory) {
+    throw err("missing mandatory field " + std::string(name));
+  }
+  if constexpr (std::is_default_constructible_v<T>) {
+    val = T();
+  }
+  return false;
+}
+
+// Decode field `name` of obj into `container` by calling cb per child.
+// The container is always cleared first. Returns true when the field was
+// found; when absent, throws err if `mandatory`, else returns false.
+// A decoding error is rethrown with the field name prepended.
+template<class C>
+bool JSONDecoder::decode_json(const char *name, C& container, void (*cb)(C&, JSONObj *), JSONObj *obj, bool mandatory)
+{
+  container.clear();
+
+  JSONObjIter iter = obj->find_first(name);
+  if (!iter.end()) {
+    try {
+      decode_json_obj(container, cb, *iter);
+    } catch (const err& e) {
+      throw err(std::string(name) + ": " + e.what());
+    }
+    return true;
+  }
+
+  if (mandatory) {
+    throw err("missing mandatory field " + std::string(name));
+  }
+  return false;
+}
+
+// Decode field `name` of obj into val, falling back to default_val when the
+// field is absent. On a decoding error val is also set to default_val, and
+// the error is rethrown with the field name prepended.
+template<class T>
+void JSONDecoder::decode_json(const char *name, T& val, const T& default_val, JSONObj *obj)
+{
+  JSONObjIter iter = obj->find_first(name);
+  if (!iter.end()) {
+    try {
+      decode_json_obj(val, *iter);
+    } catch (const err& e) {
+      val = default_val;
+      throw err(std::string(name) + ": " + e.what());
+    }
+    return;
+  }
+
+  val = default_val;
+}
+
+// Decode field `name` of obj into a boost::optional.
+// Returns true when the field was found and decoded. When absent: throws
+// err if `mandatory`, otherwise empties val and returns false. On a decoding
+// error val is emptied and the error rethrown with the field name prepended.
+template<class T>
+bool JSONDecoder::decode_json(const char *name, boost::optional<T>& val, JSONObj *obj, bool mandatory)
+{
+  JSONObjIter iter = obj->find_first(name);
+  if (!iter.end()) {
+    try {
+      val.reset(T());
+      decode_json_obj(val.get(), *iter);
+    } catch (const err& e) {
+      val.reset();
+      throw err(std::string(name) + ": " + e.what());
+    }
+    return true;
+  }
+
+  if (mandatory) {
+    throw err("missing mandatory field " + std::string(name));
+  }
+  val = boost::none;
+  return false;
+}
+
+// std::optional flavour of decode_json.  A missing optional field resets
+// `val` and returns false; a missing `mandatory` field throws err.  When
+// present, a T is emplaced into `val` and decoded in place; on a decode error
+// `val` is reset and the err is rethrown as "<name>: <what>".
+template<class T>
+bool JSONDecoder::decode_json(const char *name, std::optional<T>& val, JSONObj *obj, bool mandatory)
+{
+  JSONObjIter field = obj->find_first(name);
+  if (!field.end()) {
+    try {
+      val.emplace();
+      decode_json_obj(*val, *field);
+    } catch (const err& e) {
+      val.reset();
+      throw err(std::string(name) + ": " + e.what());
+    }
+    return true;
+  }
+
+  if (mandatory) {
+    throw err("missing mandatory field " + std::string(name));
+  }
+  val.reset();
+  return false;
+}
+
+// Registry of non-owned per-type JSON encoders, keyed by std::type_index.
+class JSONEncodeFilter
+{
+public:
+  class HandlerBase {  // type-erased encoder; pval points at the value to encode
+  public:
+    virtual ~HandlerBase() = default;
+    virtual std::type_index get_type() = 0;
+    virtual void encode_json(const char *name, const void *pval, ceph::Formatter *) const = 0;
+  };
+
+  // Reports T as the handled type; encode_json() is left to subclasses.
+  template <class T>
+  class Handler : public HandlerBase {
+  public:
+    ~Handler() override = default;
+    std::type_index get_type() override {
+      return std::type_index(typeid(const T&));
+    }
+  };
+
+private:
+  std::map<std::type_index, HandlerBase *> handlers;
+
+public:
+  // Install h for the type it reports, replacing any earlier handler.
+  void register_type(HandlerBase *h) {
+    handlers[h->get_type()] = h;
+  }
+
+  // Encode val via its registered handler; returns false if there is none.
+  template <class T>
+  bool encode_json(const char *name, const T& val, ceph::Formatter *f) {
+    const auto found = handlers.find(std::type_index(typeid(val)));
+    if (found == handlers.end())
+      return false;
+    found->second->encode_json(name, static_cast<const void *>(&val), f);
+    return true;
+  }
+};
+
+// Default encoding for a dumpable T: object section `name` around val.dump(f).
+template<class T>
+static void encode_json_impl(const char *name, const T& val, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  val.dump(f);
+  f->close_section();
+}
+
+// Generic encoder: use the formatter's "JSONEncodeFilter" handler for T if any.
+template<class T>
+static void encode_json(const char *name, const T& val, ceph::Formatter *f)
+{
+  auto *filter = static_cast<JSONEncodeFilter *>(f->get_external_feature_handler("JSONEncodeFilter"));
+  if (filter && filter->encode_json(name, val, f))
+    return;
+  encode_json_impl(name, val, f);
+}
+
+class utime_t;
+
+void encode_json(const char *name, std::string_view val, ceph::Formatter *f);
+void encode_json(const char *name, const std::string& val, ceph::Formatter *f);
+void encode_json(const char *name, const char *val, ceph::Formatter *f);
+void encode_json(const char *name, bool val, ceph::Formatter *f);
+void encode_json(const char *name, int val, ceph::Formatter *f);
+void encode_json(const char *name, unsigned val, ceph::Formatter *f);
+void encode_json(const char *name, long val, ceph::Formatter *f);
+void encode_json(const char *name, unsigned long val, ceph::Formatter *f);
+void encode_json(const char *name, long long val, ceph::Formatter *f);
+void encode_json(const char *name, const utime_t& val, ceph::Formatter *f);
+void encode_json(const char *name, const ceph::buffer::list& bl, ceph::Formatter *f);
+void encode_json(const char *name, long long unsigned val, ceph::Formatter *f);
+
+void encode_json(const char *name, const ceph::real_time& val, ceph::Formatter *f);
+void encode_json(const char *name, const ceph::coarse_real_time& val, ceph::Formatter *f);
+
+// Encode a std::list as array section `name`, one "obj" entry per element.
+template<class T>
+static void encode_json(const char *name, const std::list<T>& l, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& elem : l)
+    encode_json("obj", elem, f);
+  f->close_section();
+}
+
+// Encode a std::deque as array section `name`, one "obj" entry per element.
+template<class T>
+static void encode_json(const char *name, const std::deque<T>& l, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& elem : l)
+    encode_json("obj", elem, f);
+  f->close_section();
+}
+
+// Encode a std::set as array section `name`, in the set's iteration order.
+template<class T, class Compare = std::less<T> >
+static void encode_json(const char *name, const std::set<T, Compare>& l, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& elem : l)
+    encode_json("obj", elem, f);
+  f->close_section();
+}
+
+// Encode a boost flat_set as array section `name`, one "obj" entry per element.
+template<class T, class Compare, class Alloc>
+static void encode_json(const char *name,
+                        const boost::container::flat_set<T, Compare, Alloc>& l,
+                        ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& elem : l)
+    encode_json("obj", elem, f);
+  f->close_section();
+}
+
+// Encode a std::vector as array section `name`, one "obj" entry per element.
+template<class T>
+static void encode_json(const char *name, const std::vector<T>& l, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& elem : l)
+    encode_json("obj", elem, f);
+  f->close_section();
+}
+
+template<class K, class V, class C = std::less<K>>  // std::map overload
+static void encode_json(const char *name, const std::map<K, V, C>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& [key, val] : m) {
+    f->open_object_section("entry");  // one {"key", "val"} object per pair
+    encode_json("key", key, f);
+    encode_json("val", val, f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+template<class K, class V, class C = std::less<K> >  // flat_map with explicit comparator
+static void encode_json(const char *name, const boost::container::flat_map<K, V, C>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& [key, val] : m) {
+    f->open_object_section("entry");  // one {"key", "val"} object per pair
+    encode_json("key", key, f);
+    encode_json("val", val, f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+template<class K, class V>  // std::multimap overload; duplicate keys give repeated entries
+static void encode_json(const char *name, const std::multimap<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& [key, val] : m) {
+    f->open_object_section("entry");  // one {"key", "val"} object per pair
+    encode_json("key", key, f);
+    encode_json("val", val, f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+template<class K, class V>  // flat_map with default comparator
+static void encode_json(const char *name, const boost::container::flat_map<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& [key, val] : m) {
+    f->open_object_section("entry");  // one {"key", "val"} object per pair
+    encode_json("key", key, f);
+    encode_json("val", val, f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Encode only the mapped values of `m` (keys are dropped) as "obj" array entries.
+template<class K, class V>
+void encode_json_map(const char *name, const std::map<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& kv : m)
+    encode_json("obj", kv.second, f);
+  f->close_section();
+}
+
+
+// Encode `m` as array section `name`.  For each entry: if index_name is set
+// the entry is wrapped in a "key_value" object carrying the key as string
+// index_name; if object_name is set the value is wrapped in an object of
+// that name; the value itself is written under value_name by cb (which also
+// receives `parent`) when cb is non-null, otherwise by encode_json().
+template<class K, class V>
+void encode_json_map(const char *name, const char *index_name,
+                     const char *object_name, const char *value_name,
+                     void (*cb)(const char *, const V&, ceph::Formatter *, void *), void *parent,
+                     const std::map<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& [key, value] : m) {
+    if (index_name) {
+      f->open_object_section("key_value");
+      f->dump_string(index_name, key);
+    }
+    if (object_name)
+      f->open_object_section(object_name);
+
+    if (cb)
+      cb(value_name, value, f, parent);
+    else
+      encode_json(value_name, value, f);
+
+    if (object_name)
+      f->close_section();
+    if (index_name)
+      f->close_section();
+  }
+  f->close_section();
+}
+
+// Same as the callback overload, called with no callback and no parent.
+template<class K, class V>
+void encode_json_map(const char *name, const char *index_name, const char *object_name,
+                     const char *value_name, const std::map<K, V>& m, ceph::Formatter *f)
+{
+  encode_json_map<K, V>(name, index_name, object_name, value_name, nullptr, nullptr, m, f);
+}
+
+template<class K, class V>  // shorthand: no object wrapper, no callback, no parent
+void encode_json_map(const char *name, const char *index_name, const char *value_name,
+                     const std::map<K, V>& m, ceph::Formatter *f)
+{
+  encode_json_map<K, V>(name, index_name, nullptr, value_name, nullptr, nullptr, m, f);
+}
+
+// Encode the contained value under `name`; an empty optional emits nothing.
+template <class T>
+static void encode_json(const char *name, const std::optional<T>& o, ceph::Formatter *f)
+{
+  if (o.has_value()) {
+    encode_json(name, *o, f);
+  }
+}
+
+
+// Encode only the mapped values of `m` (keys are dropped) as "obj" array entries.
+template<class K, class V>
+void encode_json_map(const char *name, const boost::container::flat_map<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& kv : m)
+    encode_json("obj", kv.second, f);
+  f->close_section();
+}
+
+
+// Encode `m` as array section `name`.  For each entry: if index_name is set
+// the entry is wrapped in a "key_value" object carrying the key as string
+// index_name; if object_name is set the value is wrapped in an object of
+// that name; the value itself is written under value_name by cb (which also
+// receives `parent`) when cb is non-null, otherwise by encode_json().
+template<class K, class V>
+void encode_json_map(const char *name, const char *index_name,
+                     const char *object_name, const char *value_name,
+                     void (*cb)(const char *, const V&, ceph::Formatter *, void *), void *parent,
+                     const boost::container::flat_map<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& [key, value] : m) {
+    if (index_name) {
+      f->open_object_section("key_value");
+      f->dump_string(index_name, key);
+    }
+    if (object_name)
+      f->open_object_section(object_name);
+
+    if (cb)
+      cb(value_name, value, f, parent);
+    else
+      encode_json(value_name, value, f);
+
+    if (object_name)
+      f->close_section();
+    if (index_name)
+      f->close_section();
+  }
+  f->close_section();
+}
+
+// Same as the callback overload, called with no callback and no parent.
+template<class K, class V>
+void encode_json_map(const char *name, const char *index_name, const char *object_name,
+                     const char *value_name, const boost::container::flat_map<K, V>& m, ceph::Formatter *f)
+{
+  encode_json_map<K, V>(name, index_name, object_name, value_name, nullptr, nullptr, m, f);
+}
+
+template<class K, class V>  // shorthand: no object wrapper, no callback, no parent
+void encode_json_map(const char *name, const char *index_name, const char *value_name,
+                     const boost::container::flat_map<K, V>& m, ceph::Formatter *f)
+{
+  encode_json_map<K, V>(name, index_name, nullptr, value_name, nullptr, nullptr, m, f);
+}
+
+
+class JSONFormattable : public ceph::JSONFormatter {  // JSON node: scalar value, array or object
+  JSONObj::data_val value;                     // scalar payload (FMT_VALUE)
+  std::vector<JSONFormattable> arr;            // elements (FMT_ARRAY)
+  std::map<std::string, JSONFormattable> obj;  // fields by name (FMT_OBJ)
+  // formatter-hook state; the constructor seeds both with `this`
+  std::vector<JSONFormattable *> enc_stack;
+  JSONFormattable *cur_enc;  // NOTE(review): raw self-pointer; confirm copies of this class fix it up
+
+protected:
+  bool handle_value(std::string_view name, std::string_view s, bool quoted) override;
+  bool handle_open_section(std::string_view name, const char *ns, bool section_is_array) override;
+  bool handle_close_section() override;
+
+public:
+  JSONFormattable(bool p = false) : JSONFormatter(p) {  // p is forwarded to JSONFormatter
+    cur_enc = this;
+    enc_stack.push_back(cur_enc);
+  }
+
+  enum Type {
+    FMT_NONE,   // initial state, before any decode / set_type()
+    FMT_VALUE,
+    FMT_ARRAY,
+    FMT_OBJ,
+  } type{FMT_NONE};
+
+  void set_type(Type t) {
+    type = t;
+  }
+  // Populate this node from a parsed JSONObj, setting `type` to match its shape.
+  void decode_json(JSONObj *jo) {
+    if (jo->is_array()) {
+      set_type(JSONFormattable::FMT_ARRAY);
+      decode_json_obj(arr, jo);
+    } else if (jo->is_object()) {
+      set_type(JSONFormattable::FMT_OBJ);
+      auto iter = jo->find_first();
+      for (;!iter.end(); ++iter) {
+        JSONObj *field = *iter;
+        decode_json_obj(obj[field->get_name()], field);  // obj[] creates the child node on demand
+      }
+    } else {
+      set_type(JSONFormattable::FMT_VALUE);
+      decode_json_obj(value, jo);
+    }
+  }
+  // Binary encoding; field order here must stay in step with decode() below.
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode((uint8_t)type, bl);  // type is written as a single byte
+    encode(value.str, bl);
+    encode(arr, bl);
+    encode(obj, bl);
+    encode(value.quoted, bl);   // written last; decode() reads it only for struct_v >= 2
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    DECODE_START(2, bl);
+    uint8_t t;
+    decode(t, bl);
+    type = (Type)t;  // NOTE(review): the decoded byte is not range-checked against Type
+    decode(value.str, bl);
+    decode(arr, bl);
+    decode(obj, bl);
+    if (struct_v >= 2) {
+      decode(value.quoted, bl);
+    } else {
+      value.quoted = true;  // older encodings carry no flag; treat the value as quoted
+    }
+    DECODE_FINISH(bl);
+  }
+
+  const std::string& val() const {  // raw string form of the scalar value
+    return value.str;
+  }
+  // typed views of the scalar value (defined out of line)
+  int val_int() const;
+  long val_long() const;
+  long long val_long_long() const;
+  bool val_bool() const;
+
+  const std::map<std::string, JSONFormattable> object() const {  // NOTE: returns a copy of the map
+    return obj;
+  }
+
+  const std::vector<JSONFormattable>& array() const {
+    return arr;
+  }
+  // child access by field name or array index (defined out of line)
+  const JSONFormattable& operator[](const std::string& name) const;
+  const JSONFormattable& operator[](size_t index) const;
+
+  JSONFormattable& operator[](const std::string& name);
+  JSONFormattable& operator[](size_t index);
+  // implicit conversion to string; the numeric and bool conversions below are explicit
+  operator std::string() const {
+    return value.str;
+  }
+
+  explicit operator int() const {
+    return val_int();
+  }
+
+  explicit operator long() const {
+    return val_long();
+  }
+
+  explicit operator long long() const {
+    return val_long_long();
+  }
+
+  explicit operator bool() const {
+    return val_bool();
+  }
+  // Typed child lookup: applies operator()(T()) to the child named `name`.
+  template<class T>
+  T operator[](const std::string& name) const {
+    return this->operator[](name)(T());
+  }
+
+  template<class T>
+  T operator[](const std::string& name) {
+    return this->operator[](name)(T());
+  }
+  // Call operators forward to def(); presumably "value or the given default" - confirm in the .cc.
+  std::string operator ()(const char *def_val) const {
+    return def(std::string(def_val));
+  }
+
+  int operator()(int def_val) const {
+    return def(def_val);
+  }
+
+  bool operator()(bool def_val) const {
+    return def(def_val);
+  }
+
+  bool exists(const std::string& name) const;
+  bool exists(size_t index) const;
+
+  std::string def(const std::string& def_val) const;
+  int def(int def_val) const;
+  bool def(bool def_val) const;
+
+  bool find(const std::string& name, std::string *val) const;
+
+  std::string get(const std::string& name, const std::string& def_val) const;
+
+  int get_int(const std::string& name, int def_val) const;
+  bool get_bool(const std::string& name, bool def_val) const;
+
+  int set(const std::string& name, const std::string& val);
+  int erase(const std::string& name);
+
+  void derive_from(const JSONFormattable& jf);
+
+  void encode_json(const char *name, ceph::Formatter *f) const;
+
+  bool is_array() const {
+    return (type == FMT_ARRAY);
+  }
+};
+WRITE_CLASS_ENCODER(JSONFormattable)
+
+void encode_json(const char *name, const JSONFormattable& v, ceph::Formatter *f);
+
+#endif
diff --git a/src/common/ceph_mutex.h b/src/common/ceph_mutex.h
new file mode 100644
index 000000000..8d87e605b
--- /dev/null
+++ b/src/common/ceph_mutex.h
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <utility>
+#include "common/containers.h"
+
+// What and why
+// ============
+//
+// For general code making use of mutexes, use these ceph:: types.
+// The key requirement is that you make use of the ceph::make_mutex()
+// and make_recursive_mutex() factory methods, which take a string
+// naming the mutex for the purposes of the lockdep debug variant.
+
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+#include <seastar/core/condition-variable.hh>
+
+#include "crimson/common/log.h"
+#include "include/ceph_assert.h"
+
+#ifndef NDEBUG
+#define FUT_DEBUG(FMT_MSG, ...) crimson::get_logger(ceph_subsys_).trace(FMT_MSG, ##__VA_ARGS__)
+#else
+#define FUT_DEBUG(FMT_MSG, ...)
+#endif
+
+namespace ceph {
+  // A no-op type satisfying the mutex concept: every operation succeeds
+  // immediately and nothing is ever actually locked.
+  struct dummy_mutex {
+    void lock() {}
+    bool try_lock() { return true; }
+    void unlock() {}
+    void lock_shared() {}
+    void unlock_shared() {}
+  };
+
+  // Distinct type behind the shared_mutex alias; same no-op operations.
+  struct dummy_shared_mutex : dummy_mutex {
+    void lock_shared() {}
+    void unlock_shared() {}
+  };
+
+  // this implementation assumes running within a seastar::thread
+  struct green_condition_variable : private seastar::condition_variable {
+    // the lock argument is ignored; blocks on the future returned by wait()
+    template <class LockT>
+    void wait(LockT&&) {
+      FUT_DEBUG("green_condition_variable::{}: before blocking", __func__);
+      seastar::condition_variable::wait().get();
+      FUT_DEBUG("green_condition_variable::{}: after blocking", __func__);
+    }
+    void notify_one() noexcept {
+      FUT_DEBUG("green_condition_variable::{}", __func__);
+      signal();
+    }
+    void notify_all() noexcept {
+      FUT_DEBUG("green_condition_variable::{}", __func__);
+      broadcast();
+    }
+  };
+
+  using mutex = dummy_mutex;
+  using recursive_mutex = dummy_mutex;
+  using shared_mutex = dummy_shared_mutex;
+  using condition_variable = green_condition_variable;
+
+  // The factories accept and discard any arguments (e.g. the lockdep name).
+  template <typename ...Args>
+  dummy_mutex make_mutex(Args&& ...) {
+    return dummy_mutex{};
+  }
+
+  template <typename ...Args>
+  recursive_mutex make_recursive_mutex(Args&& ...) {
+    return recursive_mutex{};
+  }
+
+  template <typename ...Args>
+  shared_mutex make_shared_mutex(Args&& ...) {
+    return shared_mutex{};
+  }
+
+  #define ceph_mutex_is_locked(m) true
+  #define ceph_mutex_is_locked_by_me(m) true
+}
+
+#else  // defined (WITH_SEASTAR) && !defined(WITH_ALIEN)
+//
+// For legacy Mutex users that passed recursive=true, use
+// ceph::make_recursive_mutex.  For legacy Mutex users that passed
+// lockdep=false, use std::mutex directly.
+
+#ifdef CEPH_DEBUG_MUTEX
+
+// ============================================================================
+// debug (lockdep-capable, various sanity checks and asserts)
+// ============================================================================
+//
+// Note: this is known to cause deadlocks on Windows because
+// of the winpthreads shared mutex implementation.
+
+#include "common/condition_variable_debug.h"
+#include "common/mutex_debug.h"
+#include "common/shared_mutex_debug.h"
+
+namespace ceph {
+  using mutex = ceph::mutex_debug;
+  using recursive_mutex = ceph::mutex_recursive_debug;
+  using condition_variable = ceph::condition_variable_debug;
+  using shared_mutex = ceph::shared_mutex_debug;
+
+  // forward every argument to the mutex_debug constructor
+  template <typename ...Args>
+  mutex make_mutex(Args&& ...args) {
+    return {std::forward<Args>(args)...};
+  }
+
+  // forward every argument to the mutex_recursive_debug constructor
+  template <typename ...Args>
+  recursive_mutex make_recursive_mutex(Args&& ...args) {
+    return {std::forward<Args>(args)...};
+  }
+
+  // forward every argument to the shared_mutex_debug constructor
+  template <typename ...Args>
+  shared_mutex make_shared_mutex(Args&& ...args) {
+    return {std::forward<Args>(args)...};
+  }
+
+  // lock-state predicates; in this variant they query the mutex itself
+  #define ceph_mutex_is_locked(m) ((m).is_locked())
+  #define ceph_mutex_is_not_locked(m) (!(m).is_locked())
+  #define ceph_mutex_is_rlocked(m) ((m).is_rlocked())
+  #define ceph_mutex_is_wlocked(m) ((m).is_wlocked())
+  #define ceph_mutex_is_locked_by_me(m) ((m).is_locked_by_me())
+  #define ceph_mutex_is_not_locked_by_me(m) (!(m).is_locked_by_me())
+}
+
+#else
+
+// ============================================================================
+// release (fast and minimal)
+// ============================================================================
+
+#include <condition_variable>
+#include <mutex>
+
+// The winpthreads shared mutex implementation is broken.
+// We'll use boost::shared_mutex instead.
+// https://github.com/msys2/MINGW-packages/issues/3319
+#if __MINGW32__
+#include <boost/thread/shared_mutex.hpp>
+#else
+#include <shared_mutex>
+#endif
+
+namespace ceph {
+
+  using mutex = std::mutex;
+  using recursive_mutex = std::recursive_mutex;
+  using condition_variable = std::condition_variable;
+
+#if __MINGW32__
+  using shared_mutex = boost::shared_mutex;
+#else
+  using shared_mutex = std::shared_mutex;
+#endif
+
+  // the factory arguments only matter to the debug variant; drop them here
+  template <typename ...Args>
+  mutex make_mutex(Args&& ...) {
+    return {};
+  }
+  template <typename ...Args>
+  recursive_mutex make_recursive_mutex(Args&& ...) {
+    return {};
+  }
+  template <typename ...Args>
+  shared_mutex make_shared_mutex(Args&& ...) {
+    return {};
+  }
+
+  // Lock-state predicates.  Without debug bookkeeping they all evaluate to
+  // true unconditionally, so they are only meaningful inside assertions:
+  // code that does anything else with them is broken.
+  #define ceph_mutex_is_locked(m) true
+  #define ceph_mutex_is_not_locked(m) true
+  #define ceph_mutex_is_rlocked(m) true
+  #define ceph_mutex_is_wlocked(m) true
+  #define ceph_mutex_is_locked_by_me(m) true
+  #define ceph_mutex_is_not_locked_by_me(m) true
+
+}
+
+#endif	// CEPH_DEBUG_MUTEX
+
+#endif	// WITH_SEASTAR
+
+namespace ceph {
+
+// Build a tiny_vector of num_instances locks; element i is made from lock_factory(i).
+template <class LockT, class LockFactoryT>
+ceph::containers::tiny_vector<LockT> make_lock_container(
+  const std::size_t num_instances,
+  LockFactoryT&& lock_factory)
+{
+  return {
+    num_instances, [&](const std::size_t idx, auto emplacer) {
+      // invoked once per element, i.e. `num_instances` times in total
+      new (emplacer.data()) LockT {lock_factory(idx)};
+    }
+  };
+}
+} // namespace ceph
+
diff --git a/src/common/ceph_releases.cc b/src/common/ceph_releases.cc
new file mode 100644
index 000000000..92e58256f
--- /dev/null
+++ b/src/common/ceph_releases.cc
@@ -0,0 +1,58 @@
+#include "ceph_releases.h"
+
+#include <ostream>
+
+#include "ceph_ver.h"
+
+std::ostream& operator<<(std::ostream& os, const ceph_release_t r)  // streams the release's name
+{
+  return os << ceph_release_name(static_cast<int>(r));
+}
+
+ceph_release_t ceph_release()  // the CEPH_RELEASE constant wrapped as a ceph_release_t
+{
+  return ceph_release_t{CEPH_RELEASE};
+}
+
+// Map a release name to its enumerator, scanning from the newest release
+// down to argonaut; names that match nothing yield ceph_release_t::unknown.
+ceph_release_t ceph_release_from_name(std::string_view s)
+{
+  for (auto r = ceph_release_t::max; --r != ceph_release_t::unknown; ) {
+    if (s == to_string(r))
+      return r;
+  }
+  return ceph_release_t::unknown;
+}
+
+// Check that the installed release is at most two releases newer than
+// `from_release`.  On failure a message naming the acceptable intermediate
+// releases is written to `err` and false is returned.
+bool can_upgrade_from(ceph_release_t from_release,
+                      std::string_view from_release_name,
+                      std::ostream& err)
+{
+  if (from_release == ceph_release_t::unknown) {
+    // cannot tell, but i am optimistic
+    return true;
+  }
+  // newest release that can be reached directly from from_release
+  const ceph_release_t cutoff{static_cast<uint8_t>(static_cast<uint8_t>(from_release) + 2)};
+  const auto to_release = ceph_release();
+  if (!(cutoff < to_release)) {
+    return true;
+  }
+
+  err << "recorded " << from_release_name << " "
+      << to_integer<int>(from_release) << " (" << from_release << ") "
+      << "is more than two releases older than installed "
+      << to_integer<int>(to_release) << " (" << to_release << "); "
+      << "you can only upgrade 2 releases at a time\n"
+      << "you should first upgrade to ";
+  // list every release after from_release up to and including cutoff
+  for (auto release = from_release; ++release <= cutoff; ) {
+    err << to_integer<int>(release) << " (" << release << ")"
+        << (release < cutoff ? " or " : "\n");
+  }
+  return false;
+}
diff --git a/src/common/ceph_releases.h b/src/common/ceph_releases.h
new file mode 100644
index 000000000..e09e191e5
--- /dev/null
+++ b/src/common/ceph_releases.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <string_view>
+
+#include "common/ceph_strings.h"
+
+// the C++ version of CEPH_RELEASE_* defined by include/rados.h
+enum class ceph_release_t : std::uint8_t {
+  unknown = 0,  // also what ceph_release_from_name() returns for unmatched names
+  argonaut,
+  bobtail,
+  cuttlefish,
+  dumpling,
+  emperor,
+  firefly,
+  giant,
+  hammer,
+  infernalis,
+  jewel,
+  kraken,
+  luminous,
+  mimic,
+  nautilus,
+  octopus,
+  pacific,
+  quincy,
+  reef,
+  max,  // sentinel after the newest release; ceph_release_from_name() scans down from here
+};
+
+std::ostream& operator<<(std::ostream& os, const ceph_release_t r);
+
+// True for "no release": unknown (0) or a legacy invalid value that reads as
+// negative through int8_t.  Tested directly because operator< is declared
+// later in this header and so is not visible here; takes r by value.
+inline bool operator!(ceph_release_t r) { return static_cast<int8_t>(r) <= 0; }
+
+// Pre-decrement: lower r by one enumerator value and return it by reference.
+inline ceph_release_t& operator--(ceph_release_t& r) {
+  return r = static_cast<ceph_release_t>(static_cast<uint8_t>(r) - 1);
+}
+
+// Pre-increment: raise r by one enumerator value and return it by reference.
+inline ceph_release_t& operator++(ceph_release_t& r) {
+  return r = static_cast<ceph_release_t>(static_cast<uint8_t>(r) + 1);
+}
+
+// Ordering in which a legacy invalid release (stored as -1, i.e. negative
+// when viewed as int8_t) sorts before everything, including another invalid.
+inline bool operator<(ceph_release_t lhs, ceph_release_t rhs) {
+  const bool lhs_invalid = static_cast<int8_t>(lhs) < 0;
+  const bool rhs_invalid = static_cast<int8_t>(rhs) < 0;
+  if (lhs_invalid || rhs_invalid)
+    return lhs_invalid;
+  return static_cast<uint8_t>(lhs) < static_cast<uint8_t>(rhs);
+}
+
+// Counterpart of operator<: an invalid (negative as int8_t) lhs is never
+// greater, and any valid lhs is greater than an invalid rhs.
+inline bool operator>(ceph_release_t lhs, ceph_release_t rhs) {
+  const bool lhs_invalid = static_cast<int8_t>(lhs) < 0;
+  const bool rhs_invalid = static_cast<int8_t>(rhs) < 0;
+  if (lhs_invalid || rhs_invalid)
+    return !lhs_invalid;
+  return static_cast<uint8_t>(lhs) > static_cast<uint8_t>(rhs);
+}
+
+inline bool operator>=(ceph_release_t lhs, ceph_release_t rhs) {  // defined as the negation of operator<
+  return !(lhs < rhs);
+}
+
+bool can_upgrade_from(ceph_release_t from_release,
+		      std::string_view from_release_name,
+		      std::ostream& err);
+
+ceph_release_t ceph_release_from_name(std::string_view sv);
+ceph_release_t ceph_release();
+
+inline std::string_view to_string(ceph_release_t r) {  // view of the name from ceph_release_name()
+  return ceph_release_name(static_cast<int>(r));
+}
+template<typename IntType> IntType to_integer(ceph_release_t r) {  // numeric value of r as IntType
+  return static_cast<IntType>(r);
+}
diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc
new file mode 100644
index 000000000..18dcc701b
--- /dev/null
+++ b/src/common/ceph_strings.cc
@@ -0,0 +1,390 @@
+/*
+ * Ceph string constants
+ */
+#include "ceph_strings.h"
+#include "include/types.h"
+#include "include/ceph_features.h"
+
+const char *ceph_entity_type_name(int type)	/* CEPH_ENTITY_TYPE_* -> short name */
+{
+	switch (type) {
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_MGR: return "mgr";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_AUTH: return "auth";
+	default: return "unknown";	/* any value not listed above */
+	}
+}
+
+const char *ceph_con_mode_name(int con_mode)	/* CEPH_CON_MODE_* -> name */
+{
+	switch (con_mode) {
+	case CEPH_CON_MODE_UNKNOWN: return "unknown";
+	case CEPH_CON_MODE_CRC: return "crc";
+	case CEPH_CON_MODE_SECURE: return "secure";
+	default: return "???";	/* unrecognized value, distinct from the explicit "unknown" mode */
+	}
+}
+
+const char *ceph_osd_op_name(int op)	/* CEPH_OSD_OP_* -> its string, "???" if unrecognized */
+{
+	switch (op) {
+#define GENERATE_CASE(op, opcode, str)	case CEPH_OSD_OP_##op: return (str);
+__CEPH_FORALL_OSD_OPS(GENERATE_CASE)	/* presumably expands GENERATE_CASE once per op - defined elsewhere */
+#undef GENERATE_CASE
+	default:
+		return "???";
+	}
+}
+
+const char *ceph_osd_state_name(int s)	/* a single CEPH_OSD_* state value -> name */
+{
+	switch (s) {	/* exact match only: a value equal to none of the cases yields "???" */
+	case CEPH_OSD_EXISTS:
+		return "exists";
+	case CEPH_OSD_UP:
+		return "up";
+	case CEPH_OSD_AUTOOUT:
+		return "autoout";
+	case CEPH_OSD_NEW:
+		return "new";
+	case CEPH_OSD_FULL:
+		return "full";
+	case CEPH_OSD_NEARFULL:
+		return "nearfull";
+	case CEPH_OSD_BACKFILLFULL:
+		return "backfillfull";
+        case CEPH_OSD_DESTROYED:
+                return "destroyed";
+        case CEPH_OSD_NOUP:
+                return "noup";
+        case CEPH_OSD_NODOWN:
+                return "nodown";
+        case CEPH_OSD_NOIN:
+                return "noin";
+        case CEPH_OSD_NOOUT:
+                return "noout";
+        case CEPH_OSD_STOP:
+                return "stop";
+	default:
+		return "???";
+	}
+}
+
+const char *ceph_release_name(int r)	/* CEPH_RELEASE_* -> lower-case release name */
+{
+	switch (r) {
+	case CEPH_RELEASE_ARGONAUT:
+		return "argonaut";
+	case CEPH_RELEASE_BOBTAIL:
+		return "bobtail";
+	case CEPH_RELEASE_CUTTLEFISH:
+		return "cuttlefish";
+	case CEPH_RELEASE_DUMPLING:
+		return "dumpling";
+	case CEPH_RELEASE_EMPEROR:
+		return "emperor";
+	case CEPH_RELEASE_FIREFLY:
+		return "firefly";
+	case CEPH_RELEASE_GIANT:
+		return "giant";
+	case CEPH_RELEASE_HAMMER:
+		return "hammer";
+	case CEPH_RELEASE_INFERNALIS:
+		return "infernalis";
+	case CEPH_RELEASE_JEWEL:
+		return "jewel";
+	case CEPH_RELEASE_KRAKEN:
+		return "kraken";
+	case CEPH_RELEASE_LUMINOUS:
+		return "luminous";
+	case CEPH_RELEASE_MIMIC:
+		return "mimic";
+	case CEPH_RELEASE_NAUTILUS:
+		return "nautilus";
+	case CEPH_RELEASE_OCTOPUS:
+		return "octopus";
+	case CEPH_RELEASE_PACIFIC:
+		return "pacific";
+	case CEPH_RELEASE_QUINCY:
+		return "quincy";
+	case CEPH_RELEASE_REEF:
+		return "reef";
+	default:
+		if (r < 0)	/* negative values are reported separately from other unlisted ones */
+			return "unspecified";
+		return "unknown";
+	}
+}
+
+/*
+ * Feature mask associated with release r.  The mask is cumulative: each tier
+ * adds its bits to those of every older tier, and the function returns as
+ * soon as r's tier has been reached.
+ */
+uint64_t ceph_release_features(int r)
+{
+	uint64_t req = CEPH_FEATURE_CRUSH_TUNABLES;
+	if (r <= CEPH_RELEASE_CUTTLEFISH)
+		return req;
+
+	req |= CEPH_FEATURE_CRUSH_TUNABLES2 | CEPH_FEATURE_OSDHASHPSPOOL;
+	if (r <= CEPH_RELEASE_EMPEROR)
+		return req;
+
+	req |= CEPH_FEATURE_CRUSH_TUNABLES3 |
+		CEPH_FEATURE_OSD_PRIMARY_AFFINITY |
+		CEPH_FEATURE_OSD_CACHEPOOL;
+	if (r <= CEPH_RELEASE_GIANT)
+		return req;
+
+	req |= CEPH_FEATURE_CRUSH_V4;
+	if (r <= CEPH_RELEASE_INFERNALIS)
+		return req;
+
+	req |= CEPH_FEATURE_CRUSH_TUNABLES5;
+	if (r <= CEPH_RELEASE_JEWEL)
+		return req;
+
+	req |= CEPH_FEATURE_MSG_ADDR2;
+	if (r <= CEPH_RELEASE_KRAKEN)
+		return req;
+
+	/* every release after kraken shares this final mask */
+	req |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS; // and overlaps
+	return req;
+}
+
+/* return oldest/first release that supports these features */
+int ceph_release_from_features(uint64_t features)
+{
+	int r = 1;	/* candidate release, walked upwards */
+	while (true) {
+		uint64_t need = ceph_release_features(r);
+		if ((need & features) != need ||	/* r needs a bit that is missing */
+		    r == CEPH_RELEASE_MAX) {	/* or there is nothing newer to try */
+			r--;	/* step back to the release examined before r */
+			need = ceph_release_features(r);
+			/* we want the first release that looks like this */
+			while (r > 1 && ceph_release_features(r - 1) == need) {
+				r--;
+			}
+			break;
+		}
+		++r;
+	}
+	return r;	/* NOTE(review): this is 0 when even release 1's mask is not satisfied */
+}
+
+const char *ceph_osd_watch_op_name(int o)	/* CEPH_OSD_WATCH_OP_* -> name */
+{
+	switch (o) {
+	case CEPH_OSD_WATCH_OP_UNWATCH:
+		return "unwatch";
+	case CEPH_OSD_WATCH_OP_WATCH:
+		return "watch";
+	case CEPH_OSD_WATCH_OP_RECONNECT:
+		return "reconnect";
+	case CEPH_OSD_WATCH_OP_PING:
+		return "ping";
+	default:	/* unrecognized op */
+		return "???";
+	}
+}
+
+const char *ceph_osd_alloc_hint_flag_name(int f)	/* one CEPH_OSD_ALLOC_HINT_FLAG_* value -> name */
+{
+	switch (f) {	/* exact match only: a value equal to none of the cases yields "???" */
+	case CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE:
+		return "sequential_write";
+	case CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE:
+		return "random_write";
+	case CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ:
+		return "sequential_read";
+	case CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ:
+		return "random_read";
+	case CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY:
+		return "append_only";
+	case CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE:
+		return "immutable";
+	case CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED:
+		return "shortlived";
+	case CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED:
+		return "longlived";
+	case CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE:
+		return "compressible";
+	case CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE:
+		return "incompressible";
+	default:
+		return "???";
+	}
+}
+
+const char *ceph_mds_state_name(int s)	/* CEPH_MDS_STATE_* -> "up:..."/"down:..." style name */
+{
+	switch (s) {
+		/* down and out */
+	case CEPH_MDS_STATE_DNE:        return "down:dne";
+	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+	case CEPH_MDS_STATE_DAMAGED:   return "down:damaged";
+		/* up and out */
+	case CEPH_MDS_STATE_BOOT:       return "up:boot";
+	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
+	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
+	case CEPH_MDS_STATE_CREATING:   return "up:creating";
+	case CEPH_MDS_STATE_STARTING:   return "up:starting";
+		/* up and in */
+	case CEPH_MDS_STATE_REPLAY:     return "up:replay";
+	case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
+	case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
+	case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
+	case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+	case CEPH_MDS_STATE_ACTIVE:     return "up:active";
+	case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
+               /* misc */
+	case CEPH_MDS_STATE_NULL:       return "null";
+	}
+	return "???";	/* no default label: unlisted states fall out of the switch to here */
+}
+
+/*
+ * Return the printable name of a CEPH_SESSION_* operation code, or
+ * "???" when the code is not one of those listed.
+ */
+const char *ceph_session_op_name(int op)
+{
+	switch (op) {
+	case CEPH_SESSION_REQUEST_OPEN:        return "request_open";
+	case CEPH_SESSION_OPEN:                return "open";
+	case CEPH_SESSION_REQUEST_CLOSE:       return "request_close";
+	case CEPH_SESSION_CLOSE:               return "close";
+	case CEPH_SESSION_REQUEST_RENEWCAPS:   return "request_renewcaps";
+	case CEPH_SESSION_RENEWCAPS:           return "renewcaps";
+	case CEPH_SESSION_STALE:               return "stale";
+	case CEPH_SESSION_RECALL_STATE:        return "recall_state";
+	case CEPH_SESSION_FLUSHMSG:            return "flushmsg";
+	case CEPH_SESSION_FLUSHMSG_ACK:        return "flushmsg_ack";
+	case CEPH_SESSION_FORCE_RO:            return "force_ro";
+	case CEPH_SESSION_REJECT:              return "reject";
+	case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "request_flushmdlog";
+	default:                               return "???";
+	}
+}
+
+/*
+ * Return the printable name of a CEPH_MDS_OP_* request code, or "???"
+ * when the code is not one of those listed.
+ */
+const char *ceph_mds_op_name(int op)
+{
+	switch (op) {
+	case CEPH_MDS_OP_LOOKUP:  return "lookup";
+	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
+	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
+	case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
+	case CEPH_MDS_OP_GETATTR:  return "getattr";
+	case CEPH_MDS_OP_DUMMY:  return "dummy";
+	case CEPH_MDS_OP_SETXATTR: return "setxattr";
+	case CEPH_MDS_OP_SETATTR: return "setattr";
+	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_SETLAYOUT: return "setlayout";
+	case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
+	case CEPH_MDS_OP_READDIR: return "readdir";
+	case CEPH_MDS_OP_MKNOD: return "mknod";
+	case CEPH_MDS_OP_LINK: return "link";
+	case CEPH_MDS_OP_UNLINK: return "unlink";
+	case CEPH_MDS_OP_RENAME: return "rename";
+	case CEPH_MDS_OP_MKDIR: return "mkdir";
+	case CEPH_MDS_OP_RMDIR: return "rmdir";
+	case CEPH_MDS_OP_SYMLINK: return "symlink";
+	case CEPH_MDS_OP_CREATE: return "create";
+	case CEPH_MDS_OP_OPEN: return "open";
+	case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+	case CEPH_MDS_OP_LSSNAP: return "lssnap";
+	case CEPH_MDS_OP_MKSNAP: return "mksnap";
+	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+	case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
+	case CEPH_MDS_OP_READDIR_SNAPDIFF: return "readdir_snapdiff";
+	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
+	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
+	case CEPH_MDS_OP_FRAGMENTDIR: return "fragmentdir";
+	case CEPH_MDS_OP_EXPORTDIR: return "exportdir";
+	/* note: the printable name differs from the constant's suffix */
+	case CEPH_MDS_OP_FLUSH: return "flush_path";
+	case CEPH_MDS_OP_ENQUEUE_SCRUB: return "enqueue_scrub";
+	case CEPH_MDS_OP_REPAIR_FRAGSTATS: return "repair_fragstats";
+	case CEPH_MDS_OP_REPAIR_INODESTATS: return "repair_inodestats";
+	}
+	return "???";
+}
+
+/*
+ * Return the printable name of a CEPH_CAP_OP_* code, or "???" when the
+ * code is not one of those listed.
+ */
+const char *ceph_cap_op_name(int op)
+{
+	switch (op) {
+	case CEPH_CAP_OP_GRANT:         return "grant";
+	case CEPH_CAP_OP_REVOKE:        return "revoke";
+	case CEPH_CAP_OP_TRUNC:         return "trunc";
+	case CEPH_CAP_OP_EXPORT:        return "export";
+	case CEPH_CAP_OP_IMPORT:        return "import";
+	case CEPH_CAP_OP_UPDATE:        return "update";
+	case CEPH_CAP_OP_DROP:          return "drop";
+	case CEPH_CAP_OP_FLUSH:         return "flush";
+	case CEPH_CAP_OP_FLUSH_ACK:     return "flush_ack";
+	case CEPH_CAP_OP_FLUSHSNAP:     return "flushsnap";
+	case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+	case CEPH_CAP_OP_RELEASE:       return "release";
+	case CEPH_CAP_OP_RENEW:         return "renew";
+	default:                        return "???";
+	}
+}
+
+/*
+ * Return the printable name of a CEPH_MDS_LEASE_* action, or "???" when
+ * the value is not one of those listed.
+ */
+const char *ceph_lease_op_name(int o)
+{
+	switch (o) {
+	case CEPH_MDS_LEASE_REVOKE:
+		return "revoke";
+	case CEPH_MDS_LEASE_RELEASE:
+		return "release";
+	case CEPH_MDS_LEASE_RENEW:
+		return "renew";
+	case CEPH_MDS_LEASE_REVOKE_ACK:
+		return "revoke_ack";
+	default:
+		return "???";
+	}
+}
+
+/*
+ * Return the printable name of a CEPH_SNAP_OP_* code, or "???" when the
+ * value is not one of those listed.
+ */
+const char *ceph_snap_op_name(int o)
+{
+	switch (o) {
+	case CEPH_SNAP_OP_UPDATE:
+		return "update";
+	case CEPH_SNAP_OP_CREATE:
+		return "create";
+	case CEPH_SNAP_OP_DESTROY:
+		return "destroy";
+	case CEPH_SNAP_OP_SPLIT:
+		return "split";
+	default:
+		return "???";
+	}
+}
+
+/*
+ * Return the printable name of a CEPH_WATCH_EVENT_* code, or "???" when
+ * the value is not one of those listed.
+ */
+const char *ceph_watch_event_name(int e)
+{
+	switch (e) {
+	case CEPH_WATCH_EVENT_NOTIFY:
+		return "notify";
+	case CEPH_WATCH_EVENT_NOTIFY_COMPLETE:
+		return "notify_complete";
+	case CEPH_WATCH_EVENT_DISCONNECT:
+		return "disconnect";
+	default:
+		return "???";
+	}
+}
+
+/*
+ * Return the printable name of a POOL_OP_* code, or "???" when the
+ * value is not one of those listed.
+ */
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE:                return "create";
+	case POOL_OP_DELETE:                return "delete";
+	case POOL_OP_AUID_CHANGE:           return "auid change";  // (obsolete)
+	case POOL_OP_CREATE_SNAP:           return "create snap";
+	case POOL_OP_DELETE_SNAP:           return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	default:                            return "???";
+	}
+}
+
+/*
+ * Return the printable name of a CEPH_OSD_BACKOFF_OP_* code, or "???"
+ * when the value is not one of those listed.
+ */
+const char *ceph_osd_backoff_op_name(int op)
+{
+	switch (op) {
+	case CEPH_OSD_BACKOFF_OP_BLOCK:
+		return "block";
+	case CEPH_OSD_BACKOFF_OP_ACK_BLOCK:
+		return "ack-block";
+	case CEPH_OSD_BACKOFF_OP_UNBLOCK:
+		return "unblock";
+	default:
+		return "???";
+	}
+}
diff --git a/src/common/ceph_strings.h b/src/common/ceph_strings.h
new file mode 100644
index 000000000..2818061dc
--- /dev/null
+++ b/src/common/ceph_strings.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+
+// Lookup helpers for numeric protocol constants.  Every ceph_*_name()
+// function takes an integer code and returns a C string naming it.
+const char *ceph_entity_type_name(int type);
+const char *ceph_con_mode_name(int con_mode);
+const char *ceph_osd_op_name(int op);
+const char *ceph_osd_state_name(int s);
+const char *ceph_release_name(int r);
+// Release number <-> 64-bit feature mask.
+// NOTE(review): exact mapping lives in the implementation; not visible here.
+std::uint64_t ceph_release_features(int r);
+int ceph_release_from_features(std::uint64_t features);
+const char *ceph_osd_watch_op_name(int o);
+const char *ceph_osd_alloc_hint_flag_name(int f);
+const char *ceph_mds_state_name(int s);
+const char *ceph_session_op_name(int op);
+const char *ceph_mds_op_name(int op);
+const char *ceph_cap_op_name(int op);
+const char *ceph_lease_op_name(int o);
+const char *ceph_snap_op_name(int o);
+const char *ceph_watch_event_name(int e);
+const char *ceph_pool_op_name(int op);
+const char *ceph_osd_backoff_op_name(int op);
diff --git a/src/common/ceph_time.cc b/src/common/ceph_time.cc
new file mode 100644
index 000000000..4af48a8a2
--- /dev/null
+++ b/src/common/ceph_time.cc
@@ -0,0 +1,350 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+// For ceph_timespec
+#include "ceph_time.h"
+
+#include <fmt/chrono.h>
+#include <fmt/ostream.h>
+
+#include "log/LogClock.h"
+#include "config.h"
+#include "strtol.h"
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+
+
+#ifndef NSEC_PER_SEC
+#define NSEC_PER_SEC 1000000000ULL
+#endif
+
+// clock_gettime() implementation compiled only inside the enclosing
+// `#if defined(__APPLE__)` section.
+//
+// CLOCK_REALTIME is answered from gettimeofday(), whose microsecond
+// field is scaled up to nanoseconds.  Every other clk_id is answered
+// from mach_absolute_time() converted with the Mach timebase ratio.
+//
+// Returns 0 on success; if gettimeofday() fails its non-zero result is
+// returned and *tp is left untouched.
+int clock_gettime(int clk_id, struct timespec *tp)
+{
+  if (clk_id == CLOCK_REALTIME) {
+    // gettimeofday is much faster than clock_get_time
+    struct timeval now;
+    int ret = gettimeofday(&now, NULL);
+    if (ret)
+      return ret;
+    tp->tv_sec = now.tv_sec;
+    tp->tv_nsec = now.tv_usec * 1000L;
+  } else {
+    uint64_t t = mach_absolute_time();
+    // Fetched lazily: a zero denominator marks "not queried yet".
+    // NOTE(review): this first-use initialisation is not synchronised;
+    // confirm concurrent first calls are acceptable.
+    static mach_timebase_info_data_t timebase_info;
+    if (timebase_info.denom == 0) {
+      (void)mach_timebase_info(&timebase_info);
+    }
+    // ticks * numer / denom gives nanoseconds.
+    // NOTE(review): the multiplication happens before the division, so a
+    // large numer could overflow 64 bits -- confirm this cannot occur.
+    auto nanos = t * timebase_info.numer / timebase_info.denom;
+    tp->tv_sec = nanos / NSEC_PER_SEC;
+    tp->tv_nsec = nanos - (tp->tv_sec * NSEC_PER_SEC);
+  }
+  return 0;
+}
+#endif
+
+using namespace std::literals;
+
+namespace ceph {
+using std::chrono::seconds;
+using std::chrono::nanoseconds;
+// Store a real_clock time point in `ts`: whole seconds since the epoch
+// in tv_sec, the remaining nanoseconds in tv_nsec.
+void real_clock::to_ceph_timespec(const time_point& t,
+				  struct ceph_timespec& ts) {
+  const auto since_epoch = t.time_since_epoch();
+  ts.tv_sec = to_time_t(t);
+  ts.tv_nsec = (since_epoch % seconds(1)).count();
+}
+
+// By-value flavour of the conversion above.
+struct ceph_timespec real_clock::to_ceph_timespec(const time_point& t) {
+  struct ceph_timespec out;
+  to_ceph_timespec(t, out);
+  return out;
+}
+
+// Rebuild a real_clock time point from the seconds and nanoseconds
+// fields of a ceph_timespec.
+real_clock::time_point real_clock::from_ceph_timespec(
+  const struct ceph_timespec& ts) {
+  const auto whole = seconds(ts.tv_sec);
+  const auto frac = nanoseconds(ts.tv_nsec);
+  return time_point(whole + frac);
+}
+
+// Store a coarse_real_clock time point in `ts`: whole seconds since the
+// epoch in tv_sec, the remaining nanoseconds in tv_nsec.
+void coarse_real_clock::to_ceph_timespec(const time_point& t,
+					 struct ceph_timespec& ts) {
+  const auto since_epoch = t.time_since_epoch();
+  ts.tv_sec = to_time_t(t);
+  ts.tv_nsec = (since_epoch % 1s).count();
+}
+
+// By-value flavour of the conversion above.
+struct ceph_timespec coarse_real_clock::to_ceph_timespec(
+  const time_point& t) {
+  struct ceph_timespec out;
+  to_ceph_timespec(t, out);
+  return out;
+}
+
+// Rebuild a coarse_real_clock time point from the seconds and
+// nanoseconds fields of a ceph_timespec.
+coarse_real_clock::time_point coarse_real_clock::from_ceph_timespec(
+  const struct ceph_timespec& ts) {
+  const auto whole = seconds(ts.tv_sec);
+  const auto frac = nanoseconds(ts.tv_nsec);
+  return time_point(whole + frac);
+}
+
+
+using std::chrono::duration_cast;
+using std::chrono::seconds;
+using std::chrono::microseconds;
+
+// Steady (monotonic) clocks: print the time since the clock's epoch as
+// fixed-point seconds followed by 's'.
+template<typename Clock,
+	 typename std::enable_if<Clock::is_steady>::type*>
+std::ostream& operator<<(std::ostream& m,
+			 const std::chrono::time_point<Clock>& t) {
+  const std::chrono::duration<double> since_epoch(t.time_since_epoch());
+  m << std::fixed << since_epoch.count() << 's';
+  return m;
+}
+
+// Non-steady (wall-clock) clocks: print the time point as a local-time,
+// ISO 8601 style timestamp, YYYY-MM-DDTHH:MM:SS.uuuuuu followed by the
+// "%z" UTC offset.  The stream's fill character is saved and restored.
+template<typename Clock,
+	 typename std::enable_if<!Clock::is_steady>::type*>
+std::ostream& operator<<(std::ostream& m,
+			 const std::chrono::time_point<Clock>& t) {
+  // Zero-padded, right-aligned fields for the duration of this call.
+  m.setf(std::ios::right);
+  char oldfill = m.fill();
+  m.fill('0');
+  // localtime.  this looks like an absolute time.
+  //  conform to http://en.wikipedia.org/wiki/ISO_8601
+  struct tm bdt;
+  time_t tt = Clock::to_time_t(t);
+  localtime_r(&tt, &bdt);
+  // Numeric UTC offset of the broken-down local time (e.g. "+0100").
+  char tz[32] = { 0 };
+  strftime(tz, sizeof(tz), "%z", &bdt);
+  m << std::setw(4) << (bdt.tm_year+1900)  // full four-digit year
+    << '-' << std::setw(2) << (bdt.tm_mon+1)
+    << '-' << std::setw(2) << bdt.tm_mday
+    << 'T'
+    << std::setw(2) << bdt.tm_hour
+    << ':' << std::setw(2) << bdt.tm_min
+    << ':' << std::setw(2) << bdt.tm_sec
+    // sub-second part, truncated to microseconds
+    << "." << std::setw(6) << duration_cast<microseconds>(
+      t.time_since_epoch() % seconds(1)).count()
+    << tz;
+  m.fill(oldfill);
+  // NOTE(review): this clears the `right` flag unconditionally rather
+  // than restoring whatever adjustment the caller had set.
+  m.unsetf(std::ios::right);
+  return m;
+}
+
+template std::ostream&
+operator<< <mono_clock>(std::ostream& m, const mono_time& t);
+template std::ostream&
+operator<< <real_clock>(std::ostream& m, const real_time& t);
+template std::ostream&
+operator<< <coarse_mono_clock>(std::ostream& m, const coarse_mono_time& t);
+template std::ostream&
+operator<< <coarse_real_clock>(std::ostream& m, const coarse_real_time& t);
+
+// Render a timespan as a short, approximate string using a single unit.
+// Spans under two seconds print as fractional seconds; anything longer
+// is truncated to a whole count of the first unit whose threshold it is
+// under: s (< 120), m (< 120), h (< 48), d (< 14), w (< 12),
+// M (30-day months, < 24), otherwise y (365-day years).
+//
+// FIXME (carried over): a less crude formatter would be welcome.
+std::string timespan_str(timespan t)
+{
+  const uint64_t nsec = std::chrono::nanoseconds(t).count();
+  const uint64_t sec = nsec / 1'000'000'000;
+  const uint64_t min = sec / 60;
+  const uint64_t hr = min / 60;
+  const uint64_t day = hr / 24;
+
+  std::ostringstream ss;
+  if (nsec < 2'000'000'000) {
+    ss << ((float)nsec / 1'000'000'000) << "s";
+  } else if (sec < 120) {
+    ss << sec << "s";
+  } else if (min < 120) {
+    ss << min << "m";
+  } else if (hr < 48) {
+    ss << hr << "h";
+  } else if (day < 14) {
+    ss << day << "d";
+  } else if (day / 7 < 12) {
+    ss << (day / 7) << "w";
+  } else if (day / 30 < 24) {
+    ss << (day / 30) << "M";
+  } else {
+    ss << (day / 365) << "y";
+  }
+  return ss.str();
+}
+
+// Render a timespan exactly as a concatenation of unit terms, largest
+// unit first (y, mo, w, d, h, m, s).  A unit is emitted only when its
+// count reaches that unit's minimum (1 for years, 3 for months, 2 for
+// the rest); otherwise its time is carried down into the smaller units.
+// Years are 365 days and months 30 days.  A zero span yields "".
+std::string exact_timespan_str(timespan t)
+{
+  uint64_t nsec = std::chrono::nanoseconds(t).count();
+  uint64_t sec = nsec / 1'000'000'000;
+  nsec %= 1'000'000'000;
+
+  std::ostringstream ss;
+  // Emit "<count><suffix>" for one unit and remove it from `sec`, but
+  // only when at least `min_count` whole units are present.
+  auto peel = [&ss, &sec](uint64_t unit_sec, uint64_t min_count,
+			  const char *suffix) {
+    const uint64_t count = sec / unit_sec;
+    if (count >= min_count) {
+      ss << count << suffix;
+      sec -= count * unit_sec;
+    }
+  };
+  peel(60 * 60 * 24 * 365, 1, "y");
+  peel(60 * 60 * 24 * 30, 3, "mo");
+  peel(60 * 60 * 24 * 7, 2, "w");
+  peel(60 * 60 * 24, 2, "d");
+  peel(60 * 60, 2, "h");
+  peel(60, 2, "m");
+
+  // Whatever is left is printed as seconds, fractional if needed.
+  if (nsec) {
+    ss << (((float)nsec / 1'000'000'000) + sec) << "s";
+  } else if (sec) {
+    ss << sec << "s";
+  }
+  return ss.str();
+}
+
+// Parse a human-readable duration such as "90", "5 min" or "1h 30m"
+// into whole seconds.
+//
+// The input is a sequence of terms.  Each term is a run of decimal
+// digits optionally followed by a unit name from the table below; a
+// term without a unit counts as seconds.  Whitespace may precede a
+// number and may separate a number from its unit.  All terms are
+// summed.  An empty or all-whitespace string yields zero seconds.
+//
+// Throws std::invalid_argument when a number is expected but missing,
+// when strict_strtoll() reports an error, when a unit is unknown, when
+// unexpected text follows a number, or when the result would not fit in
+// std::chrono::seconds.
+std::chrono::seconds parse_timespan(const std::string& s)
+{
+  static const std::map<std::string,int> units = {
+    { "s", 1 },
+    { "sec", 1 },
+    { "second", 1 },
+    { "seconds", 1 },
+    { "m", 60 },
+    { "min", 60 },
+    { "minute", 60 },
+    { "minutes", 60 },
+    { "h", 60*60 },
+    { "hr", 60*60 },
+    { "hour", 60*60 },
+    { "hours", 60*60 },
+    { "d", 24*60*60 },
+    { "day", 24*60*60 },
+    { "days", 24*60*60 },
+    { "w", 7*24*60*60 },
+    { "wk", 7*24*60*60 },
+    { "week", 7*24*60*60 },
+    { "weeks", 7*24*60*60 },
+    { "mo", 30*24*60*60 },
+    { "month", 30*24*60*60 },
+    { "months", 30*24*60*60 },
+    { "y", 365*24*60*60 },
+    { "yr", 365*24*60*60 },
+    { "year", 365*24*60*60 },
+    { "years", 365*24*60*60 },
+  };
+
+  // The <cctype> classifiers are only defined for values representable
+  // as unsigned char (or EOF), so never hand them a plain, possibly
+  // negative, char.
+  const auto at = [&s](std::string::size_type i) {
+    return static_cast<unsigned char>(s[i]);
+  };
+  // Largest second count the return type can hold; used to reject
+  // inputs that would otherwise overflow.
+  const auto limit = std::chrono::seconds::max().count();
+
+  auto r = 0s;
+  std::string::size_type pos = 0;
+  while (pos < s.size()) {
+    // skip whitespace
+    while (pos < s.size() && std::isspace(at(pos))) {
+      ++pos;
+    }
+    if (pos >= s.size()) {
+      break;
+    }
+
+    // consume any digits
+    const auto val_start = pos;
+    while (pos < s.size() && std::isdigit(at(pos))) {
+      ++pos;
+    }
+    if (val_start == pos) {
+      throw std::invalid_argument("expected digit");
+    }
+    const auto n = s.substr(val_start, pos - val_start);
+    std::string err;
+    auto val = strict_strtoll(n.c_str(), 10, &err);
+    if (err.size()) {
+      throw std::invalid_argument(err);
+    }
+
+    // skip whitespace
+    while (pos < s.size() && std::isspace(at(pos))) {
+      ++pos;
+    }
+
+    // consume unit
+    const auto unit_start = pos;
+    while (pos < s.size() && std::isalpha(at(pos))) {
+      ++pos;
+    }
+    if (unit_start != pos) {
+      const auto unit = s.substr(unit_start, pos - unit_start);
+      const auto p = units.find(unit);
+      if (p == units.end()) {
+	throw std::invalid_argument("unrecognized unit '"s + unit + "'");
+      }
+      // val holds only digits, so it is non-negative here.
+      if (val > limit / p->second) {
+	throw std::invalid_argument("timespan out of range");
+      }
+      val *= p->second;
+    } else if (pos < s.size()) {
+      throw std::invalid_argument("unexpected trailing '"s + s.substr(pos) + "'");
+    }
+    if (val > limit - r.count()) {
+      throw std::invalid_argument("timespan out of range");
+    }
+    r += std::chrono::seconds(val);
+  }
+  return r;
+}
+
+}
+
+// Stream output for std::chrono durations, defined in namespace std and
+// explicitly instantiated below for the duration types listed there.
+// NOTE(review): C++20 declares its own operator<< for durations and the
+// standard restricts what may be added to namespace std -- confirm this
+// does not conflict on the toolchains in use.
+namespace std {
+template<typename Rep, typename Period>
+ostream& operator<<(ostream& m, const chrono::duration<Rep, Period>& t) {
+  // Floating-point reps and sub-second periods are converted to
+  // float-valued seconds and printed with the "{:.9}" format spec;
+  // everything else is printed by fmt using the duration's own unit.
+  if constexpr (chrono::treat_as_floating_point_v<Rep> ||
+                Period::den > 1) {
+    using seconds_t = chrono::duration<float>;
+    ::fmt::print(m, "{:.9}", chrono::duration_cast<seconds_t>(t));
+  } else {
+    ::fmt::print(m, "{}", t);
+  }
+  return m;
+}
+
+// Explicit instantiations: the template body lives in this .cc file, so
+// only these four duration types get a definition here.
+template ostream&
+operator<< <::ceph::timespan::rep,
+            ::ceph::timespan::period> (ostream&, const ::ceph::timespan&);
+
+template ostream&
+operator<< <::ceph::signedspan::rep,
+            ::ceph::signedspan::period> (ostream&, const ::ceph::signedspan&);
+
+template ostream&
+operator<< <chrono::seconds::rep,
+            chrono::seconds::period> (ostream&, const chrono::seconds&);
+
+template ostream&
+operator<< <chrono::milliseconds::rep,
+            chrono::milliseconds::period> (ostream&, const chrono::milliseconds&);
+
+} // namespace std
diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h
new file mode 100644
index 000000000..292fa91ac
--- /dev/null
+++ b/src/common/ceph_time.h
@@ -0,0 +1,557 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef COMMON_CEPH_TIME_H
+#define COMMON_CEPH_TIME_H
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <optional>
+#if FMT_VERSION >= 90000
+#include <fmt/ostream.h>
+#endif
+#include <sys/time.h>
+
+#if defined(__APPLE__)
+#include <sys/_types/_timespec.h>
+
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+
+int clock_gettime(int clk_id, struct timespec *tp);
+#endif
+
+#ifdef _WIN32
+// Clock precision:
+// mingw < 8.0.1:
+//   * CLOCK_REALTIME: ~10-55ms (GetSystemTimeAsFileTime)
+// mingw >= 8.0.1:
+//   * CLOCK_REALTIME: <1us (GetSystemTimePreciseAsFileTime)
+//   * CLOCK_REALTIME_COARSE: ~10-55ms (GetSystemTimeAsFileTime)
+//
+// * CLOCK_MONOTONIC: <1us if TSC is usable, ~10-55ms otherwise
+//                    (QueryPerformanceCounter)
+// https://github.com/mirror/mingw-w64/commit/dcd990ed423381cf35702df9495d44f1979ebe50
+#ifndef CLOCK_REALTIME_COARSE
+  #define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#endif
+#ifndef CLOCK_MONOTONIC_COARSE
+  #define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+#endif
+#endif
+
+struct ceph_timespec;
+
+namespace ceph {
+// Currently we use a 64-bit count of nanoseconds.
+
+// We could, if we wished, use a struct holding a uint64_t count
+// of seconds and a uint32_t count of nanoseconds.
+
+// At least this way we can change it to something else if we
+// want.
+typedef uint64_t rep;
+
+
+// duration is the concrete time representation for our code in the
+// case that we are only interested in durations between now and the
+// future. Using it means we don't have to have EVERY function that
+// deals with a duration be a template. We can do so for user-facing
+// APIs, however.
+typedef std::chrono::duration<rep, std::nano> timespan;
+
+
+// Like the above but signed.
+typedef int64_t signed_rep;
+
+// Similar to the above but for durations that can specify
+// differences between now and a time point in the past.
+typedef std::chrono::duration<signed_rep, std::nano> signedspan;
+
+// Convert any std::chrono duration into a struct timeval: whole seconds
+// go to tv_sec, the remainder (truncated to microseconds) to tv_usec.
+template<typename Duration>
+struct timeval to_timeval(Duration d) {
+  const auto whole = std::chrono::duration_cast<std::chrono::seconds>(d);
+  const auto rest =
+    std::chrono::duration_cast<std::chrono::microseconds>(d - whole);
+  struct timeval tv;
+  tv.tv_sec = whole.count();
+  tv.tv_usec = rest.count();
+  return tv;
+}
+
+// We define our own clocks so we can have our choice of all time
+// sources supported by the operating system. With the standard
+// library the resolution and cost are unspecified. (For example,
+// the libc++ system_clock class gives only microsecond
+// resolution.)
+
+// One potential issue is that we should accept system_clock
+// timepoints in user-facing APIs alongside (or instead of)
+// ceph::real_clock times.
+
+// High-resolution real-time clock
+// Wall-clock time source backed by clock_gettime(CLOCK_REALTIME), with
+// nanosecond-granular time points (duration is ceph::timespan) and
+// conversions to and from time_t, timespec, ceph_timespec, timeval and
+// double seconds.
+class real_clock {
+public:
+  typedef timespan duration;
+  typedef duration::rep rep;
+  typedef duration::period period;
+  // The second template parameter defaults to the clock's duration
+  // type.
+  typedef std::chrono::time_point<real_clock> time_point;
+  static constexpr const bool is_steady = false;
+
+  // Current time as reported by CLOCK_REALTIME.
+  static time_point now() noexcept {
+    struct timespec ts;
+    clock_gettime(CLOCK_REALTIME, &ts);
+    return from_timespec(ts);
+  }
+
+  // time_point::min() serves as this clock's "zero" sentinel.
+  static bool is_zero(const time_point& t) {
+    return (t == time_point::min());
+  }
+
+  static time_point zero() {
+    return time_point::min();
+  }
+
+  // Allow conversion to/from any clock with the same interface as
+  // std::chrono::system_clock.
+  //
+  // This overload converts a foreign clock's time point into a
+  // real_clock time_point: whole seconds via Clock::to_time_t() plus the
+  // sub-second remainder.  std::chrono::seconds is spelled out in full
+  // because an unqualified name would not be found when the template is
+  // instantiated.
+  template<typename Clock, typename Duration>
+  static time_point to_system_time_point(
+    const std::chrono::time_point<Clock, Duration>& t) {
+    return time_point(std::chrono::seconds(Clock::to_time_t(t)) +
+		      std::chrono::duration_cast<duration>(
+			t.time_since_epoch() % std::chrono::seconds(1)));
+  }
+  // This overload converts a real_clock time_point into the foreign
+  // clock's time point; Clock and Duration must be given explicitly.
+  template<typename Clock, typename Duration>
+  static std::chrono::time_point<Clock, Duration> to_system_time_point(
+    const time_point& t) {
+    return (Clock::from_time_t(to_time_t(t)) +
+	    std::chrono::duration_cast<Duration>(t.time_since_epoch() %
+						 std::chrono::seconds(1)));
+  }
+
+  // Whole seconds since the epoch (sub-second part discarded).
+  static time_t to_time_t(const time_point& t) noexcept {
+    return std::chrono::duration_cast<std::chrono::seconds>(t.time_since_epoch()).count();
+  }
+  static time_point from_time_t(const time_t& t) noexcept {
+    return time_point(std::chrono::seconds(t));
+  }
+
+  // timespec conversions: tv_sec holds whole seconds, tv_nsec the
+  // remaining nanoseconds.
+  static void to_timespec(const time_point& t, struct timespec& ts) {
+    ts.tv_sec = to_time_t(t);
+    ts.tv_nsec = (t.time_since_epoch() % std::chrono::seconds(1)).count();
+  }
+  static struct timespec to_timespec(const time_point& t) {
+    struct timespec ts;
+    to_timespec(t, ts);
+    return ts;
+  }
+  static time_point from_timespec(const struct timespec& ts) {
+    return time_point(std::chrono::seconds(ts.tv_sec) +
+		      std::chrono::nanoseconds(ts.tv_nsec));
+  }
+
+  // ceph_timespec conversions; defined out of line.
+  static void to_ceph_timespec(const time_point& t,
+			       struct ceph_timespec& ts);
+  static struct ceph_timespec to_ceph_timespec(const time_point& t);
+  static time_point from_ceph_timespec(const struct ceph_timespec& ts);
+
+  // timeval conversions: the sub-second part is truncated to
+  // microseconds.
+  static void to_timeval(const time_point& t, struct timeval& tv) {
+    tv.tv_sec = to_time_t(t);
+    tv.tv_usec = std::chrono::duration_cast<std::chrono::microseconds>(
+      t.time_since_epoch() % std::chrono::seconds(1)).count();
+  }
+  static struct timeval to_timeval(const time_point& t) {
+    struct timeval tv;
+    to_timeval(t, tv);
+    return tv;
+  }
+  static time_point from_timeval(const struct timeval& tv) {
+    return time_point(std::chrono::seconds(tv.tv_sec) +
+		      std::chrono::microseconds(tv.tv_usec));
+  }
+
+  // Seconds since the epoch as a double, and the reverse.
+  static double to_double(const time_point& t) {
+    return std::chrono::duration<double>(t.time_since_epoch()).count();
+  }
+  static time_point from_double(const double d) {
+    return time_point(std::chrono::duration_cast<duration>(
+			std::chrono::duration<double>(d)));
+  }
+};
+
+// Low-resolution but presumably faster real-time clock
+// Wall-clock time source that prefers the platform's coarse/fast
+// realtime clock id when one is defined.  Offers the same time_t,
+// timespec, ceph_timespec, timeval and double conversions as real_clock.
+class coarse_real_clock {
+public:
+  typedef timespan duration;
+  typedef duration::rep rep;
+  typedef duration::period period;
+  // The second template parameter defaults to the clock's duration
+  // type.
+  typedef std::chrono::time_point<coarse_real_clock> time_point;
+  static constexpr const bool is_steady = false;
+
+  // Current time, read from the first clock id available at compile
+  // time: CLOCK_REALTIME_COARSE, CLOCK_REALTIME_FAST, else CLOCK_REALTIME.
+  static time_point now() noexcept {
+    struct timespec ts;
+#if defined(CLOCK_REALTIME_COARSE)
+    // Linux systems have _COARSE clocks.
+    clock_gettime(CLOCK_REALTIME_COARSE, &ts);
+#elif defined(CLOCK_REALTIME_FAST)
+    // BSD systems have _FAST clocks.
+    clock_gettime(CLOCK_REALTIME_FAST, &ts);
+#else
+    // And if we find neither, you may wish to consult your system's
+    // documentation.
+#warning Falling back to CLOCK_REALTIME, may be slow.
+    clock_gettime(CLOCK_REALTIME, &ts);
+#endif
+    return from_timespec(ts);
+  }
+
+  // time_point::min() serves as this clock's "zero" sentinel.
+  static bool is_zero(const time_point& t) {
+    return (t == time_point::min());
+  }
+
+  static time_point zero() {
+    return time_point::min();
+  }
+
+  // Whole seconds since the epoch (sub-second part discarded).
+  static time_t to_time_t(const time_point& t) noexcept {
+    return std::chrono::duration_cast<std::chrono::seconds>(
+      t.time_since_epoch()).count();
+  }
+  static time_point from_time_t(const time_t t) noexcept {
+    return time_point(std::chrono::seconds(t));
+  }
+
+  // timespec conversions: tv_sec holds whole seconds, tv_nsec the
+  // remaining nanoseconds.
+  static void to_timespec(const time_point& t, struct timespec& ts) {
+    ts.tv_sec = to_time_t(t);
+    ts.tv_nsec = (t.time_since_epoch() % std::chrono::seconds(1)).count();
+  }
+  static struct timespec to_timespec(const time_point& t) {
+    struct timespec ts;
+    to_timespec(t, ts);
+    return ts;
+  }
+  static time_point from_timespec(const struct timespec& ts) {
+    return time_point(std::chrono::seconds(ts.tv_sec) +
+		      std::chrono::nanoseconds(ts.tv_nsec));
+  }
+
+  // ceph_timespec conversions; defined out of line.
+  static void to_ceph_timespec(const time_point& t,
+			       struct ceph_timespec& ts);
+  static struct ceph_timespec to_ceph_timespec(const time_point& t);
+  static time_point from_ceph_timespec(const struct ceph_timespec& ts);
+
+  // timeval conversions: the sub-second part is truncated to
+  // microseconds.
+  static void to_timeval(const time_point& t, struct timeval& tv) {
+    tv.tv_sec = to_time_t(t);
+    tv.tv_usec = std::chrono::duration_cast<std::chrono::microseconds>(
+      t.time_since_epoch() % std::chrono::seconds(1)).count();
+  }
+  static struct timeval to_timeval(const time_point& t) {
+    struct timeval tv;
+    to_timeval(t, tv);
+    return tv;
+  }
+  static time_point from_timeval(const struct timeval& tv) {
+    return time_point(std::chrono::seconds(tv.tv_sec) +
+		      std::chrono::microseconds(tv.tv_usec));
+  }
+
+  // Seconds since the epoch as a double, and the reverse.
+  static double to_double(const time_point& t) {
+    return std::chrono::duration<double>(t.time_since_epoch()).count();
+  }
+  static time_point from_double(const double d) {
+    return time_point(std::chrono::duration_cast<duration>(
+			std::chrono::duration<double>(d)));
+  }
+};
+
+// High-resolution monotonic clock
+// Steady clock backed by clock_gettime(CLOCK_MONOTONIC).  Time points
+// use ceph::timespan as their duration type.
+class mono_clock {
+public:
+  using duration = timespan;
+  using rep = duration::rep;
+  using period = duration::period;
+  using time_point = std::chrono::time_point<mono_clock>;
+  static constexpr bool is_steady = true;
+
+  // Sample CLOCK_MONOTONIC and wrap the reading in a time_point.
+  static time_point now() noexcept {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    const auto whole = std::chrono::seconds(ts.tv_sec);
+    const auto frac = std::chrono::nanoseconds(ts.tv_nsec);
+    return time_point(whole + frac);
+  }
+
+  // time_point::min() serves as this clock's "zero" sentinel.
+  static bool is_zero(const time_point& t) {
+    return t == zero();
+  }
+
+  static time_point zero() {
+    return time_point::min();
+  }
+};
+
+// Low-resolution but, I would hope or there's no point, faster
+// monotonic clock
+// Steady clock that prefers the platform's coarse/fast monotonic clock
+// id when one is defined.  Time points use ceph::timespan as their
+// duration type.
+class coarse_mono_clock {
+public:
+  typedef timespan duration;
+  typedef duration::rep rep;
+  typedef duration::period period;
+  typedef std::chrono::time_point<coarse_mono_clock> time_point;
+  static constexpr const bool is_steady = true;
+
+  // Current reading from the first clock id available at compile time:
+  // CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_FAST, else CLOCK_MONOTONIC.
+  static time_point now() noexcept {
+    struct timespec ts;
+#if defined(CLOCK_MONOTONIC_COARSE)
+    // Linux systems have _COARSE clocks.
+    clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
+#elif defined(CLOCK_MONOTONIC_FAST)
+    // BSD systems have _FAST clocks.
+    clock_gettime(CLOCK_MONOTONIC_FAST, &ts);
+#else
+    // And if we find neither, you may wish to consult your system's
+    // documentation.
+#warning Falling back to CLOCK_MONOTONIC, may be slow.
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+#endif
+    return time_point(std::chrono::seconds(ts.tv_sec) +
+		      std::chrono::nanoseconds(ts.tv_nsec));
+  }
+
+  // time_point::min() serves as this clock's "zero" sentinel.
+  static bool is_zero(const time_point& t) {
+    return (t == time_point::min());
+  }
+
+  static time_point zero() {
+    return time_point::min();
+  }
+};
+
+namespace time_detail {
+// So that our subtractions produce negative spans rather than
+// arithmetic underflow.
+//
+// Duration overload: the result type is the common type of the two
+// operand durations after each rep has been replaced by its signed
+// counterpart.
+template<typename Rep1, typename Period1, typename Rep2,
+	 typename Period2>
+inline auto difference(std::chrono::duration<Rep1, Period1> minuend,
+		       std::chrono::duration<Rep2, Period2> subtrahend)
+  -> typename std::common_type<
+  std::chrono::duration<typename std::make_signed<Rep1>::type,
+			Period1>,
+  std::chrono::duration<typename std::make_signed<Rep2>::type,
+			Period2> >::type {
+  // srep is that signed common duration type.  Both operands are
+  // converted to it first and their raw counts are then subtracted.
+  using srep =
+    typename std::common_type<
+      std::chrono::duration<typename std::make_signed<Rep1>::type,
+			    Period1>,
+    std::chrono::duration<typename std::make_signed<Rep2>::type,
+			  Period2> >::type;
+  return srep(srep(minuend).count() - srep(subtrahend).count());
+}
+
+// Time-point overload (both points must belong to the same Clock):
+// forwards the two time_since_epoch() values to the duration overload.
+template<typename Clock, typename Duration1, typename Duration2>
+inline auto difference(
+  typename std::chrono::time_point<Clock, Duration1> minuend,
+  typename std::chrono::time_point<Clock, Duration2> subtrahend)
+  -> typename std::common_type<
+  std::chrono::duration<typename std::make_signed<
+			  typename Duration1::rep>::type,
+			typename Duration1::period>,
+  std::chrono::duration<typename std::make_signed<
+			  typename Duration2::rep>::type,
+			typename Duration2::period> >::type {
+  return difference(minuend.time_since_epoch(),
+		    subtrahend.time_since_epoch());
+}
+}
+
+// Please note that the coarse clocks are disjoint. You cannot
+// subtract a real_clock timepoint from a coarse_real_clock
+// timepoint as, from C++'s perspective, they are disjoint types.
+
+// This is not necessarily bad. If I sample a mono_clock and then a
+// coarse_mono_clock, the coarse_mono_clock's time could potentially
+// be previous to the mono_clock's time (just due to differing
+// resolution) which would be Incorrect.
+
+// This is not horrible, though, since you can use an idiom like
+// mono_clock::time_point(coarsepoint.time_since_epoch()) to unwrap
+// and rewrap if you know what you're doing.
+
+
+// Actual wall-clock times
+typedef real_clock::time_point real_time;
+typedef coarse_real_clock::time_point coarse_real_time;
+
+// Monotonic times should never be serialized or communicated
+// between machines, since they are incomparable. Thus we also don't
+// make any provision for converting between
+// std::chrono::steady_clock time and ceph::mono_clock time.
+typedef mono_clock::time_point mono_time;
+typedef coarse_mono_clock::time_point coarse_mono_time;
+
+// Round `duration` down to a multiple of `precision` by removing the
+// remainder of duration % precision.
+template<typename Rep1, typename Ratio1, typename Rep2, typename Ratio2>
+auto floor(const std::chrono::duration<Rep1, Ratio1>& duration,
+	   const std::chrono::duration<Rep2, Ratio2>& precision) ->
+  typename std::common_type<std::chrono::duration<Rep1, Ratio1>,
+			    std::chrono::duration<Rep2, Ratio2> >::type {
+  const auto remainder = duration % precision;
+  return duration - remainder;
+}
+
+// Round `duration` up to a multiple of `precision`: strip the remainder
+// and, when that remainder was positive, add one more `precision`.
+template<typename Rep1, typename Ratio1, typename Rep2, typename Ratio2>
+auto ceil(const std::chrono::duration<Rep1, Ratio1>& duration,
+	  const std::chrono::duration<Rep2, Ratio2>& precision) ->
+  typename std::common_type<std::chrono::duration<Rep1, Ratio1>,
+			    std::chrono::duration<Rep2, Ratio2> >::type {
+  const auto remainder = duration % precision;
+  const auto rounded_down = duration - remainder;
+  if (remainder > remainder.zero()) {
+    return rounded_down + precision;
+  }
+  return rounded_down;
+}
+
+// Round a time point down to a multiple of `precision`, measured from
+// the clock's epoch, by applying the duration floor() to
+// time_since_epoch().
+template<typename Clock, typename Duration, typename Rep, typename Ratio>
+auto floor(const std::chrono::time_point<Clock, Duration>& timepoint,
+	   const std::chrono::duration<Rep, Ratio>& precision) ->
+  std::chrono::time_point<Clock,
+			  typename std::common_type<
+			    Duration, std::chrono::duration<Rep, Ratio>
+			    >::type> {
+  using rounded_duration = typename std::common_type<
+    Duration, std::chrono::duration<Rep, Ratio> >::type;
+  using rounded_point = std::chrono::time_point<Clock, rounded_duration>;
+  return rounded_point(floor(timepoint.time_since_epoch(), precision));
+}
+// Round a time point up to a multiple of `precision`, measured from the
+// clock's epoch, by applying the duration ceil() to time_since_epoch().
+template<typename Clock, typename Duration, typename Rep, typename Ratio>
+auto ceil(const std::chrono::time_point<Clock, Duration>& timepoint,
+	  const std::chrono::duration<Rep, Ratio>& precision) ->
+  std::chrono::time_point<Clock,
+			  typename std::common_type<
+			    Duration,
+			    std::chrono::duration<Rep, Ratio> >::type> {
+  using rounded_duration = typename std::common_type<
+    Duration, std::chrono::duration<Rep, Ratio> >::type;
+  using rounded_point = std::chrono::time_point<Clock, rounded_duration>;
+  return rounded_point(ceil(timepoint.time_since_epoch(), precision));
+}
+
+// Build a timespan from a count of seconds given as a double.
+inline timespan make_timespan(const double d) {
+  const std::chrono::duration<double> secs(d);
+  return std::chrono::duration_cast<timespan>(secs);
+}
+// Like make_timespan(), except that a value of zero yields an empty
+// optional instead of a zero-length span.
+inline std::optional<timespan> maybe_timespan(const double d) {
+  if (!d) {
+    return std::nullopt;
+  }
+  return std::make_optional(make_timespan(d));
+}
+
+template<typename Clock,
+	 typename std::enable_if<!Clock::is_steady>::type* = nullptr>
+std::ostream& operator<<(std::ostream& m,
+			 const std::chrono::time_point<Clock>& t);
+template<typename Clock,
+	 typename std::enable_if<Clock::is_steady>::type* = nullptr>
+std::ostream& operator<<(std::ostream& m,
+			 const std::chrono::time_point<Clock>& t);
+
+// The way std::chrono handles the return type of subtraction is not
+// wonderful. The difference of two unsigned types SHOULD be signed.
+
+// Each overload below subtracts two time points of the same clock and
+// returns the result as a signedspan, delegating to
+// time_detail::difference().
+
+// real_clock time points.
+inline signedspan operator -(real_time minuend,
+			     real_time subtrahend) {
+  return time_detail::difference(minuend, subtrahend);
+}
+
+// coarse_real_clock time points.
+inline signedspan operator -(coarse_real_time minuend,
+			     coarse_real_time subtrahend) {
+  return time_detail::difference(minuend, subtrahend);
+}
+
+// mono_clock time points.
+inline signedspan operator -(mono_time minuend,
+			     mono_time subtrahend) {
+  return time_detail::difference(minuend, subtrahend);
+}
+
+// coarse_mono_clock time points.
+inline signedspan operator -(coarse_mono_time minuend,
+			     coarse_mono_time subtrahend) {
+  return time_detail::difference(minuend, subtrahend);
+}
+
+// We could add specializations of time_point - duration and
+// time_point + duration to assert on overflow, but I don't think we
+// should.
+// Magnitude of a signed span, returned as an (unsigned) timespan.
+inline timespan abs(signedspan z) {
+  if (z > signedspan::zero()) {
+    return std::chrono::duration_cast<timespan>(z);
+  }
+  // Zero or negative: negate the raw count.
+  return timespan(-z.count());
+}
+// Convert a signed span to a timespan, clamping negative inputs to zero.
+//
+// Negative inputs used to be rejected with
+//   ceph_assert(z >= signedspan::zero());
+// but a kernel bug appears to trigger that assert.  It has been seen on:
+//   centos 8.1: 4.18.0-147.el8.x86_64
+//   debian 10.3: 4.19.0-8-amd64
+//   debian 10.1: 4.19.67-2+deb10u1
+//   ubuntu 18.04
+// see bugs:
+//   https://tracker.ceph.com/issues/43365
+//   https://tracker.ceph.com/issues/44078
+inline timespan to_timespan(signedspan z) {
+  const signedspan non_negative =
+    (z < signedspan::zero()) ? signedspan::zero() : z;
+  return std::chrono::duration_cast<timespan>(non_negative);
+}
+
+std::string timespan_str(timespan t);
+std::string exact_timespan_str(timespan t);
+std::chrono::seconds parse_timespan(const std::string& s);
+
+// detects presence of Clock::to_timespec() and from_timespec()
+//
+// Primary template: used when the expression in the specialization
+// below is ill-formed, so the trait is false.
+template <typename Clock, typename = std::void_t<>>
+struct converts_to_timespec : std::false_type {};
+
+// Chosen only when Clock::from_timespec(Clock::to_timespec(tp)) is a
+// valid expression for a Clock::time_point value tp.
+template <typename Clock>
+struct converts_to_timespec<Clock, std::void_t<decltype(
+  Clock::from_timespec(Clock::to_timespec(
+			 std::declval<typename Clock::time_point>()))
+  )>> : std::true_type {};
+
+// Convenience variable template for the trait's value.
+template <typename Clock>
+constexpr bool converts_to_timespec_v = converts_to_timespec<Clock>::value;
+
+// Return duration `t` as a count of seconds of arithmetic type Rep
+// (converted with duration_cast).
+template<typename Rep, typename T>
+static Rep to_seconds(T t) {
+  using seconds_as_rep = std::chrono::duration<Rep>;
+  return std::chrono::duration_cast<seconds_as_rep>(t).count();
+}
+
+// Return duration `t` as a count of microseconds of arithmetic type Rep
+// (converted with duration_cast).
+template<typename Rep, typename T>
+static Rep to_microseconds(T t) {
+  using micros_as_rep = std::chrono::duration<Rep, std::micro>;
+  return std::chrono::duration_cast<micros_as_rep>(t).count();
+}
+
+} // namespace ceph
+
+namespace std {
+template<typename Rep, typename Period>
+ostream& operator<<(ostream& m, const chrono::duration<Rep, Period>& t);
+}
+
+#if FMT_VERSION >= 90000
+// Let fmt format any std::chrono::time_point<Clock> by inheriting
+// fmt::ostream_formatter.  Compiled only when FMT_VERSION >= 90000 (see
+// the enclosing #if).
+template<typename Clock>
+struct fmt::formatter<std::chrono::time_point<Clock>> : fmt::ostream_formatter {};
+#endif
+
+#endif // COMMON_CEPH_TIME_H
diff --git a/src/common/ceph_timer.h b/src/common/ceph_timer.h
new file mode 100644
index 000000000..2be077834
--- /dev/null
+++ b/src/common/ceph_timer.h
@@ -0,0 +1,312 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef COMMON_CEPH_TIMER_H
+#define COMMON_CEPH_TIMER_H
+
+#include <cassert>
+#include <condition_variable>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <boost/intrusive/set.hpp>
+
+#include "include/function2.hpp"
+#include "include/compat.h"
+
+#include "common/detail/construct_suspended.h"
+
+namespace bi = boost::intrusive;
+namespace ceph {
+
+// Compared to the SafeTimer this does fewer allocations (you
+// don't have to allocate a new Context every time you
+// want to cue the next tick.)
+//
+// It also does not share a lock with the caller. If you call
+// cancel event, it either cancels the event (and returns true) or
+// you missed it. If this does not work for you, you can set up a
+// flag and mutex of your own.
+//
+// You get to pick your clock. I like mono_clock, since I usually
+// want to wait FOR a given duration. real_clock is worthwhile if
+// you want to wait UNTIL a specific moment of wallclock time.  If
+// you want you can set up a timer that executes a function after
+// you use up ten seconds of CPU time.
+
+template<typename TC>
+class timer {
+  using sh = bi::set_member_hook<bi::link_mode<bi::normal_link>>;
+
+  // One scheduled callback. A queued event is linked into both the
+  // `schedule` set (ordered by time, then id) and the `events` set
+  // (keyed by id) through its two hooks.
+  struct event {
+    typename TC::time_point t = typename TC::time_point::min();
+    std::uint64_t id = 0;
+    fu2::unique_function<void()> f;
+
+    sh schedule_link;
+    sh event_link;
+
+    event() = default;
+    event(typename TC::time_point t, std::uint64_t id,
+	  fu2::unique_function<void()> f) : t(t), id(id), f(std::move(f)) {}
+
+    event(const event&) = delete;
+    event& operator =(const event&) = delete;
+
+    event(event&&) = delete;
+    event& operator =(event&&) = delete;
+
+    // Order by firing time; ties are broken by id.
+    bool operator <(const event& e) const noexcept {
+      return t == e.t ? id < e.id : t < e.t;
+    }
+  };
+  // Key extractor letting `events` be searched by event id.
+  struct id_key {
+    using type = std::uint64_t;
+    const type& operator ()(const event& e) const noexcept {
+      return e.id;
+    }
+  };
+
+  // Events ordered by (time, id); begin() is the next one to fire.
+  bi::set<event, bi::member_hook<event, sh, &event::schedule_link>,
+	  bi::constant_time_size<false>> schedule;
+
+  // The same events, indexed by id for adjust/cancel.
+  bi::set<event, bi::member_hook<event, sh, &event::event_link>,
+	  bi::constant_time_size<false>,
+	  bi::key_of_value<id_key>> events;
+
+  std::mutex lock;
+  std::condition_variable cond;
+
+  // The event whose callback the timer thread is currently executing,
+  // or nullptr. reschedule_me() resets it to nullptr to tell the timer
+  // thread that the event was re-queued and must not be deleted.
+  event* running = nullptr;
+  std::uint64_t next_id = 0;
+
+  bool suspended;
+  std::thread thread;
+
+  // Body of the timer thread: run every due event in schedule order,
+  // then sleep until the next deadline (or until notified). Exits when
+  // `suspended` becomes true.
+  void timer_thread() {
+    std::unique_lock l(lock);
+    while (!suspended) {
+      auto now = TC::now();
+
+      while (!schedule.empty()) {
+	auto p = schedule.begin();
+	// Should we wait for the future?
+	if (p->t > now)
+	  break;
+
+	auto& e = *p;
+	schedule.erase(e);
+	events.erase(e.id);
+
+	// Since we have only one thread it is impossible to have more
+	// than one running event
+	running = &e;
+
+	l.unlock();
+	// Call through the reference rather than the iterator: the
+	// element `p` referred to was just erased from the set.
+	e.f();
+	l.lock();
+
+	if (running) {
+	  running = nullptr;
+	  delete &e;
+	} // Otherwise the event requeued itself
+      }
+
+      if (suspended)
+	break;
+      if (schedule.empty()) {
+	cond.wait(l);
+      } else {
+	// Since wait_until takes its parameter by reference, passing
+	// the time /in the event/ is unsafe, as it might be canceled
+	// while we wait.
+	const auto t = schedule.begin()->t;
+	cond.wait_until(l, t);
+      }
+    }
+  }
+
+public:
+  // Create a running timer; its thread is started immediately.
+  timer() : suspended(false) {
+    thread = std::thread(&timer::timer_thread, this);
+    ceph_pthread_setname(thread.native_handle(), "ceph_timer");
+  }
+
+  // Create a suspended timer, jobs will be executed in order when
+  // it is resumed.
+  timer(construct_suspended_t) : suspended(true) {}
+
+  timer(const timer&) = delete;
+  timer& operator =(const timer&) = delete;
+
+  // Stops the timer thread, then discards every event still queued.
+  ~timer() {
+    suspend();
+    cancel_all_events();
+  }
+
+  // Suspend operation of the timer (and let its thread die).
+  // Does nothing if the timer is already suspended.
+  void suspend() {
+    std::unique_lock l(lock);
+    if (suspended)
+      return;
+
+    suspended = true;
+    cond.notify_one();
+    // Release the lock before joining so the timer thread can take it
+    // and observe `suspended`.
+    l.unlock();
+    thread.join();
+  }
+
+  // Resume operation of the timer. (Must have been previously
+  // suspended.) Does nothing if the timer is not suspended.
+  void resume() {
+    std::unique_lock l(lock);
+    if (!suspended)
+      return;
+
+    suspended = false;
+    assert(!thread.joinable());
+    thread = std::thread(&timer::timer_thread, this);
+  }
+
+  // Schedule an event in the relative future
+  template<typename Callable, typename... Args>
+  std::uint64_t add_event(typename TC::duration duration,
+			  Callable&& f, Args&&... args) {
+    return add_event(TC::now() + duration,
+		     std::forward<Callable>(f),
+		     std::forward<Args>(args)...);
+  }
+
+  // Schedule an event in the absolute future
+  template<typename Callable, typename... Args>
+  std::uint64_t add_event(typename TC::time_point when,
+			  Callable&& f, Args&&... args) {
+    std::lock_guard l(lock);
+    auto e = std::make_unique<event>(when, ++next_id,
+				     std::bind(std::forward<Callable>(f),
+					       std::forward<Args>(args)...));
+    auto id = e->id;
+    auto i = schedule.insert(*e);
+    // From here on the intrusive sets reference the event; it is freed
+    // with `delete` once it has run or been cancelled.
+    events.insert(*(e.release()));
+
+    /* If the event we have just inserted comes before everything
+     * else, we need to adjust our timeout. */
+    if (i.first == schedule.begin())
+      cond.notify_one();
+
+    // Previously each event was a context, identified by a
+    // pointer, and each context to be called only once. Since you
+    // can queue the same function pointer, member function,
+    // lambda, or functor up multiple times, identifying things by
+    // function for the purposes of cancellation is no longer
+    // suitable. Thus:
+    return id;
+  }
+
+  // Adjust the timeout of a currently-scheduled event (relative)
+  bool adjust_event(std::uint64_t id, typename TC::duration duration) {
+    return adjust_event(id, TC::now() + duration);
+  }
+
+  // Adjust the timeout of a currently-scheduled event (absolute).
+  // Returns false if no queued event has the given id.
+  bool adjust_event(std::uint64_t id, typename TC::time_point when) {
+    std::lock_guard l(lock);
+
+    auto it = events.find(id);
+
+    if (it == events.end())
+      return false;
+
+    auto& e = *it;
+
+    // The time is part of the ordering key of `schedule`, so the event
+    // has to be removed and re-inserted around the change.
+    schedule.erase(e);
+    e.t = when;
+    auto i = schedule.insert(e);
+
+    // If the adjusted event is now the earliest one, wake the timer
+    // thread so it recomputes its timeout instead of sleeping until
+    // the deadline it computed before the adjustment.
+    if (i.first == schedule.begin())
+      cond.notify_one();
+
+    return true;
+  }
+
+  // Cancel an event. If the event has already come and gone (or you
+  // never submitted it) you will receive false. Otherwise you will
+  // receive true and it is guaranteed the event will not execute.
+  bool cancel_event(const std::uint64_t id) {
+    std::lock_guard l(lock);
+    auto p = events.find(id);
+    if (p == events.end()) {
+      return false;
+    }
+
+    auto& e = *p;
+    events.erase(e.id);
+    schedule.erase(e);
+    delete &e;
+
+    return true;
+  }
+
+  // Reschedules a currently running event in the relative
+  // future. Must be called only from an event executed by this
+  // timer. If you have a function that can be called either from
+  // this timer or some other way, it is your responsibility to make
+  // sure it can tell the difference and does not call
+  // reschedule_me in the non-timer case.
+  //
+  // Returns an event id. If you had an event_id from the first
+  // scheduling, replace it with this return value.
+  std::uint64_t reschedule_me(typename TC::duration duration) {
+    return reschedule_me(TC::now() + duration);
+  }
+
+  // Reschedules a currently running event in the absolute
+  // future. Must be called only from an event executed by this
+  // timer. If you have a function that can be called either from
+  // this timer or some other way, it is your responsibility to make
+  // sure it can tell the difference and does not call
+  // reschedule_me in the non-timer case.
+  //
+  // Returns an event id. If you had an event_id from the first
+  // scheduling, replace it with this return value.
+  std::uint64_t reschedule_me(typename TC::time_point when) {
+    assert(std::this_thread::get_id() == thread.get_id());
+    std::lock_guard l(lock);
+    running->t = when;
+    std::uint64_t id = ++next_id;
+    running->id = id;
+    schedule.insert(*running);
+    events.insert(*running);
+
+    // Hacky, but keeps us from being deleted
+    running = nullptr;
+
+    // Same function, but you get a new ID.
+    return id;
+  }
+
+  // Remove all events from the queue.
+  void cancel_all_events() {
+    std::lock_guard l(lock);
+    while (!events.empty()) {
+      auto p = events.begin();
+      event& e = *p;
+      schedule.erase(e);
+      events.erase(e.id);
+      delete &e;
+    }
+  }
+}; // timer
+} // namespace ceph
+
+#endif
diff --git a/src/common/cmdparse.cc b/src/common/cmdparse.cc
new file mode 100644
index 000000000..009e6678c
--- /dev/null
+++ b/src/common/cmdparse.cc
@@ -0,0 +1,738 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/common_fwd.h"
+#include "common/cmdparse.h"
+#include "common/Formatter.h"
+#include "common/debug.h"
+#include "common/strtol.h"
+#include "json_spirit/json_spirit.h"
+
+using std::is_same_v;
+using std::ostringstream;
+using std::string;
+using std::stringstream;
+using std::string_view;
+using std::vector;
+
+using namespace std::literals;
+
+/**
+ * Given a cmddesc like "foo baz name=bar,type=CephString",
+ * return the prefix "foo baz".
+ */
+namespace ceph::common {
+std::string cmddesc_get_prefix(const std::string_view &cmddesc)
+{
+  // stringstream cannot be constructed from a string_view, so copy first.
+  std::stringstream input{std::string(cmddesc)};
+  std::string prefix;
+  bool need_separator = false;
+  for (std::string token; std::getline(input, token, ' ');
+       need_separator = true) {
+    // The first word carrying ',' or '=' starts the argument
+    // descriptors; the prefix ends just before it.
+    if (token.find_first_of(",=") != std::string::npos) {
+      break;
+    }
+    if (need_separator) {
+      prefix += " ";
+    }
+    prefix += token;
+  }
+  return prefix;
+}
+
+using arg_desc_t = std::map<std::string_view, std::string_view>;
+
+// Collect every "key=value" item of a comma-separated descriptor into a
+// map. Items without '=' are skipped.
+arg_desc_t cmddesc_get_args(const string_view cmddesc)
+{
+  arg_desc_t parsed;
+  auto record_pair = [&parsed](auto item) {
+    const auto eq_pos = item.find('=');
+    if (eq_pos == item.npos) {
+      // no '=': it should be the command word itself, nothing to store
+      return;
+    }
+    // e.g. name="name" means the arg dict will be titled 'name'
+    const auto key = item.substr(0, eq_pos);
+    const auto val = item.substr(eq_pos + 1);
+    parsed[key] = val;
+  };
+  for_each_substr(cmddesc, ",", record_pair);
+  return parsed;
+}
+
+// Rewrite a command description for peers that predate CephBool: each
+// descriptor with type=CephBool becomes a CephChoices descriptor whose
+// only choice is "--<name>" with '_' replaced by '-'. When nothing needs
+// rewriting the input is returned as is.
+std::string cmddesc_get_prenautilus_compat(const std::string &cmddesc)
+{
+  std::vector<std::string> words;
+  bool rewritten = false;
+  stringstream input(cmddesc);
+  for (std::string token; std::getline(input, token, ' '); ) {
+    // a word without ',' or '=' is a plain word: keep it untouched
+    if (token.find_first_of(",=") == string::npos) {
+      words.push_back(token);
+      continue;
+    }
+    auto fields = cmddesc_get_args(token);
+    const auto type_it = fields.find("type");
+    if (type_it == fields.end() || type_it->second != "CephBool") {
+      words.push_back(token);
+      continue;
+    }
+
+    // Instruct legacy clients or mons to send --foo-bar string in place
+    // of a 'true'/'false' value
+    std::string flag{"--"};
+    flag += fields["name"];
+    std::replace(flag.begin(), flag.end(), '_', '-');
+    fields["type"] = "CephChoices";
+    // `fields` only stores views; `flag` stays alive until the
+    // descriptor has been re-serialized below.
+    fields["strings"] = flag;
+
+    std::string joined;
+    const char *separator = "";
+    for (const auto& [key, value] : fields) {
+      joined += separator;
+      joined += key;
+      joined += "=";
+      joined += value;
+      separator = ",";
+    }
+    words.push_back(joined);
+    rewritten = true;
+  }
+
+  if (!rewritten) {
+    return cmddesc;
+  }
+
+  std::string result;
+  const char *separator = "";
+  for (const auto& w : words) {
+    result += separator;
+    result += w;
+    separator = " ";
+  }
+  return result;
+}
+
+/**
+ * Read a command description list out of cmd, and dump it to f.
+ * A signature description is a set of space-separated words;
+ * see MonCommands.h for more info.
+ */
+
+void
+dump_cmd_to_json(Formatter *f, uint64_t features, const string& cmd)
+{
+  // The whole signature goes into a container the caller has already
+  // opened. A plain word is dumped as the string "arg"; a descriptor is
+  // dumped as an object holding its key/value pairs.
+  const bool peer_has_nautilus = HAVE_FEATURE(features, SERVER_NAUTILUS);
+  const bool peer_has_quincy = HAVE_FEATURE(features, SERVER_QUINCY);
+  const auto is_true = [](auto text) {
+    return text == "true" || text == "True";
+  };
+
+  bool positional = true;
+  stringstream input(cmd);
+  for (std::string token; std::getline(input, token, ' '); ) {
+    // everything after a bare "--" is non-positional
+    if (token == "--") {
+      positional = false;
+      continue;
+    }
+
+    // no ',' or '=': a literal word
+    if (token.find_first_of(",=") == string::npos) {
+      f->dump_string("arg", token);
+      continue;
+    }
+
+    // gather the descriptor keywords and name the object after "name"
+    auto fields = cmddesc_get_args(token);
+    f->open_object_section(fields["name"]);
+
+    // Compatibility for pre-nautilus clients that don't know about
+    // CephBool. `legacy_flag` must outlive the dump loop below because
+    // `fields` only stores a view of it.
+    std::string legacy_flag;
+    if (!peer_has_nautilus) {
+      const auto type_it = fields.find("type");
+      if (type_it != fields.end() && type_it->second == "CephBool") {
+	// Instruct legacy clients to send --foo-bar string in place
+	// of a 'true'/'false' value
+	std::ostringstream flag_stream;
+	flag_stream << "--" << fields["name"];
+	legacy_flag = flag_stream.str();
+	std::replace(legacy_flag.begin(), legacy_flag.end(), '_', '-');
+
+	fields["type"] = "CephChoices";
+	fields["strings"] = legacy_flag;
+      }
+    }
+
+    if (!positional) {
+      fields["positional"] = "false";
+    }
+
+    // dump every key, "name" included
+    for (const auto& [key, value] : fields) {
+      if (key == "positional") {
+	// only peers with the quincy feature get this key at all
+	if (peer_has_quincy) {
+	  f->dump_bool(key, is_true(value));
+	}
+      } else if (key == "req" && peer_has_quincy) {
+	f->dump_bool(key, is_true(value));
+      } else {
+	f->dump_string(key, value);
+      }
+    }
+    f->close_section(); // attribute object for individual desc
+  }
+}
+
+// Dump one command as an object named `secname` containing its
+// signature (array "sig") and its help text (string "help").
+void
+dump_cmd_and_help_to_json(Formatter *jf,
+			  uint64_t features,
+			  const string& secname,
+			  const string& cmdsig,
+			  const string& helptext)
+{
+  jf->open_object_section(secname);
+  {
+    jf->open_array_section("sig");
+    dump_cmd_to_json(jf, features, cmdsig);
+    jf->close_section(); // sig array
+  }
+  jf->dump_string("help", helptext);
+  jf->close_section(); // cmd
+}
+
+// Dump one command description as an object named `secname` containing
+// its signature (array "sig") followed by "help", "module", "perm" and
+// "flags".
+void
+dump_cmddesc_to_json(Formatter *jf,
+		     uint64_t features,
+		     const string& secname,
+		     const string& cmdsig,
+		     const string& helptext,
+		     const string& module,
+		     const string& perm,
+		     uint64_t flags)
+{
+  jf->open_object_section(secname);
+  {
+    jf->open_array_section("sig");
+    dump_cmd_to_json(jf, features, cmdsig);
+    jf->close_section(); // sig array
+  }
+  jf->dump_string("help", helptext);
+  jf->dump_string("module", module);
+  jf->dump_string("perm", perm);
+  jf->dump_int("flags", flags);
+  jf->close_section(); // cmd
+}
+
+// Dump every entry of `cmdmap` to formatter `f`, using the dump call
+// that matches the type currently held by each value. Vector values are
+// written as an array section whose elements are all named "item".
+void cmdmap_dump(const cmdmap_t &cmdmap, Formatter *f)
+{
+  ceph_assert(f != nullptr);
+
+  // Visitor bound to one map entry: `fmt` is the output formatter and
+  // `name` the entry's key.
+  class entry_dumper : public boost::static_visitor<void>
+  {
+    Formatter *fmt;
+    std::string const &name;
+
+  public:
+    entry_dumper(Formatter *fmt_, std::string const &name_)
+      : fmt(fmt_), name(name_)
+    {}
+
+    void operator()(const std::string &s) const
+    {
+      fmt->dump_string(name, s);
+    }
+
+    void operator()(const bool &b) const
+    {
+      fmt->dump_bool(name, b);
+    }
+
+    void operator()(const int64_t &n) const
+    {
+      fmt->dump_int(name, n);
+    }
+
+    void operator()(const double &d) const
+    {
+      fmt->dump_float(name, d);
+    }
+
+    void operator()(const std::vector<std::string> &items) const
+    {
+      fmt->open_array_section(name);
+      for (const auto& item : items) {
+	fmt->dump_string("item", item);
+      }
+      fmt->close_section();
+    }
+
+    void operator()(const std::vector<int64_t> &items) const
+    {
+      fmt->open_array_section(name);
+      for (const auto item : items) {
+	fmt->dump_int("item", item);
+      }
+      fmt->close_section();
+    }
+
+    void operator()(const std::vector<double> &items) const
+    {
+      fmt->open_array_section(name);
+      for (const auto item : items) {
+	fmt->dump_float("item", item);
+      }
+      fmt->close_section();
+    }
+  };
+
+  // No enclosing "cmdmap" section is opened here; entries go straight
+  // into whatever section the caller has open.
+  for (const auto& [key, value] : cmdmap) {
+    boost::apply_visitor(entry_dumper(f, key), value);
+  }
+}
+
+
+/** Parse the JSON text formed by concatenating the strings of 'cmd' into
+ * a map from field name to value.
+ *
+ * The JSON must be an object. Each member must be a string, bool,
+ * integer, real, or an array whose elements are all strings, all
+ * integers, or all reals; anything else is an error. An empty array is
+ * stored as an empty vector<string>.
+ *
+ * 'mapp' points to the caller's map; members are stored into it as they
+ * are converted, so on failure it may already hold some of them.
+ * 'ss' receives the error text; it is written only when the function
+ * returns false.
+ *
+ * Returns true on success, false on any parse or type error. */
+
+bool
+cmdmap_from_json(const vector<string>& cmd, cmdmap_t *mapp, std::ostream& ss)
+{
+  json_spirit::mValue v;
+
+  string fullcmd;
+  // First, join all cmd strings
+  for (auto& c : cmd)
+    fullcmd += c;
+
+  try {
+    if (!json_spirit::read(fullcmd, v))
+      throw std::runtime_error("unparseable JSON " + fullcmd);
+    if (v.type() != json_spirit::obj_type)
+      throw std::runtime_error("not JSON object " + fullcmd);
+
+    // make sure all contents are simple types or flat arrays of them
+    json_spirit::mObject o = v.get_obj();
+    for (auto it = o.begin(); it != o.end(); ++it) {
+
+      // ok, marshal it into our string->cmd_vartype map, or throw an
+      // exception if it's not a simple datatype.  This is kind of
+      // annoying, since json_spirit has a boost::variant inside it
+      // already, but it's not public.  Oh well.
+
+      switch (it->second.type()) {
+
+      case json_spirit::obj_type:
+      default:
+	throw std::runtime_error("JSON array/object not allowed " + fullcmd);
+
+      case json_spirit::array_type:
+	{
+	  // array is a vector of values.  Unpack it to a vector
+	  // of strings, doubles, or int64_t, the only types we handle.
+	  // The first element decides which; every element is then
+	  // checked individually.
+	  const vector<json_spirit::mValue>& spvals = it->second.get_array();
+	  if (spvals.empty()) {
+	    // if an empty array is acceptable, the caller should always check for
+	    // vector<string> if the expected value of "vector<int64_t>" in the
+	    // cmdmap is missing.
+	    (*mapp)[it->first] = vector<string>();
+	  } else if (spvals.front().type() == json_spirit::str_type) {
+	    vector<string> outv;
+	    for (const auto& sv : spvals) {
+	      if (sv.type() != json_spirit::str_type) {
+		throw std::runtime_error("Can't handle arrays of multiple types");
+	      }
+	      outv.push_back(sv.get_str());
+	    }
+	    (*mapp)[it->first] = std::move(outv);
+	  } else if (spvals.front().type() == json_spirit::int_type) {
+	    vector<int64_t> outv;
+	    for (const auto& sv : spvals) {
+	      // check the element itself, not the first element again
+	      if (sv.type() != json_spirit::int_type) {
+		throw std::runtime_error("Can't handle arrays of multiple types");
+	      }
+	      outv.push_back(sv.get_int64());
+	    }
+	    (*mapp)[it->first] = std::move(outv);
+	  } else if (spvals.front().type() == json_spirit::real_type) {
+	    vector<double> outv;
+	    for (const auto& sv : spvals) {
+	      // check the element itself, not the first element again.
+	      // NOTE(review): integer elements are still let through
+	      // because get_real() is expected to convert them (so
+	      // [1.5, 2] keeps parsing as before) -- confirm against
+	      // json_spirit.
+	      if (sv.type() != json_spirit::real_type &&
+		  sv.type() != json_spirit::int_type) {
+		throw std::runtime_error("Can't handle arrays of multiple types");
+	      }
+	      outv.push_back(sv.get_real());
+	    }
+	    (*mapp)[it->first] = std::move(outv);
+	  } else {
+	    throw std::runtime_error("Can't handle arrays of types other than "
+				     "int, string, or double");
+	  }
+	}
+	break;
+      case json_spirit::str_type:
+	(*mapp)[it->first] = it->second.get_str();
+	break;
+
+      case json_spirit::bool_type:
+	(*mapp)[it->first] = it->second.get_bool();
+	break;
+
+      case json_spirit::int_type:
+	(*mapp)[it->first] = it->second.get_int64();
+	break;
+
+      case json_spirit::real_type:
+	(*mapp)[it->first] = it->second.get_real();
+	break;
+      }
+    }
+    return true;
+  } catch (const std::runtime_error &e) {
+    ss << e.what();
+    return false;
+  }
+}
+
+// Variant visitor that renders the held value as text by streaming it
+// into an ostringstream with operator<<.
+class stringify_visitor : public boost::static_visitor<string>
+{
+public:
+  template <typename T>
+  string operator()(T &operand) const
+  {
+    ostringstream rendered;
+    rendered << operand;
+    return rendered.str();
+  }
+};
+
+// Render the value currently held by `v` as a string, using
+// stringify_visitor.
+string
+cmd_vartype_stringify(const cmd_vartype &v)
+{
+  const stringify_visitor to_text{};
+  return boost::apply_visitor(to_text, v);
+}
+
+
+void
+handle_bad_get(CephContext *cct, const string& k, const char *tname)
+{
+  ostringstream errstr;
+  int status;
+  const char *typestr = abi::__cxa_demangle(tname, 0, 0, &status);
+  if (status != 0) 
+    typestr = tname;
+  errstr << "bad boost::get: key " << k << " is not type " << typestr;
+  lderr(cct) << errstr.str() << dendl;
+
+  ostringstream oss;
+  oss << ClibBackTrace(1);
+  lderr(cct) << oss.str() << dendl;
+
+  if (status == 0)
+    free((char *)typestr);
+}
+
+// Parse `s` as a base-10 integer that carries no sign character.
+// Returns the parsed value on success. Returns -EINVAL when `s` starts
+// with '-' or '+', and -1 when strict_strtol() reports an error or the
+// result is negative. When `pss` is non-null an explanation is written
+// to it on failure.
+long parse_pos_long(const char *s, std::ostream *pss)
+{
+  // write the pieces of an error message, but only if the caller gave
+  // us a stream
+  const auto report = [pss](const auto&... parts) {
+    if (pss)
+      (*pss << ... << parts);
+  };
+
+  const bool has_sign = (*s == '-' || *s == '+');
+  if (has_sign) {
+    report("expected numerical value, got: ", s);
+    return -EINVAL;
+  }
+
+  string err;
+  const long parsed = strict_strtol(s, 10, &err);
+  if (parsed == 0 && !err.empty()) {
+    report(err);
+    return -1;
+  }
+  if (parsed < 0) {
+    report("unable to parse positive integer '", s, "'");
+    return -1;
+  }
+  return parsed;
+}
+
+// Parse an OSD id given either as "NNN" or as "osd.NNN".
+// Returns the id on success, the negative result of parse_pos_long()
+// when the number cannot be parsed, or -ERANGE when the id is larger
+// than 0xffff. `pss` may be null; when it is not, an explanation is
+// written to it on failure.
+int parse_osd_id(const char *s, std::ostream *pss)
+{
+  // osd.NNN?
+  if (strncmp(s, "osd.", 4) == 0) {
+    s += 4;
+  }
+
+  // NNN?
+  ostringstream ss;
+  long id = parse_pos_long(s, &ss);
+  if (id < 0) {
+    // parse_pos_long() treats its stream as optional; do the same here
+    // instead of dereferencing a null pointer.
+    if (pss)
+      *pss << ss.str();
+    return id;
+  }
+  if (id > 0xffff) {
+    if (pss)
+      *pss << "osd id " << id << " is too large";
+    return -ERANGE;
+  }
+  return id;
+}
+
+namespace {
+// Split `s` on any character of `delims`, skipping empty tokens, and
+// call `f` on each token in order. Stops and returns true as soon as `f`
+// returns true; returns false when no token satisfied `f`.
+template <typename Func>
+bool find_first_in(std::string_view s, const char *delims, Func&& f)
+{
+  for (auto start = s.find_first_not_of(delims); start != s.npos; ) {
+    // drop everything before the current token
+    s.remove_prefix(start);
+    const auto stop = s.find_first_of(delims);
+    if (f(s.substr(0, stop))) {
+      return true;
+    }
+    // when `stop` is npos this yields npos as well and ends the loop
+    start = s.find_first_not_of(delims, stop);
+  }
+  return false;
+}
+
+// Convert `s` to a number of type T using the matching std::sto*
+// function. Supported types: int, long, long long and double; any other
+// T is rejected at compile time.
+// Like the std::sto* functions it calls, this throws
+// std::invalid_argument or std::out_of_range on bad input.
+template<typename T>
+T str_to_num(const std::string& s)
+{
+  if constexpr (std::is_same_v<T, int>) {
+    return std::stoi(s);
+  } else if constexpr (std::is_same_v<T, long>) {
+    return std::stol(s);
+  } else if constexpr (std::is_same_v<T, long long>) {
+    return std::stoll(s);
+  } else if constexpr (std::is_same_v<T, double>) {
+    return std::stod(s);
+  } else {
+    // Without this branch an unsupported T would flow off the end of a
+    // non-void function. The condition depends on T so that it only
+    // fires when this branch is actually instantiated.
+    static_assert(!std::is_same_v<T, T>,
+		  "str_to_num: unsupported numeric type");
+  }
+}
+
+// Check `value` against the descriptor's optional "range" entry, written
+// as "min" or "min|max". Without a "range" entry every value passes;
+// without a max the upper bound is numeric_limits<T>::max(). On
+// failure a message is written to `os` and false is returned.
+template<typename T>
+bool arg_in_range(T value, const arg_desc_t& desc, std::ostream& os) {
+  const auto range_it = desc.find("range");
+  if (range_it != desc.end()) {
+    auto bounds = get_str_list(string(range_it->second), "|");
+    const auto lower = str_to_num<T>(bounds.front());
+    const auto upper = (bounds.size() > 1)
+      ? str_to_num<T>(bounds.back())
+      : std::numeric_limits<T>::max();
+    const bool inside = !(value < lower || value > upper);
+    if (!inside) {
+      os << "'" << value << "' out of range: " << bounds;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Validate a string argument according to its descriptor type:
+//   CephIPAddr  - must be parseable by entity_addr_t::parse()
+//   CephChoices - must equal one of the '|'-separated "strings" choices
+//   other types - always accepted
+// On failure a message is written to `os` and false is returned.
+bool validate_str_arg(std::string_view value,
+		      std::string_view type,
+		      const arg_desc_t& desc,
+		      std::ostream& os)
+{
+  if (type == "CephIPAddr") {
+    entity_addr_t addr;
+    if (!addr.parse(value)) {
+      os << "failed to parse addr '" << value << "', should be ip:[port]";
+      return false;
+    }
+    return true;
+  }
+
+  if (type == "CephChoices") {
+    auto choices = desc.find("strings");
+    ceph_assert(choices != end(desc));
+    const auto allowed = choices->second;
+    const bool matched = find_first_in(allowed, "|", [value](auto choice) {
+      return (value == choice);
+    });
+    if (!matched) {
+      os << "'" << value << "' not belong to '" << allowed << "'";
+      return false;
+    }
+    return true;
+  }
+
+  // CephString or other types like CephPgid
+  return true;
+}
+
+// Validate a boolean argument. A value that is present and readable as
+// bool passes; a missing value passes only when the descriptor says
+// req=false, otherwise a message is written to `os`. A value that
+// cmd_getval() rejects (bad_cmd_get) fails without writing a message.
+bool validate_bool(const cmdmap_t& cmdmap,
+		  const arg_desc_t& desc,
+		  const std::string_view name,
+		  const std::string_view type,
+		  std::ostream& os)
+{
+  try {
+    bool v;
+    if (cmd_getval(cmdmap, name, v)) {
+      return true;
+    }
+  } catch (const bad_cmd_get&) {
+    return false;
+  }
+
+  // the argument is absent: acceptable only when explicitly optional
+  const auto req = desc.find("req");
+  if (req != end(desc) && req->second == "false") {
+    return true;
+  }
+  os << "missing required parameter: '" << name << "'";
+  return false;
+}
+
+// Validate argument `name` of element type T, read either as a single T
+// (is_vector == false) or as a vector<T> (is_vector == true).
+//   - a value cmd_getval() rejects (bad_cmd_get) fails silently
+//   - a missing vector is accepted; a missing scalar is accepted only
+//     when the descriptor says req=false, otherwise a message is
+//     written to `os`
+//   - present values are checked with validate_str_arg() for strings
+//     and arg_in_range() for int64_t/double
+template<bool is_vector,
+	 typename T,
+	 typename Value = std::conditional_t<is_vector,
+					     vector<T>,
+					     T>>
+bool validate_arg(const cmdmap_t& cmdmap,
+		  const arg_desc_t& desc,
+		  const std::string_view name,
+		  const std::string_view type,
+		  std::ostream& os)
+{
+  Value v;
+  bool present = false;
+  try {
+    present = cmd_getval(cmdmap, name, v);
+  } catch (const bad_cmd_get&) {
+    return false;
+  }
+
+  if (!present) {
+    if constexpr (is_vector) {
+      // an empty list is acceptable.
+      return true;
+    } else {
+      const auto req = desc.find("req");
+      if (req != end(desc) && req->second == "false") {
+	return true;
+      }
+      os << "missing required parameter: '" << name << "'";
+      return false;
+    }
+  }
+
+  auto check_one = [&](const T& item) {
+    if constexpr (is_same_v<std::string, T>) {
+      return validate_str_arg(item, type, desc, os);
+    } else if constexpr (is_same_v<int64_t, T> ||
+			 is_same_v<double, T>) {
+      return arg_in_range(item, desc, os);
+    }
+  };
+
+  if constexpr (is_vector) {
+    // valid when no element fails the check
+    return find_if_not(begin(v), end(v), check_one) == end(v);
+  } else {
+    return check_one(v);
+  }
+}
+} // anonymous namespace
+
+// Validate `cmdmap` against the command description `desc`. Each
+// space-separated word of `desc` that carries key=value pairs describes
+// one argument and must have "name" and "type" keys. Validation stops at
+// the first argument that fails; its message (if any) is in `os`.
+// Returns true when every described argument is valid.
+bool validate_cmd(const std::string& desc,
+		  const cmdmap_t& cmdmap,
+		  std::ostream& os)
+{
+  auto word_is_invalid = [&](auto word) {
+    auto fields = cmddesc_get_args(word);
+    if (fields.empty()) {
+      // a plain word of the command, not an argument descriptor
+      return false;
+    }
+    ceph_assert(fields.count("name"));
+    ceph_assert(fields.count("type"));
+    const auto name = fields["name"];
+    const auto type = fields["type"];
+    // a descriptor with an "n" key is validated as a list of values
+    const bool repeated = fields.count("n") > 0;
+
+    bool valid;
+    if (type == "CephInt") {
+      valid = repeated
+	? validate_arg<true, int64_t>(cmdmap, fields, name, type, os)
+	: validate_arg<false, int64_t>(cmdmap, fields, name, type, os);
+    } else if (type == "CephFloat") {
+      valid = repeated
+	? validate_arg<true, double>(cmdmap, fields, name, type, os)
+	: validate_arg<false, double>(cmdmap, fields, name, type, os);
+    } else if (type == "CephBool" && !repeated) {
+      valid = validate_bool(cmdmap, fields, name, type, os);
+    } else {
+      // every other type, including a repeated CephBool, is checked as
+      // string(s)
+      valid = repeated
+	? validate_arg<true, string>(cmdmap, fields, name, type, os)
+	: validate_arg<false, string>(cmdmap, fields, name, type, os);
+    }
+    return !valid;
+  };
+
+  return !find_first_in(desc, " ", word_is_invalid);
+}
+
+bool cmd_getval(const cmdmap_t& cmdmap,
+		std::string_view k, bool& val)
+{
+  /*
+   * Specialized getval for booleans.  CephBool didn't exist before Nautilus,
+   * so earlier clients are sent a CephChoices argdesc instead, and will
+   * send us a "--foo-bar" value string for boolean arguments.
+   *
+   * Returns false when `k` is absent, true (with `val` set) when it
+   * holds a bool or the expected legacy string, and throws bad_cmd_get
+   * for anything else.
+   */
+  const auto found = cmdmap.find(k);
+  if (found == cmdmap.end()) {
+    return false;
+  }
+
+  // native bool value
+  if (const bool *as_bool = boost::get<bool>(&found->second)) {
+    val = *as_bool;
+    return true;
+  }
+
+  // legacy form: the string "--<key>" with '_' replaced by '-'
+  if (const std::string *as_str = boost::get<std::string>(&found->second)) {
+    std::string expected{"--"};
+    expected += k;
+    std::replace(expected.begin(), expected.end(), '_', '-');
+    if (*as_str == expected) {
+      val = true;
+      return true;
+    }
+  }
+
+  throw bad_cmd_get(k, cmdmap);
+}
+
+// Read boolean `k`, falling back to the legacy CephChoices encoding when
+// the regular bool lookup throws bad_cmd_get: the value is then read as
+// a string and `val` is set to whether it equals "--<k>" with '_'
+// replaced by '-'. Returns false when `k` is absent.
+bool cmd_getval_compat_cephbool(
+  const cmdmap_t& cmdmap,
+  const std::string& k, bool& val)
+{
+  try {
+    return cmd_getval(cmdmap, k, val);
+  } catch (bad_cmd_get&) {
+    // try as legacy/compat CephChoices
+    std::string legacy;
+    const bool present = cmd_getval(cmdmap, k, legacy);
+    if (!present) {
+      return false;
+    }
+    std::string expected{"--"};
+    expected += k;
+    std::replace(expected.begin(), expected.end(), '_', '-');
+    val = (legacy == expected);
+    return true;
+  }
+}
+
+}
diff --git a/src/common/cmdparse.h b/src/common/cmdparse.h
new file mode 100644
index 000000000..fb7fb77f7
--- /dev/null
+++ b/src/common/cmdparse.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_COMMON_CMDPARSE_H
+#define CEPH_COMMON_CMDPARSE_H
+
+#include <vector>
+#include <stdexcept>
+#include <optional>
+#include <ostream>
+#include <boost/variant.hpp>
+#include "include/ceph_assert.h"	// boost clobbers this
+#include "include/common_fwd.h"
+#include "common/Formatter.h"
+#include "common/BackTrace.h"
+
+typedef boost::variant<std::string,
+		       bool,
+		       int64_t,
+		       double,
+		       std::vector<std::string>,
+		       std::vector<int64_t>,
+		       std::vector<double>>  cmd_vartype;
+typedef std::map<std::string, cmd_vartype, std::less<>> cmdmap_t;
+
+namespace ceph::common {
+std::string cmddesc_get_prefix(const std::string_view &cmddesc);
+std::string cmddesc_get_prenautilus_compat(const std::string &cmddesc);
+void dump_cmd_to_json(ceph::Formatter *f, uint64_t features,
+                      const std::string& cmd);
+void dump_cmd_and_help_to_json(ceph::Formatter *f,
+			       uint64_t features,
+			       const std::string& secname,
+			       const std::string& cmd,
+			       const std::string& helptext);
+void dump_cmddesc_to_json(ceph::Formatter *jf,
+		          uint64_t features,
+		          const std::string& secname,
+		          const std::string& cmdsig,
+		          const std::string& helptext,
+		          const std::string& module,
+		          const std::string& perm,
+		          uint64_t flags);
+bool cmdmap_from_json(const std::vector<std::string>& cmd, cmdmap_t *mapp,
+		      std::ostream& ss);
+void cmdmap_dump(const cmdmap_t &cmdmap, ceph::Formatter *f);
+void handle_bad_get(CephContext *cct, const std::string& k, const char *name);
+
+std::string cmd_vartype_stringify(const cmd_vartype& v);
+
+// Exception thrown when a cmdmap field exists but does not hold the
+// requested type. what() returns "bad or missing field '<name>'".
+struct bad_cmd_get : public std::exception {
+  std::string desc;
+  // `f` is the field name. The map argument is accepted for the
+  // callers' convenience but is not used.
+  bad_cmd_get(std::string_view f, const cmdmap_t& /* cmdmap */) {
+    desc += "bad or missing field '";
+    desc += f;
+    desc += "'";
+  }
+  // noexcept replaces the deprecated dynamic exception specification
+  // throw(); the two are equivalent.
+  const char *what() const noexcept override {
+    return desc.c_str();
+  }
+};
+
+bool cmd_getval(const cmdmap_t& cmdmap,
+		std::string_view k, bool& val);
+
+bool cmd_getval_compat_cephbool(
+  const cmdmap_t& cmdmap,
+  const std::string& k, bool& val);
+
+// Look up `k` in `cmdmap` and copy its value into `val`.
+// Returns false (leaving `val` untouched) when `k` is absent, true when
+// the value was copied. Throws bad_cmd_get when `k` is present but does
+// not hold a T.
+template <typename T>
+bool cmd_getval(const cmdmap_t& cmdmap,
+		std::string_view k, T& val)
+{
+  const auto entry = cmdmap.find(k);
+  if (entry == cmdmap.end()) {
+    return false;
+  }
+  // the pointer form of boost::get yields nullptr on a type mismatch
+  const T *held = boost::get<T>(&entry->second);
+  if (held == nullptr) {
+    throw bad_cmd_get(k, cmdmap);
+  }
+  val = *held;
+  return true;
+}
+
+// Variant of cmd_getval() returning the value instead of filling an
+// out-parameter: std::nullopt when `k` is absent, the value otherwise.
+// Exceptions from the three-argument cmd_getval() propagate.
+template <typename T>
+std::optional<T> cmd_getval(const cmdmap_t& cmdmap,
+			    std::string_view k)
+{
+  std::optional<T> result;
+  T value;
+  if (cmd_getval(cmdmap, k, value)) {
+    result = std::move(value);
+  }
+  return result;
+}
+
+// with default
+
+// Return the value stored under `k` as a T, or T(defval) when `k` is
+// absent. Throws bad_cmd_get when `k` is present but does not hold a T.
+template <typename T, typename V>
+T cmd_getval_or(const cmdmap_t& cmdmap, std::string_view k,
+		const V& defval)
+{
+  auto found = cmdmap.find(k);
+  if (found == cmdmap.end()) {
+    return T(defval);
+  }
+  try {
+    // reuse the iterator from the lookup above instead of searching
+    // the map a second time
+    return boost::get<T>(found->second);
+  } catch (boost::bad_get&) {
+    throw bad_cmd_get(k, cmdmap);
+  }
+}
+
+// Store `val` under key `k`, replacing any existing entry. `cct` is not
+// used by this function.
+template <typename T>
+void
+cmd_putval(CephContext *cct, cmdmap_t& cmdmap, std::string_view k, const T& val)
+{
+  std::string key{k};
+  cmdmap.insert_or_assign(std::move(key), val);
+}
+
+bool validate_cmd(const std::string& desc,
+		  const cmdmap_t& cmdmap,
+		  std::ostream& os);
+extern int parse_osd_id(const char *s, std::ostream *pss);
+extern long parse_pos_long(const char *s, std::ostream *pss = NULL);
+
+}
+#endif
diff --git a/src/common/code_environment.cc b/src/common/code_environment.cc
new file mode 100644
index 000000000..14d55f60c
--- /dev/null
+++ b/src/common/code_environment.cc
@@ -0,0 +1,126 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/code_environment.h"
+
+#include <iostream>
+
+#include "acconfig.h"
+
+#ifdef HAVE_PTHREAD_GETNAME_NP
+#include <pthread.h>
+#endif
+
+#include <string.h>
+
+// Process-wide code environment; starts out as "utility" and is overwritten
+// by common_preinit().
+code_environment_t g_code_env = CODE_ENVIRONMENT_UTILITY;
+
+// Map a code_environment_t value to its enumerator name.
+// Returns a pointer to a string literal, or NULL for a value that is not
+// one of the declared enumerators.
+extern "C" const char *code_environment_to_str(enum code_environment_t e)
+{
+  switch (e) {
+    case CODE_ENVIRONMENT_UTILITY:
+      return "CODE_ENVIRONMENT_UTILITY";
+    case CODE_ENVIRONMENT_DAEMON:
+      return "CODE_ENVIRONMENT_DAEMON";
+    case CODE_ENVIRONMENT_LIBRARY:
+      return "CODE_ENVIRONMENT_LIBRARY";
+    // this enumerator used to fall through to the NULL default
+    case CODE_ENVIRONMENT_UTILITY_NODOUT:
+      return "CODE_ENVIRONMENT_UTILITY_NODOUT";
+    default:
+      return NULL;
+  }
+}
+
+// Stream the symbolic name of a code environment.
+// code_environment_to_str() may return NULL; inserting a null const char*
+// into an ostream is undefined, so such values get a placeholder instead.
+std::ostream &operator<<(std::ostream &oss, const enum code_environment_t e)
+{
+  const char *name = code_environment_to_str(e);
+  if (name != nullptr) {
+    oss << name;
+  } else {
+    oss << "(unknown code environment " << static_cast<int>(e) << ")";
+  }
+  return oss;
+}
+
+#if defined(HAVE_PTHREAD_GETNAME_NP) && !defined(_WIN32)
+
+// Copy the calling thread's name, as reported by pthread_getname_np(), into
+// buf. Returns -ENAMETOOLONG when len is 16 or less, otherwise whatever
+// pthread_getname_np() returns (0 on success).
+int get_process_name(char *buf, int len)
+{
+  // The man page discourages using pthread_getname_np() with a buffer shorter
+  // than 16 bytes, and with exactly 16 bytes the result might not be
+  // null-terminated, so require at least 17.
+  constexpr int min_len = 17;
+  if (len < min_len) {
+    return -ENAMETOOLONG;
+  }
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(buf, 0, len);
+  const int rc = pthread_getname_np(pthread_self(), buf, len);
+  return rc;
+}
+
+#elif defined(HAVE_GETPROGNAME)
+
+// Copy the program name reported by getprogname() into buf, truncating it
+// to len - 1 characters and always null-terminating.
+// Returns 0 on success, -EINVAL for a non-positive len and -ENOSYS when no
+// (or an empty) program name is available.
+int get_process_name(char *buf, int len)
+{
+  if (len <= 0) {
+    return -EINVAL;
+  }
+
+  const char *name = getprogname();
+  const bool have_name = (name != nullptr) && (name[0] != '\0');
+  if (!have_name) {
+    return -ENOSYS;
+  }
+
+  const int last = len - 1;
+  strncpy(buf, name, last);
+  buf[last] = '\0';
+  return 0;
+}
+
+#elif defined(_WIN32)
+
+// Derive the process name from the executable path: the text after the last
+// backslash, up to the first ".exe" found after it.
+// Returns 0 on success, -EINVAL for a non-positive len, -ENAMETOOLONG when
+// buf cannot hold the name plus terminator, and -ENOSYS when the path cannot
+// be obtained or does not have the expected shape.
+int get_process_name(char *buf, int len)
+{
+  if (len <= 0) {
+    return -EINVAL;
+  }
+
+  char full_path[MAX_PATH];
+  int length = GetModuleFileNameA(nullptr, full_path, sizeof(full_path));
+  if (length <= 0) {
+    return -ENOSYS;
+  }
+
+  char* last_sep = strrchr(full_path, '\\');
+  if (last_sep == nullptr) {
+    return -ENOSYS;
+  }
+  char* name_begin = last_sep + 1;
+  char* name_end = strstr(name_begin, ".exe");
+  if (name_end == nullptr) {
+    return -ENOSYS;
+  }
+
+  const auto name_len = name_end - name_begin;
+  if (len <= name_len) {
+    return -ENAMETOOLONG;
+  }
+
+  memcpy(buf, name_begin, name_len);
+  buf[name_len] = '\0';
+  return 0;
+}
+
+#else
+
+// Fallback used when none of the platform-specific variants above is
+// selected: always reports -ENOSYS and leaves buf untouched.
+int get_process_name(char *buf, int len)
+{
+  return -ENOSYS;
+}
+
+#endif
+
+// std::string convenience wrapper around get_process_name(); yields
+// "(unknown)" whenever the lookup reports a non-zero status.
+std::string get_process_name_cpp()
+{
+  char name[32];
+  const int rc = get_process_name(name, sizeof(name));
+  if (rc != 0) {
+    return "(unknown)";
+  }
+  return std::string(name);
+}
diff --git a/src/common/code_environment.h b/src/common/code_environment.h
new file mode 100644
index 000000000..b94ba52ca
--- /dev/null
+++ b/src/common/code_environment.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_CODE_ENVIRONMENT_H
+#define CEPH_COMMON_CODE_ENVIRONMENT_H
+
+/* Kind of program the common code is running in; the current value is kept
+ * in g_code_env. */
+enum code_environment_t {
+  CODE_ENVIRONMENT_UTILITY = 0,
+  CODE_ENVIRONMENT_DAEMON = 1,
+  /* common_preinit() defaults stderr logging to off for LIBRARY and
+   * UTILITY_NODOUT */
+  CODE_ENVIRONMENT_LIBRARY = 2,
+  CODE_ENVIRONMENT_UTILITY_NODOUT = 3,
+};
+
+#ifdef __cplusplus
+#include <iosfwd>
+#include <string>
+
+// C++ view of the API: the C-callable entry points get C linkage, the
+// std::ostream / std::string helpers are C++ only.
+extern "C" code_environment_t g_code_env;
+extern "C" const char *code_environment_to_str(enum code_environment_t e);
+std::ostream &operator<<(std::ostream &oss, const enum code_environment_t e);
+// Writes the process name into buf (len bytes); 0 on success, otherwise a
+// non-zero error code.
+extern "C" int get_process_name(char *buf, int len);
+// Same lookup as get_process_name(), returned as a string; "(unknown)" on
+// failure.
+std::string get_process_name_cpp();
+
+#else
+
+/* Plain C view of the same three symbols declared in the C++ branch. */
+extern code_environment_t g_code_env;
+const char *code_environment_to_str(const enum code_environment_t e);
+extern int get_process_name(char *buf, int len);
+
+#endif
+
+#endif
diff --git a/src/common/cohort_lru.h b/src/common/cohort_lru.h
new file mode 100644
index 000000000..b105c80cc
--- /dev/null
+++ b/src/common/cohort_lru.h
@@ -0,0 +1,498 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copyright (C) 2015 CohortFS, LLC.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef COHORT_LRU_H
+#define COHORT_LRU_H
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/slist.hpp>
+
+#include "common/likely.h"
+
+/* CACHE_LINE_SIZE may be predefined by the build; 64 is the fallback. */
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64 /* XXX arch-specific define */
+#endif
+/* Declares a char array member named __pad<_n> that is CACHE_LINE_SIZE
+ * bytes long; used below as the trailing member of Lane and Partition. */
+#define CACHE_PAD(_n) char __pad ## _n [CACHE_LINE_SIZE]
+
+namespace cohort {
+
+  namespace lru {
+
+    namespace bi = boost::intrusive;
+
+    /* public flag values, passed through the `flags` arguments of LRU */
+    constexpr uint32_t FLAG_NONE = 0x0000;
+    /* on insert(): also take one reference for the caller;
+     * on ref(): count the call towards the periodic move to MRU */
+    constexpr uint32_t FLAG_INITIAL = 0x0001;
+    /* set by insert() in the caller's flags when an evicted object was
+     * recycled instead of allocating a new one */
+    constexpr uint32_t FLAG_RECYCLE = 0x0002;
+
+    /* which end of a lane queue insert() places a new object at:
+     * MRU pushes to the front, LRU pushes to the back */
+    enum class Edge : std::uint8_t
+    {
+      MRU = 0,
+      LRU
+    };
+
+    /* hook link mode shared by every intrusive hook in this header */
+    typedef bi::link_mode<bi::safe_link> link_mode;
+
+    /* needed by Object::reclaim() before the full definition below */
+    class ObjectFactory; // Forward declaration
+
+    /* Base class for objects managed by LRU (and stored in TreeX).
+     * All bookkeeping state is private; LRU and TreeX reach it as friends. */
+    class Object
+    {
+    private:
+      uint32_t lru_flags; /* LRU-internal FLAG_* bits */
+      std::atomic<uint32_t> lru_refcnt; /* reference count, incl. sentinel */
+      std::atomic<uint32_t> lru_adj; /* counts initial refs, see LRU::ref() */
+      bi::list_member_hook< link_mode > lru_hook; /* links into a lane queue */
+
+      /* doubly-linked lane queue, threaded through lru_hook */
+      typedef bi::list<Object,
+		       bi::member_hook<
+			 Object, bi::list_member_hook< link_mode >,
+			 &Object::lru_hook >,
+		       bi::constant_time_size<true>> Queue;
+
+      bi::slist_member_hook< link_mode > q2_hook; /* links into a Queue2 */
+
+      /* singly-linked scratch queue, used by TreeX::drain() */
+      typedef bi::slist<Object,
+			bi::member_hook<
+			  Object, bi::slist_member_hook< link_mode >,
+			  &Object::q2_hook >,
+			bi::constant_time_size<true>> Queue2;
+
+    public:
+
+      Object() : lru_flags(FLAG_NONE), lru_refcnt(0), lru_adj(0) {}
+
+      /* current reference count (a snapshot of the atomic counter) */
+      uint32_t get_refcnt() const { return lru_refcnt; }
+
+      /* Called by LRU::evict_block() without the lane lock held; return
+       * true to allow this object to be taken out of its lane and handed
+       * to the factory for recycling. */
+      virtual bool reclaim(const ObjectFactory* newobj_fac) = 0;
+
+      virtual ~Object() {}
+
+    private:
+      template <typename LK>
+      friend class LRU;
+
+      template <typename T, typename TTree, typename CLT, typename CEQ,
+	      typename K, typename LK>
+      friend class TreeX;
+    };
+
+    /* Allocator and recycler interface used by LRU::insert(): alloc()
+     * supplies a brand-new object, recycle() re-initializes one that was
+     * just evicted so it can be reused. */
+    class ObjectFactory
+    {
+    public:
+      virtual Object* alloc() = 0;
+      virtual void recycle(Object*) = 0;
+      virtual ~ObjectFactory() = default;
+    };
+
+    template <typename LK>
+    class LRU
+    {
+    private:
+
+      /* One independently locked queue; the front is the MRU end and the
+       * back is the LRU end. The trailing pad adds CACHE_LINE_SIZE bytes. */
+      struct Lane {
+	LK lock;
+	Object::Queue q;
+	// Object::Queue pinned; /* placeholder for possible expansion */
+	CACHE_PAD(0);
+	Lane() {}
+      };
+
+      Lane *qlane; /* array of n_lanes lanes, owned (new[]/delete[]) */
+      int n_lanes;
+      std::atomic<uint32_t> evict_lane; /* round-robin eviction cursor */
+      const uint32_t lane_hiwat; /* per-lane size limit checked in unref() */
+
+      /* every lru_adj_modulus-th initial ref() moves the object to MRU */
+      static constexpr uint32_t lru_adj_modulus = 5;
+
+      /* refcount of an object referenced only by its lane queue */
+      static constexpr uint32_t SENTINEL_REFCNT = 1;
+
+      /* internal flag values */
+      static constexpr uint32_t FLAG_INLRU = 0x0001;
+      static constexpr uint32_t FLAG_PINNED  = 0x0002; // possible future use
+      static constexpr uint32_t FLAG_EVICTING = 0x0004;
+
+      /* pick the lane for an object from its address */
+      Lane& lane_of(void* addr) {
+	const uint64_t key = (uint64_t)(addr);
+	const uint64_t ix = key % n_lanes;
+	return qlane[ix];
+      }
+
+      /* advance the shared eviction cursor and return the lane index the
+       * previous cursor value maps to */
+      uint32_t next_evict_lane() {
+	const uint32_t ticket = evict_lane++;
+	return ticket % n_lanes;
+      }
+
+      /* an object is an eviction candidate when only the sentinel
+       * reference is left and no eviction is already in progress */
+      bool can_reclaim(Object* o) {
+	if (o->lru_refcnt != SENTINEL_REFCNT)
+	  return false;
+	const bool evicting = (o->lru_flags & FLAG_EVICTING) != 0;
+	return !evicting;
+      }
+
+      /* Try to reclaim one object for reuse.
+       *
+       * Visits up to n_lanes lanes in round-robin order and looks at the
+       * object at the LRU end of each. A candidate is marked FLAG_EVICTING
+       * and given a temporary reference, then asked to reclaim() itself
+       * with the lane lock dropped.
+       *
+       * Returns the object, already removed from its lane, when reclaim()
+       * agrees; nullptr when no lane yielded one. */
+      Object* evict_block(const ObjectFactory* newobj_fac) {
+	uint32_t lane_ix = next_evict_lane();
+	for (int ix = 0; ix < n_lanes; ++ix,
+	       lane_ix = next_evict_lane()) {
+	  Lane& lane = qlane[lane_ix];
+	  std::unique_lock lane_lock{lane.lock};
+	  /* nothing to evict from an empty lane; back() on an empty
+	   * intrusive list does not refer to a real Object */
+	  if (lane.q.empty())
+	    continue;
+	  /* if object at LRU has refcnt==1, it may be reclaimable */
+	  Object* o = &(lane.q.back());
+	  if (can_reclaim(o)) {
+	    ++(o->lru_refcnt);
+	    o->lru_flags |= FLAG_EVICTING;
+	    lane_lock.unlock();
+	    if (o->reclaim(newobj_fac)) {
+	      lane_lock.lock();
+	      --(o->lru_refcnt);
+	      /* assertions that o state has not changed across
+	       * relock */
+	      ceph_assert(o->lru_refcnt == SENTINEL_REFCNT);
+	      ceph_assert(o->lru_flags & FLAG_INLRU);
+	      Object::Queue::iterator it =
+		Object::Queue::s_iterator_to(*o);
+	      lane.q.erase(it);
+	      return o;
+	    } else {
+	      /* reclaim refused: undo the temporary reference and the
+	       * marker; lane_lock was released above and stays released */
+	      --(o->lru_refcnt);
+	      o->lru_flags &= ~FLAG_EVICTING;
+	    }
+	  } /* can_reclaim(o) */
+	} /* each lane */
+	return nullptr;
+      } /* evict_block */
+
+    public:
+
+      /* lanes:  number of independently locked queues, must be > 0
+       * _hiwat: per-lane size limit, see the hiwat check in unref() */
+      LRU(int lanes, uint32_t _hiwat)
+	: n_lanes(lanes), evict_lane(0), lane_hiwat(_hiwat)
+	  {
+	    ceph_assert(n_lanes > 0);
+	    qlane = new Lane[n_lanes];
+	  }
+
+      /* releases the lane array; objects still queued are not deleted */
+      ~LRU() { delete[] qlane; }
+
+      /* qlane is a raw owning pointer: a copied LRU would delete[] the
+       * same array twice, so copying is disabled */
+      LRU(const LRU&) = delete;
+      LRU& operator=(const LRU&) = delete;
+
+      /* Take a reference on o. With FLAG_INITIAL set, every
+       * lru_adj_modulus-th call also moves o to the MRU end of its lane.
+       * Always returns true. */
+      bool ref(Object* o, uint32_t flags) {
+	++(o->lru_refcnt);
+	const bool initial = (flags & FLAG_INITIAL) != 0;
+	if (initial && ((++(o->lru_adj) % lru_adj_modulus) == 0)) {
+	  Lane& lane = lane_of(o);
+	  std::lock_guard<LK> guard(lane.lock);
+	  /* move to MRU */
+	  lane.q.erase(Object::Queue::s_iterator_to(*o));
+	  lane.q.push_front(*o);
+	}
+	return true;
+      } /* ref */
+
+      /* Drop one reference on o.
+       *
+       * - count reaches 0: o is removed from its lane and deleted.
+       * - count reaches SENTINEL_REFCNT: o is moved to the LRU end of its
+       *   lane, unless the lane is over lane_hiwat, in which case o is
+       *   left out of the queue and deleted.
+       * In both cases the count is re-read under the lane lock before
+       * acting. The delete happens after the lock is released. */
+      void unref(Object* o, uint32_t flags) {
+	uint32_t refcnt = --(o->lru_refcnt);
+	Object* tdo = nullptr;
+	if (unlikely(refcnt == 0)) {
+	  Lane& lane = lane_of(o);
+	  lane.lock.lock();
+	  /* re-check: the count may have changed before we got the lock */
+	  refcnt = o->lru_refcnt.load();
+	  if (unlikely(refcnt == 0)) {
+	    Object::Queue::iterator it =
+	      Object::Queue::s_iterator_to(*o);
+	    lane.q.erase(it);
+	    tdo = o;
+	  }
+	  lane.lock.unlock();
+	} else if (unlikely(refcnt == SENTINEL_REFCNT)) {
+	  Lane& lane = lane_of(o);
+	  lane.lock.lock();
+	  refcnt = o->lru_refcnt.load();
+	  if (likely(refcnt == SENTINEL_REFCNT)) {
+	    /* move to LRU */
+	    Object::Queue::iterator it =
+	      Object::Queue::s_iterator_to(*o);
+	    lane.q.erase(it);
+	    /* hiwat check */
+	    if (lane.q.size() > lane_hiwat) {
+	      tdo = o;
+	    } else {
+	      lane.q.push_back(*o);
+	    }
+	  }
+	  lane.lock.unlock();
+	}
+	/* unref out-of-line && !LOCKED */
+	if (tdo)
+	  delete tdo;
+      } /* unref */
+
+      /* Obtain an object from fac (recycled if one can be evicted, newly
+       * allocated otherwise), queue it at the requested edge of its lane
+       * and return it. FLAG_RECYCLE is set in flags when an evicted object
+       * was reused. The object gets the sentinel reference, plus one more
+       * for the caller when flags contains FLAG_INITIAL. */
+      Object* insert(ObjectFactory* fac, Edge edge, uint32_t& flags) {
+	Object* obj = evict_block(fac);
+	if (obj != nullptr) {
+	  fac->recycle(obj); /* recycle existing object */
+	  flags |= FLAG_RECYCLE;
+	} else {
+	  obj = fac->alloc(); /* get a new one */
+	}
+
+	obj->lru_flags = FLAG_INLRU;
+
+	Lane& lane = lane_of(obj);
+	std::lock_guard<LK> guard(lane.lock);
+	if (edge == Edge::MRU) {
+	  lane.q.push_front(*obj);
+	} else if (edge == Edge::LRU) {
+	  lane.q.push_back(*obj);
+	} else {
+	  ceph_abort();
+	}
+	/* sentinel ref, plus the initial ref when requested */
+	const uint32_t refs = (flags & FLAG_INITIAL) ? 2 : 1;
+	obj->lru_refcnt += refs;
+	return obj;
+      } /* insert */
+
+    };
+
+    template <typename T, typename TTree, typename CLT, typename CEQ,
+	      typename K, typename LK>
+    class TreeX
+    {
+    public:
+
+      /* flags understood by the lookup / update methods below */
+      static constexpr uint32_t FLAG_NONE = 0x0000;
+      static constexpr uint32_t FLAG_LOCK = 0x0001; /* take the partition lock */
+      static constexpr uint32_t FLAG_UNLOCK = 0x0002; /* release it before returning */
+      static constexpr uint32_t FLAG_UNLOCK_ON_MISS = 0x0004; /* not used in this class */
+
+      typedef T value_type;
+      typedef TTree container_type;
+      typedef typename TTree::iterator iterator;
+      typedef std::pair<iterator, bool> check_result;
+      typedef typename TTree::insert_commit_data insert_commit_data;
+      int n_part; /* number of partitions */
+      int csz; /* slots in each partition's lookup cache; 0 disables it */
+
+      typedef std::unique_lock<LK> unique_lock;
+
+      /* One independently locked shard: an intrusive tree plus an
+       * optional array of csz cached element pointers, indexed by
+       * hash key modulo csz. */
+      struct Partition {
+	LK lock;
+	TTree tr;
+	T** cache;
+	int csz;
+	CACHE_PAD(0);
+
+	Partition() : tr(), cache(nullptr), csz(0) {}
+
+	/* cache is allocated with ::operator new in TreeX's constructor */
+	~Partition() {
+	  if (csz)
+	    ::operator delete(cache);
+	}
+      };
+
+      /* State carried from find_latch() to insert_latched(): the chosen
+       * partition, its lock, and the tree's insert_unique_check() result. */
+      struct Latch {
+	Partition* p;
+	LK* lock;
+	insert_commit_data commit_data{};
+
+	Latch() : p(nullptr), lock(nullptr) {}
+      };
+
+      /* map a hash key to its partition (key modulo n_part) */
+      Partition& partition_of_scalar(uint64_t x) {
+	return part[x % n_part];
+      }
+
+      /* partition by index; x is not range-checked here */
+      Partition& get(uint8_t x) {
+	return part[x];
+      }
+
+      /* direct (mutable) access to the partition array pointer */
+      Partition*& get() {
+	return part;
+      }
+
+      /* take every partition lock, in partition order */
+      void lock() {
+	for (LK* lk : locks) {
+	  lk->lock();
+	}
+      }
+
+      /* release every partition lock, in partition order */
+      void unlock() {
+	for (LK* lk : locks) {
+	  lk->unlock();
+	}
+      }
+
+      /* n_part: number of partitions, must be > 0
+       * csz:    slots in each partition's lookup cache; 0 disables it */
+      TreeX(int n_part=1, int csz=127) : n_part(n_part), csz(csz) {
+	ceph_assert(n_part > 0);
+	part = new Partition[n_part];
+	const bool want_cache = (csz != 0);
+	for (Partition* p = part; p != part + n_part; ++p) {
+	  if (want_cache) {
+	    const size_t nbytes = csz * sizeof(T*);
+	    p->csz = csz;
+	    p->cache = (T**) ::operator new(nbytes);
+	    // FIPS zeroization audit 20191115: this memset is not security related.
+	    memset(p->cache, 0, nbytes);
+	  }
+	  /* remembered so lock()/unlock() can sweep all partitions */
+	  locks.push_back(&p->lock);
+	}
+      }
+
+      /* releases the partition array (each Partition frees its cache) */
+      ~TreeX() {
+	delete[] part;
+      }
+
+      /* part is a raw owning pointer and locks points into it: a copied
+       * TreeX would delete[] the same array twice, so copying is disabled */
+      TreeX(const TreeX&) = delete;
+      TreeX& operator=(const TreeX&) = delete;
+
+      /* Look up k in the partition selected by hash key hk.
+       *
+       * The partition's cache slot (hk modulo csz) is tried first; on a
+       * miss the tree is searched and a hit refills that slot.
+       * With FLAG_LOCK the partition lock is held for the duration of the
+       * call and always released before returning.
+       * Returns the element, or nullptr when k is not present. */
+      T* find(uint64_t hk, const K& k, uint32_t flags) {
+	T* v;
+	Latch lat;
+	uint32_t slot = 0;
+	lat.p = &(partition_of_scalar(hk));
+	if (flags & FLAG_LOCK) {
+	  lat.lock = &lat.p->lock;
+	  lat.lock->lock();
+	}
+	if (csz) { /* template specialize? */
+	  slot = hk % csz;
+	  v = lat.p->cache[slot];
+	  if (v) {
+	    if (CEQ()(*v, k)) {
+	      if (flags & FLAG_LOCK)
+		lat.lock->unlock();
+	      return v;
+	    }
+	    /* slot holds a different key: fall through to the tree */
+	    v = nullptr;
+	  }
+	} else {
+	  v = nullptr;
+	}
+	iterator it = lat.p->tr.find(k, CLT());
+	if (it != lat.p->tr.end()){
+	  v = &(*(it));
+	  if (csz) {
+	    /* fill cache slot at hk */
+	    lat.p->cache[slot] = v;
+	  }
+	}
+	if (flags & FLAG_LOCK)
+	  lat.lock->unlock();
+	return v;
+      } /* find */
+
+      /* Like find(), but records the partition and its lock in lat and
+       * uses insert_unique_check(), so that on a miss lat.commit_data can
+       * be passed to insert_latched().
+       *
+       * With FLAG_LOCK the partition lock is taken; it is released before
+       * returning only when FLAG_UNLOCK is also set, otherwise the caller
+       * still holds it on return (hit or miss).
+       * Returns the element, or nullptr when k is not present. */
+      T* find_latch(uint64_t hk, const K& k, Latch& lat,
+		    uint32_t flags) {
+	uint32_t slot = 0;
+	T* v;
+	lat.p = &(partition_of_scalar(hk));
+	lat.lock = &lat.p->lock;
+	if (flags & FLAG_LOCK)
+	  lat.lock->lock();
+	if (csz) { /* template specialize? */
+	  slot = hk % csz;
+	  v = lat.p->cache[slot];
+	  if (v) {
+	    if (CEQ()(*v, k)) {
+	      if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK))
+		lat.lock->unlock();
+	      return v;
+	    }
+	    /* slot holds a different key: fall through to the tree */
+	    v = nullptr;
+	  }
+	} else {
+	  v = nullptr;
+	}
+	check_result r = lat.p->tr.insert_unique_check(
+	  k, CLT(), lat.commit_data);
+	if (! r.second /* !insertable (i.e., !found) */) {
+	  v = &(*(r.first));
+	  if (csz) {
+	    /* fill cache slot at hk */
+	    lat.p->cache[slot] = v;
+	  }
+	}
+	if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK))
+	  lat.lock->unlock();
+	return v;
+      } /* find_latch */
+      /* true when both hash keys select the same partition */
+      bool is_same_partition(uint64_t lhs, uint64_t rhs) {
+        const uint64_t lhs_ix = lhs % n_part;
+        const uint64_t rhs_ix = rhs % n_part;
+        return lhs_ix == rhs_ix;
+      }
+      /* Complete an insert prepared by find_latch(): commits v into the
+       * latched partition's tree using lat.commit_data and, with
+       * FLAG_UNLOCK, releases the latched lock. */
+      void insert_latched(T* v, Latch& lat, uint32_t flags) {
+	(void) lat.p->tr.insert_unique_commit(*v, lat.commit_data);
+	if (flags & FLAG_UNLOCK)
+	  lat.lock->unlock();
+      } /* insert_latched */
+
+      /* Insert v into the partition selected by hk; with FLAG_LOCK the
+       * partition lock is held around the tree update. */
+      void insert(uint64_t hk, T* v, uint32_t flags) {
+	Partition& target = partition_of_scalar(hk);
+	const bool locked = (flags & FLAG_LOCK) != 0;
+	if (locked) {
+	  target.lock.lock();
+	}
+	target.tr.insert_unique(*v);
+	if (locked) {
+	  target.lock.unlock();
+	}
+      } /* insert */
+
+      /* Remove v from the partition selected by hk and clear the cache
+       * slot for hk if it points at v; with FLAG_LOCK the partition lock
+       * is held around the update. */
+      void remove(uint64_t hk, T* v, uint32_t flags) {
+	Partition& target = partition_of_scalar(hk);
+	iterator pos = TTree::s_iterator_to(*v);
+	const bool locked = (flags & FLAG_LOCK) != 0;
+	if (locked) {
+	  target.lock.lock();
+	}
+	target.tr.erase(pos);
+	if (csz) { /* template specialize? */
+	  uint32_t slot = hk % csz;
+	  /* we are intrusive, just compare addresses */
+	  if (target.cache[slot] == v) {
+	    target.cache[slot] = nullptr;
+	  }
+	}
+	if (locked) {
+	  target.lock.unlock();
+	}
+      } /* remove */
+
+      /* Empty every partition's tree and call uref on each element.
+       *
+       * Elements are first moved onto a local queue (per partition, under
+       * its lock when FLAG_LOCK is given); uref runs afterwards with no
+       * partition lock held by this function.
+       * NOTE(review): the per-partition cache slots are not cleared here
+       * -- confirm callers do not use find() after draining. */
+      void drain(std::function<void(T*)> uref,
+		 uint32_t flags = FLAG_NONE) {
+	/* clear a table, call supplied function on
+	 * each element found (e.g., returns sentinel
+	 * references) */
+	Object::Queue2 drain_q;
+	for (int t_ix = 0; t_ix < n_part; ++t_ix) {
+	  Partition& p = part[t_ix];
+	  if (flags & FLAG_LOCK) /* LOCKED */
+	    p.lock.lock();
+	  while (p.tr.size() > 0) {
+	    iterator it = p.tr.begin();
+	    T* v = &(*it);
+	    p.tr.erase(it);
+	    drain_q.push_front(*v);
+	  }
+	  if (flags & FLAG_LOCK) /* we locked it, !LOCKED */
+	    p.lock.unlock();
+	} /* each partition */
+	/* unref out-of-line && !LOCKED */
+	while (drain_q.size() > 0) {
+	  Object::Queue2::iterator it = drain_q.begin();
+	  T* v = static_cast<T*>(&(*it));
+	  drain_q.erase(it); /* must precede uref(v) in safe_link mode */
+	  uref(v);
+	}
+      } /* drain */
+
+    private:
+      Partition *part; /* array of n_part partitions, owned */
+      std::vector<LK*> locks; /* one entry per partition, in index order */
+    };
+
+  } /* namespace LRU */
+} /* namespace cohort */
+
+#endif /* COHORT_LRU_H */
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
new file mode 100644
index 000000000..f3ca03261
--- /dev/null
+++ b/src/common/common_init.cc
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010-2011 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/common_init.h"
+#include "common/admin_socket.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "common/dout.h"
+#include "common/hostname.h"
+#include "common/strtol.h"
+#include "common/valgrind.h"
+#include "common/zipkin_trace.h"
+
+#define dout_subsys ceph_subsys_
+
+#ifndef WITH_SEASTAR
+// Create a CephContext and seed its configuration with defaults that depend
+// on the entity name, the init flags and the code environment.
+//
+// iparams:  supplies module_type, name and no_config_file
+// code_env: also stored in the global g_code_env
+// flags:    CINIT_FLAG_* bits
+//
+// Returns the newly allocated context (created with `new`).
+CephContext *common_preinit(const CephInitParameters &iparams,
+			    enum code_environment_t code_env, int flags)
+{
+  // set code environment
+  ANNOTATE_BENIGN_RACE_SIZED(&g_code_env, sizeof(g_code_env), "g_code_env");
+  g_code_env = code_env;
+
+  // Create a configuration object
+  CephContext *cct = new CephContext(iparams.module_type, code_env, flags);
+
+  auto& conf = cct->_conf;
+  // add config observers here
+
+  // Set up our entity name.
+  conf->name = iparams.name;
+
+  // different default keyring locations for osd and mds.  this is
+  // for backward compatibility.  moving forward, we want all keyrings
+  // in these locations.  the mon already forces $mon_data/keyring.
+  if (conf->name.is_mds()) {
+    conf.set_val_default("keyring", "$mds_data/keyring");
+  } else if (conf->name.is_osd()) {
+    conf.set_val_default("keyring", "$osd_data/keyring");
+  }
+
+  if ((flags & CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS)) {
+    // make this unique despite multiple instances by the same name.
+    conf.set_val_default("admin_socket",
+			  "$run_dir/$cluster-$name.$pid.$cctid.asok");
+  }
+
+  // libraries and "nodout" utilities default to not logging on stderr
+  if (code_env == CODE_ENVIRONMENT_LIBRARY ||
+      code_env == CODE_ENVIRONMENT_UTILITY_NODOUT) {
+    conf.set_val_default("log_to_stderr", "false");
+    conf.set_val_default("err_to_stderr", "false");
+    conf.set_val_default("log_flush_on_exit", "false");
+  }
+
+  conf.set_val("no_config_file", iparams.no_config_file ? "true" : "false");
+
+  // fall back to the short hostname when no host is configured
+  if (conf->host.empty()) {
+    conf.set_val("host", ceph_get_short_hostname());
+  }
+  return cct;
+}
+#endif	// #ifndef WITH_SEASTAR
+
+void complain_about_parse_error(CephContext *cct,
+				const std::string& parse_error)
+{
+  if (parse_error.empty())
+    return;
+  lderr(cct) << "Errors while parsing config file!" << dendl;
+  lderr(cct) << parse_error << dendl;
+}
+
+#ifndef WITH_SEASTAR
+
+/* Finish initializing a context: crypto, tracing, the log, optionally the
+ * service thread, and ownership / permissions of the admin socket.
+ *
+ * Please be sure that this can safely be called multiple times by the
+ * same application; only the first call per cct does any work. */
+void common_init_finish(CephContext *cct)
+{
+  // only do this once per cct
+  if (cct->_finished) {
+    return;
+  }
+  cct->_finished = true;
+  cct->init_crypto();
+  ZTracer::ztrace_init();
+
+  if (!cct->_log->is_started()) {
+    cct->_log->start();
+  }
+
+  const int init_flags = cct->get_init_flags();
+  if ((init_flags & CINIT_FLAG_NO_DAEMON_ACTIONS) == 0) {
+    cct->start_service_thread();
+  }
+
+  // with deferred privilege drop, hand the socket to the target uid/gid now
+  const bool deferred_drop =
+    (init_flags & CINIT_FLAG_DEFER_DROP_PRIVILEGES) != 0;
+  if (deferred_drop && (cct->get_set_uid() || cct->get_set_gid())) {
+    cct->get_admin_socket()->chown(cct->get_set_uid(), cct->get_set_gid());
+  }
+
+  const auto& conf = cct->_conf;
+  if (conf->admin_socket.empty() || conf->admin_socket_mode.empty()) {
+    return;
+  }
+
+  // admin_socket_mode is an octal permission string
+  std::string err;
+  const int mode = strict_strtol(conf->admin_socket_mode.c_str(), 8, &err);
+  if (!err.empty()) {
+    lderr(cct) << "Invalid octal string: " << err << dendl;
+    return;
+  }
+  if (mode & (~ACCESSPERMS)) {
+    lderr(cct) << "Invalid octal permissions string: "
+        << conf->admin_socket_mode << dendl;
+    return;
+  }
+  cct->get_admin_socket()->chmod(static_cast<mode_t>(mode));
+}
+
+#endif	// #ifndef WITH_SEASTAR
diff --git a/src/common/common_init.h b/src/common/common_init.h
new file mode 100644
index 000000000..b9e141739
--- /dev/null
+++ b/src/common/common_init.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_INIT_H
+#define CEPH_COMMON_INIT_H
+
+#include <deque>
+
+#include "include/common_fwd.h"
+#include "common/code_environment.h"
+
+// Bit flags for the `flags` argument of common_preinit(); combine with `|`.
+enum common_init_flags_t {
+  // Set up defaults that make sense for an unprivileged daemon
+  // (common_preinit() gives admin_socket a per-pid, per-context default)
+  CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS = 0x1,
+
+  // By default, don't read a configuration file OR contact mons
+  CINIT_FLAG_NO_DEFAULT_CONFIG_FILE = 0x2,
+
+  // Don't close stderr (in daemonize)
+  CINIT_FLAG_NO_CLOSE_STDERR = 0x4,
+
+  // don't do anything daemonish, like create /var/run/ceph, or print a banner
+  // (common_init_finish() also skips starting the service thread)
+  CINIT_FLAG_NO_DAEMON_ACTIONS = 0x8,
+
+  // don't drop privileges
+  // (common_init_finish() chowns the admin socket to the configured uid/gid)
+  CINIT_FLAG_DEFER_DROP_PRIVILEGES = 0x10,
+
+  // don't contact mons for config
+  CINIT_FLAG_NO_MON_CONFIG = 0x20,
+
+  // don't expose default cct perf counters
+  CINIT_FLAG_NO_CCT_PERF_COUNTERS = 0x40,
+};
+
+#ifndef WITH_SEASTAR
+class CephInitParameters;
+
+/*
+ * NOTE: If you are writing a Ceph daemon, ignore this function and call
+ * global_init instead. It will call common_preinit for you.
+ *
+ * common_preinit creates the CephContext.
+ *
+ * After this function gives you a CephContext, you need to set up the
+ * Ceph configuration, which lives inside the CephContext as md_config_t.
+ * The initial settings are not very useful because they do not reflect what
+ * the user asked for.
+ *
+ * This is usually done by something like this:
+ * cct->_conf.parse_env();
+ * cct->_conf.apply_changes();
+ *
+ * Your library may also supply functions to read a configuration file.
+ *
+ * iparams:  entity name, module type and the no_config_file setting
+ * code_env: kind of program; also recorded in g_code_env
+ * flags:    bitwise OR of common_init_flags_t values
+ */
+CephContext *common_preinit(const CephInitParameters &iparams,
+			    enum code_environment_t code_env, int flags);
+#endif // #ifndef WITH_SEASTAR
+
+/* Log a config-file parse error through cct's error log.
+ * Does nothing when parse_error is empty. */
+void complain_about_parse_error(CephContext *cct,
+				const std::string& parse_error);
+
+/* This function is called after you have done your last
+ * fork. When you make this call, the system will initialize everything that
+ * cannot be initialized before a fork.
+ *
+ * This includes things like starting threads, initializing libraries that
+ * can't handle forking, and so forth.
+ *
+ * If you are writing a Ceph library, you can call this pretty much any time.
+ * We do not allow our library users to fork and continue using the Ceph
+ * libraries. The most obvious reason for this is that the threads started by
+ * the Ceph libraries would be destroyed by a fork().
+ *
+ * Calling it again for the same cct is a no-op.
+ */
+void common_init_finish(CephContext *cct);
+
+#endif
diff --git a/src/common/compat.cc b/src/common/compat.cc
new file mode 100644
index 000000000..82b57ad94
--- /dev/null
+++ b/src/common/compat.cc
@@ -0,0 +1,567 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <cstdio>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "acconfig.h"
+#ifdef HAVE_MEMSET_S
+# define __STDC_WANT_LIB_EXT1__ 1
+#endif
+#include <string.h>
+#include <thread>
+#ifndef _WIN32
+#include <sys/mount.h>
+#else
+#include <stdlib.h>
+#endif
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#if defined(__linux__) 
+#include <sys/vfs.h>
+#endif
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+#include "common/safe_io.h"
+
+// The type-value for a ZFS FS in fstatfs.
+#define FS_ZFS_TYPE 0xde
+
+// On FreeBSD, ZFS fallocate always fails since it is considered impossible to
+// reserve space on a COW filesystem. posix_fallocate() returns EINVAL
+// Linux in this case already emulates the reservation in glibc
+// In which case it is allocated manually, and still that is not a real guarantee
+// that a full buffer is allocated on disk, since it could be compressed.
+// To prevent this the written buffer needs to be loaded with random data.
+//
+// Seeks to `offset` and writes `len` bytes of filler in chunks of at most
+// 128 KiB. Returns 0 on success, otherwise an errno value (> 0).
+int manual_fallocate(int fd, off_t offset, off_t len) {
+  // lseek() returns an off_t; compare it as such so that a large resulting
+  // offset is not truncated into an int and mistaken for the -1 error value
+  if (lseek(fd, offset, SEEK_SET) == static_cast<off_t>(-1))
+    return errno;
+  char data[1024*128];
+  // TODO: compressing filesystems would require random data
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(data, 0x42, sizeof(data));
+  const off_t chunk = static_cast<off_t>(sizeof(data));
+  for (off_t off = 0; off < len; off += chunk) {
+    // the final chunk may be shorter than the buffer
+    const off_t left = len - off;
+    const int r = safe_write(fd, data, left < chunk ? left : chunk);
+    // NOTE(review): this assumes safe_write() reports failure as -1 with
+    // errno set -- confirm against common/safe_io.h
+    if (r == -1) {
+      return errno;
+    }
+  }
+  return 0;
+}
+
+// Report whether the filesystem behind basedir_fd identifies itself as ZFS.
+// Returns 1 for ZFS and 0 otherwise, including when the filesystem cannot
+// be queried and always on Windows.
+int on_zfs(int basedir_fd) {
+  #ifndef _WIN32
+  struct statfs basefs;
+  // on failure basefs is not filled in, so its f_type must not be read
+  if (fstatfs(basedir_fd, &basefs) != 0) {
+    return 0;
+  }
+  return (basefs.f_type == FS_ZFS_TYPE);
+  #else
+  return 0;
+  #endif
+}
+
+// Reserve `len` bytes at `offset` in fd using whatever the platform offers:
+// posix_fallocate() (or the manual writer on ZFS), F_PREALLOCATE on macOS,
+// or the manual writer everywhere else.
+// Return 0 if oke, otherwise errno > 0
+int ceph_posix_fallocate(int fd, off_t offset, off_t len) {
+#ifdef HAVE_POSIX_FALLOCATE
+  return on_zfs(fd) ? manual_fallocate(fd, offset, len)
+		    : posix_fallocate(fd, offset, len);
+#elif defined(__APPLE__)
+  fstore_t store;
+  store.fst_flags = F_ALLOCATECONTIG;
+  store.fst_posmode = F_PEOFPOSMODE;
+  store.fst_offset = offset;
+  store.fst_length = len;
+
+  const int ret = fcntl(fd, F_PREALLOCATE, &store);
+  return (ret == -1) ? errno : ret;
+#else
+  return manual_fallocate(fd, offset, len);
+#endif
+}
+
+// Create a pipe whose two descriptors are close-on-exec.
+// Returns 0 on success, -1 with errno set on failure (both descriptors are
+// closed again if marking them fails).
+int pipe_cloexec(int pipefd[2], int flags)
+{
+#if defined(HAVE_PIPE2)
+  return pipe2(pipefd, O_CLOEXEC | flags);
+#else
+  if (pipe(pipefd) == -1)
+    return -1;
+
+  #ifndef _WIN32
+  /*
+   * The old-fashioned, race-condition prone way that we have to fall
+   * back on if pipe2 does not exist.
+   */
+  if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) < 0 ||
+      fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) < 0) {
+    // close() may clobber errno; report the fcntl failure
+    const int saved = errno;
+    VOID_TEMP_FAILURE_RETRY(close(pipefd[0]));
+    VOID_TEMP_FAILURE_RETRY(close(pipefd[1]));
+    errno = saved;
+    return -1;
+  }
+  #endif
+
+  return 0;
+#endif
+}
+
+
+// socket() with the close-on-exec flag set on the new descriptor.
+// Returns the descriptor, or -1 with errno set.
+int socket_cloexec(int domain, int type, int protocol)
+{
+#ifdef SOCK_CLOEXEC
+  return socket(domain, type|SOCK_CLOEXEC, protocol);
+#else
+  const int sock = socket(domain, type, protocol);
+  if (sock == -1)
+    return -1;
+
+  #ifndef _WIN32
+  if (fcntl(sock, F_SETFD, FD_CLOEXEC) < 0) {
+    // close() may clobber errno; report the fcntl failure
+    const int saved = errno;
+    VOID_TEMP_FAILURE_RETRY(close(sock));
+    errno = saved;
+    return -1;
+  }
+  #endif
+
+  return sock;
+#endif
+}
+
+// socketpair() with the close-on-exec flag set on both descriptors.
+// Returns 0 on success, -1 with errno set on failure; the Windows branch is
+// unimplemented and returns -ENOTSUP.
+int socketpair_cloexec(int domain, int type, int protocol, int sv[2])
+{
+#ifdef SOCK_CLOEXEC
+  return socketpair(domain, type|SOCK_CLOEXEC, protocol, sv);
+#elif _WIN32
+  /* TODO */
+  return -ENOTSUP;
+#else
+  if (socketpair(domain, type, protocol, sv) == -1)
+    return -1;
+
+  #ifndef _WIN32
+  if (fcntl(sv[0], F_SETFD, FD_CLOEXEC) < 0 ||
+      fcntl(sv[1], F_SETFD, FD_CLOEXEC) < 0) {
+    // close() may clobber errno; report the fcntl failure
+    const int saved = errno;
+    VOID_TEMP_FAILURE_RETRY(close(sv[0]));
+    VOID_TEMP_FAILURE_RETRY(close(sv[1]));
+    errno = saved;
+    return -1;
+  }
+  #endif
+
+  return 0;
+#endif
+}
+
+// accept() with the close-on-exec flag set on the accepted descriptor.
+// Returns the descriptor, or -1 with errno set.
+int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen)
+{
+#ifdef HAVE_ACCEPT4
+  return accept4(sockfd, addr, addrlen, SOCK_CLOEXEC);
+#else
+  const int conn = accept(sockfd, addr, addrlen);
+  if (conn == -1)
+    return -1;
+
+  #ifndef _WIN32
+  if (fcntl(conn, F_SETFD, FD_CLOEXEC) < 0) {
+    // close() may clobber errno; report the fcntl failure
+    const int saved = errno;
+    VOID_TEMP_FAILURE_RETRY(close(conn));
+    errno = saved;
+    return -1;
+  }
+  #endif
+
+  return conn;
+#endif
+}
+
+#if defined(__FreeBSD__)
+// FreeBSD stand-in: accepts the arguments, changes nothing and reports
+// success.
+int sched_setaffinity(pid_t pid, size_t cpusetsize,
+                      cpu_set_t *mask)
+{
+  return 0;
+}
+#endif
+
+// Portable strerror_r(): always hands back a char* to the message,
+// whichever flavour of strerror_r the platform provides.
+// The result is buf, except with the char*-returning strerror_r, whose own
+// return value is passed through.
+char *ceph_strerror_r(int errnum, char *buf, size_t buflen)
+{
+#ifdef _WIN32
+  strerror_s(buf, buflen, errnum);
+  return buf;
+#elif defined(STRERROR_R_CHAR_P)
+  return strerror_r(errnum, buf, buflen);
+#else
+  // int-returning flavour: non-zero means no message was produced
+  const bool failed = strerror_r(errnum, buf, buflen) != 0;
+  if (failed) {
+    snprintf(buf, buflen, "Unknown error %d", errnum);
+  }
+  return buf;
+#endif
+}
+
+// Zero `count` bytes at `dest` in a way the compiler may not optimize away.
+//
+// dest:   buffer to clear
+// destsz: size of the buffer
+// count:  number of bytes to clear
+//
+// Returns 0 on success. When count exceeds destsz the fallback paths clear
+// only destsz bytes and return EOVERFLOW instead of writing past the
+// buffer; with memset_s its own status is returned.
+int ceph_memzero_s(void *dest, size_t destsz, size_t count) {
+#ifdef HAVE_MEMSET_S
+    return memset_s(dest, destsz, 0, count);
+#else
+    int rc = 0;
+    if (count > destsz) {
+      // never write beyond the destination; report the bad request
+      count = destsz;
+      rc = EOVERFLOW;
+    }
+#if defined(_WIN32)
+    SecureZeroMemory(dest, count);
+#else
+    explicit_bzero(dest, count);
+#endif
+    return rc;
+#endif
+}
+
+#ifdef _WIN32
+
+#include <iomanip>
+#include <ctime>
+
+// chown is not available on Windows. Plus, changing file owners is not
+// a common practice on Windows.
+// All three variants below ignore their arguments and report success.
+int chown(const char *path, uid_t owner, gid_t group) {
+  return 0;
+}
+
+int fchown(int fd, uid_t owner, gid_t group) {
+  return 0;
+}
+
+int lchown(const char *path, uid_t owner, gid_t group) {
+  return 0;
+}
+
+// posix_memalign() replacement built on _aligned_malloc().
+// Returns 0 on success, otherwise the current errno value.
+// NOTE(review): the memory presumably has to be released with
+// _aligned_free() rather than free() -- confirm how callers free it.
+int posix_memalign(void **memptr, size_t alignment, size_t size) {
+  *memptr = _aligned_malloc(size, alignment);
+  return *memptr ? 0 : errno;
+}
+
+// strptime() replacement built on std::get_time().
+// Parses s according to format into *tm, using the current C locale.
+// Returns a pointer to the first character of s that was not consumed, or
+// nullptr when parsing fails.
+char *strptime(const char *s, const char *format, struct tm *tm) {
+  std::istringstream input(s);
+  input.imbue(std::locale(setlocale(LC_ALL, nullptr)));
+  input >> std::get_time(tm, format);
+  if (input.fail()) {
+    return nullptr;
+  }
+  // once the parse has hit end-of-input, tellg() reports -1 rather than a
+  // position; in that case everything was consumed
+  if (input.eof()) {
+    return (char*)(s + strlen(s));
+  }
+  return (char*)(s + input.tellg());
+}
+
+// pipe() replacement built on _pipe(); the descriptors are created with
+// O_NOINHERIT.
+int pipe(int pipefd[2]) {
+  // We'll use the same pipe size as Linux (64kb).
+  return _pipe(pipefd, 0x10000, O_NOINHERIT);
+}
+
+// lrand48 is not available on Windows. We'll generate a pseudo-random
+// value in the 0 - 2^31 range by calling rand twice.
+// The first result forms the high part (shifted left by 16 bits), the
+// second is added as the low part.
+// NOTE(review): if RAND_MAX is 0x7fff, bit 15 of the result is never set --
+// confirm whether callers depend on a uniform distribution.
+long int lrand48(void) {
+  long int val;
+  val = (long int) rand();
+  val <<= 16;
+  val += (long int) rand();
+  return val;
+}
+
+// random() replacement: simply forwards to rand().
+int random() {
+  return rand();
+}
+
+// fsync() replacement: flushes the OS handle behind fd with
+// FlushFileBuffers(). Returns 0 on success, -1 on failure; errno is not
+// set by this function.
+int fsync(int fd) {
+  HANDLE handle = (HANDLE*)_get_osfhandle(fd);
+  if (handle == INVALID_HANDLE_VALUE)
+    return -1;
+  if (!FlushFileBuffers(handle))
+    return -1;
+  return 0;
+}
+
+// pwrite() replacement: writes count bytes at the given file offset through
+// WriteFile(), passing the offset in an OVERLAPPED structure.
+// Returns the number of bytes written, or -1 on failure; errno is not set
+// by this function.
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) {
+  DWORD bytes_written = 0;
+
+  HANDLE handle = (HANDLE*)_get_osfhandle(fd);
+  if (handle == INVALID_HANDLE_VALUE)
+    return -1;
+
+  // split the 64-bit offset into the two 32-bit OVERLAPPED fields
+  OVERLAPPED overlapped = { 0 };
+  ULARGE_INTEGER offsetUnion;
+  offsetUnion.QuadPart = offset;
+
+  overlapped.Offset = offsetUnion.LowPart;
+  overlapped.OffsetHigh = offsetUnion.HighPart;
+
+  if (!WriteFile(handle, buf, count, &bytes_written, &overlapped))
+    // we may consider mapping error codes, although that may
+    // not be exhaustive.
+    return -1;
+
+  return bytes_written;
+}
+
+// Windows replacement for pread(): reads up to `count` bytes into `buf`
+// from the file behind `fd` at byte offset `offset`, passing the offset to
+// ReadFile() through an OVERLAPPED structure.
+// Returns the number of bytes read. A ReadFile() failure with
+// ERROR_HANDLE_EOF is not treated as an error; any other failure, or an
+// invalid handle, yields -1.
+ssize_t pread(int fd, void *buf, size_t count, off_t offset) {
+  HANDLE file = (HANDLE)_get_osfhandle(fd);
+  if (file == INVALID_HANDLE_VALUE) {
+    return -1;
+  }
+
+  // Split the 64 bit offset into the two 32 bit halves OVERLAPPED expects.
+  ULARGE_INTEGER position;
+  position.QuadPart = offset;
+
+  OVERLAPPED request = {};
+  request.Offset = position.LowPart;
+  request.OffsetHigh = position.HighPart;
+
+  DWORD received = 0;
+  if (!ReadFile(file, buf, count, &received, &request) &&
+      GetLastError() != ERROR_HANDLE_EOF) {
+    return -1;
+  }
+
+  return received;
+}
+
+// Scatter read emulation: fills the `iov_cnt` buffers described by `iov`
+// one after another with ::read(). No file offset is taken, so reading
+// starts at the descriptor's current position.
+// Stops at the first short read. Returns the total number of bytes read,
+// or the negative ::read() result as soon as one call fails.
+ssize_t preadv(int fd, const struct iovec *iov, int iov_cnt) {
+  ssize_t total = 0;
+  int idx = 0;
+
+  while (idx < iov_cnt) {
+    const struct iovec &vec = iov[idx++];
+    int got = ::read(fd, vec.iov_base, vec.iov_len);
+    if (got < 0) {
+      return got;
+    }
+    total += got;
+    if (got < vec.iov_len) {
+      // short read: nothing more to fetch right now
+      break;
+    }
+  }
+
+  return total;
+}
+
+// Gather write emulation: writes the `iov_cnt` buffers described by `iov`
+// one after another with ::write().
+// Stops at the first short write. Returns the total number of bytes
+// written, or the negative ::write() result as soon as one call fails.
+ssize_t writev(int fd, const struct iovec *iov, int iov_cnt) {
+  ssize_t total = 0;
+  int idx = 0;
+
+  while (idx < iov_cnt) {
+    const struct iovec &vec = iov[idx++];
+    int sent = ::write(fd, vec.iov_base, vec.iov_len);
+    if (sent < 0) {
+      return sent;
+    }
+    total += sent;
+    if (sent < vec.iov_len) {
+      // short write: leave the remainder to the caller
+      break;
+    }
+  }
+
+  return total;
+}
+
+// Increments a function-local thread-local counter and returns a reference
+// to it. apply_tls_workaround() calls this to make the runtime use a TLS
+// slot.
+int &alloc_tls() {
+  static __thread int tlsvar;
+  tlsvar++;
+  return tlsvar;
+}
+
+// Reorders TLS slot allocation at startup: a pthread key is created first,
+// then a thread-local variable is touched, and finally the key is deleted
+// again. The order of these three steps is the whole point of the function.
+void apply_tls_workaround() {
+  // Workaround for the following Mingw bugs:
+  // https://sourceforge.net/p/mingw-w64/bugs/727/
+  // https://sourceforge.net/p/mingw-w64/bugs/527/
+  // https://sourceforge.net/p/mingw-w64/bugs/445/
+  // https://gcc.gnu.org/bugzilla/attachment.cgi?id=41382
+  pthread_key_t key;
+  pthread_key_create(&key, nullptr);
+  // Use a TLS slot for emutls
+  alloc_tls();
+  // Free up a slot that can now be used for c++ destructors
+  pthread_key_delete(key);
+}
+
+// Process-wide Windows initialisation: applies the MinGW TLS workaround
+// (MinGW builds only) and initialises Winsock 2.2. If Winsock cannot be
+// initialised the error is printed to stderr and the process exits with
+// the WSAStartup() error code.
+CEPH_CONSTRUCTOR(ceph_windows_init) {
+  // This will run at startup time before invoking main().
+  WSADATA wsaData;
+
+  #ifdef __MINGW32__
+  apply_tls_workaround();
+  #endif
+
+  // WSAStartup() hands back its error code as the return value.
+  // WSAGetLastError() must not be consulted here: Winsock is not
+  // initialised when WSAStartup() fails, so it would not describe the
+  // failure.
+  const int error = WSAStartup(MAKEWORD(2, 2), &wsaData);
+  if (error != 0) {
+    fprintf(stderr, "WSAStartup failed: %d\n", error);
+    exit(error);
+  }
+}
+
+// Emulate socketpair() on Windows with two TCP sockets connected over the
+// loopback interface.
+//
+// A temporary listening socket is bound to an ephemeral loopback port; one
+// end connects to it and the other end is obtained through accept().
+// On success both sockets are stored in socks[] and 0 is returned. On
+// failure -1 is returned and the Winsock error of the failing step is left
+// in place for WSAGetLastError().
+int _win_socketpair(int socks[2])
+{
+  union {
+     struct sockaddr_in inaddr;
+     struct sockaddr addr;
+  } a;
+  SOCKET listener;
+  int e;
+  socklen_t addrlen = sizeof(a.inaddr);
+  int reuse = 1;
+
+  if (socks == 0) {
+    WSASetLastError(WSAEINVAL);
+    return -1;
+  }
+
+  listener = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+  if (listener == INVALID_SOCKET) {
+    return -1;
+  }
+
+  memset(&a, 0, sizeof(a));
+  a.inaddr.sin_family = AF_INET;
+  a.inaddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+  a.inaddr.sin_port = 0;
+
+  socks[0] = socks[1] = -1;
+  SOCKET s[2] = { INVALID_SOCKET, INVALID_SOCKET };
+
+  do {
+    if (setsockopt(listener, SOL_SOCKET, SO_REUSEADDR,
+           (char*) &reuse, (socklen_t) sizeof(reuse)) == -1)
+      break;
+    if (bind(listener, &a.addr, sizeof(a.inaddr)) == SOCKET_ERROR)
+      break;
+    // Retrieve the port that was actually assigned, so we can connect to it.
+    if (getsockname(listener, &a.addr, &addrlen) == SOCKET_ERROR)
+      break;
+    if (listen(listener, 1) == SOCKET_ERROR)
+      break;
+    s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+    if (s[0] == INVALID_SOCKET)
+      break;
+    if (connect(s[0], &a.addr, sizeof(a.inaddr)) == SOCKET_ERROR)
+      break;
+    s[1] = accept(listener, NULL, NULL);
+    if (s[1] == INVALID_SOCKET)
+      break;
+
+    // The listener has served its purpose. Forget the handle afterwards so
+    // that the error path below cannot close it a second time (the value
+    // might already have been handed out to another thread by then).
+    closesocket(listener);
+    listener = INVALID_SOCKET;
+
+    // The Windows socket API is mostly compatible with the Berkeley
+    // API, with a few exceptions. The Windows socket functions use
+    // SOCKET instead of int. The issue is that on x64 systems,
+    // SOCKET uses 64b while int uses 32b. There's been much debate
+    // whether casting a Windows socket to an int is safe or not.
+    // Worth noting that Windows kernel objects use 32b. For now,
+    // we're just adding a check.
+    //
+    // Ideally, we should update ceph to use the right type but this
+    // can be quite difficult, especially considering that there are
+    // a significant number of functions that accept both sockets and
+    // file descriptors.
+    if (s[0] >> 32 || s[1] >> 32) {
+      WSASetLastError(WSAENAMETOOLONG);
+      break;
+    }
+
+    socks[0] = s[0];
+    socks[1] = s[1];
+
+    return 0;
+
+  } while (0);
+
+  // Error path: remember the failure reason, release whatever is still
+  // open, then restore the error code since closesocket() may clobber it.
+  e = WSAGetLastError();
+  if (listener != INVALID_SOCKET)
+    closesocket(listener);
+  if (s[0] != INVALID_SOCKET)
+    closesocket(s[0]);
+  if (s[1] != INVALID_SOCKET)
+    closesocket(s[1]);
+  WSASetLastError(e);
+  return -1;
+}
+
+// Retrying wrapper around _win_socketpair(): makes up to 15 attempts and
+// sleeps (sleep(2)) after every attempt that failed with WSAEADDRINUSE.
+// Any other outcome ends the loop immediately. Returns the result of the
+// last attempt.
+int win_socketpair(int socks[2]) {
+  int r = 0;
+  int attempt = 0;
+  while (attempt < 15) {
+    r = _win_socketpair(socks);
+    if (r == 0 || WSAGetLastError() != WSAEADDRINUSE) {
+      break;
+    }
+    sleep(2);
+    ++attempt;
+  }
+  return r;
+}
+
+// Returns the page size reported by GetSystemInfo() (dwPageSize).
+unsigned get_page_size() {
+  SYSTEM_INFO info;
+  GetSystemInfo(&info);
+  const unsigned page_size = info.dwPageSize;
+  return page_size;
+}
+
+// Windows replacement for setenv(), implemented with _putenv_s().
+// When `overwrite` is zero and `name` is already present in the
+// environment, nothing is changed and 0 is returned; otherwise the result
+// of _putenv_s() is returned.
+int setenv(const char *name, const char *value, int overwrite) {
+  if (overwrite || !getenv(name)) {
+    return _putenv_s(name, value);
+  }
+  return 0;
+}
+
+// Asks GetModuleFileName() for the path of the current module (NULL module
+// handle), allowing it to use at most buff_length - 1 characters of `path`.
+// Returns GetModuleFileName()'s result.
+ssize_t get_self_exe_path(char* path, int buff_length) {
+  return GetModuleFileName(NULL, path, buff_length - 1);
+}
+
+// Windows stub for geteuid(): always returns 0.
+int geteuid()
+{
+  return 0;
+}
+
+// Windows stub for getegid(): always returns 0.
+int getegid()
+{
+  return 0;
+}
+
+// Windows stub for getuid(): always returns 0.
+int getuid()
+{
+  return 0;
+}
+
+// Windows stub for getgid(): always returns 0.
+int getgid()
+{
+  return 0;
+}
+
+#else
+
+// Returns the page size as reported by sysconf(_SC_PAGESIZE).
+unsigned get_page_size() {
+  const long page_size = sysconf(_SC_PAGESIZE);
+  return page_size;
+}
+
+// Copies the target of the /proc/self/exe symlink (the path of the running
+// executable) into `path`, a buffer of `buff_length` bytes.
+//
+// At most buff_length - 1 bytes are written and, on success, the result is
+// NUL-terminated (readlink() itself does not terminate the string).
+// Returns the number of bytes stored, excluding the terminator, or -1 if
+// the arguments are unusable or readlink() fails.
+ssize_t get_self_exe_path(char* path, int buff_length) {
+  if (path == nullptr || buff_length <= 0) {
+    return -1;
+  }
+  // The limit has to come from the caller's buffer size;
+  // sizeof(buff_length) would merely be the size of an int.
+  const ssize_t len = readlink("/proc/self/exe", path, buff_length - 1);
+  if (len >= 0) {
+    path[len] = '\0';
+  }
+  return len;
+}
+
+#endif /* _WIN32 */
diff --git a/src/common/compiler_extensions.h b/src/common/compiler_extensions.h
new file mode 100644
index 000000000..2fd8f5c2d
--- /dev/null
+++ b/src/common/compiler_extensions.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMPILER_EXTENSIONS_H
+#define CEPH_COMPILER_EXTENSIONS_H
+
+/* We should be able to take advantage of nice nonstandard features of gcc
+ * and other compilers, but still maintain portability.
+ */
+
+// WARN_UNUSED_RESULT: expands to the warn_unused_result attribute when
+// __GNUC__ is defined and to nothing otherwise.
+#ifdef __GNUC__
+// GCC
+#define WARN_UNUSED_RESULT __attribute__((warn_unused_result))
+#else
+// some other compiler - just make it a no-op
+#define WARN_UNUSED_RESULT
+#endif
+
+#endif
diff --git a/src/common/condition_variable_debug.cc b/src/common/condition_variable_debug.cc
new file mode 100644
index 000000000..376fd12d4
--- /dev/null
+++ b/src/common/condition_variable_debug.cc
@@ -0,0 +1,79 @@
+#include "condition_variable_debug.h"
+#include "common/mutex_debug.h"
+
+namespace ceph {
+
+// Initialises the underlying pthread condition variable with default
+// attributes. No mutex is associated yet (waiter_mutex is null).
+// Throws std::system_error if pthread_cond_init() reports an error.
+condition_variable_debug::condition_variable_debug()
+  : waiter_mutex{nullptr}
+{
+  if (const int rc = pthread_cond_init(&cond, nullptr); rc != 0) {
+    throw std::system_error(rc, std::generic_category());
+  }
+}
+
+// Destroys the underlying pthread condition variable; the result of
+// pthread_cond_destroy() is ignored.
+condition_variable_debug::~condition_variable_debug()
+{
+  pthread_cond_destroy(&cond);
+}
+
+// Blocks on the condition variable using the mutex owned by `lock`.
+// Asserts that this condition variable is only ever paired with a single
+// mutex and that the mutex is locked on entry. The mutex's debug
+// bookkeeping is updated around the wait (_pre_unlock / _post_lock).
+// Throws std::system_error if pthread_cond_wait() fails.
+void condition_variable_debug::wait(std::unique_lock<mutex_debug>& lock)
+{
+  mutex_debug* const mtx = lock.mutex();
+  // make sure this cond is used with one mutex only
+  ceph_assert(waiter_mutex == nullptr ||
+         waiter_mutex == mtx);
+  waiter_mutex = mtx;
+  ceph_assert(waiter_mutex->is_locked());
+
+  waiter_mutex->_pre_unlock();
+  const int rc = pthread_cond_wait(&cond, waiter_mutex->native_handle());
+  if (rc != 0) {
+    throw std::system_error(rc, std::generic_category());
+  }
+  waiter_mutex->_post_lock();
+}
+
+// Wakes one waiter. If a mutex has already been associated by a previous
+// wait, asserts that it is currently locked.
+// Throws std::system_error if pthread_cond_signal() fails.
+void condition_variable_debug::notify_one()
+{
+  // make sure signaler is holding the waiter's lock.
+  ceph_assert(waiter_mutex == nullptr ||
+         waiter_mutex->is_locked());
+  const int rc = pthread_cond_signal(&cond);
+  if (rc != 0) {
+    throw std::system_error(rc, std::generic_category());
+  }
+}
+
+// Wakes all waiters.
+// With sloppy == false this asserts that the associated mutex (if any) is
+// locked and throws std::system_error if pthread_cond_broadcast() fails.
+// With sloppy == true both the lock check and the error are skipped.
+void condition_variable_debug::notify_all(bool sloppy)
+{
+  const bool strict = !sloppy;
+  if (strict) {
+    // make sure signaler is holding the waiter's lock.
+    ceph_assert(waiter_mutex == nullptr ||
+                waiter_mutex->is_locked());
+  }
+  const int rc = pthread_cond_broadcast(&cond);
+  if (rc != 0 && strict) {
+    throw std::system_error(rc, std::generic_category());
+  }
+}
+
+// Waits on the condition variable until it is signalled or the absolute
+// deadline `*ts` passes, using pthread_cond_timedwait().
+// Asserts the same single-mutex / locked-mutex invariants as wait().
+// Returns no_timeout when woken, timeout on ETIMEDOUT, and throws
+// std::system_error for any other result.
+std::cv_status condition_variable_debug::_wait_until(mutex_debug* mutex,
+                                                     timespec* ts)
+{
+  // make sure this cond is used with one mutex only
+  ceph_assert(waiter_mutex == nullptr ||
+         waiter_mutex == mutex);
+  waiter_mutex = mutex;
+  ceph_assert(waiter_mutex->is_locked());
+
+  waiter_mutex->_pre_unlock();
+  const int rc =
+    pthread_cond_timedwait(&cond, waiter_mutex->native_handle(), ts);
+  waiter_mutex->_post_lock();
+
+  if (rc == 0) {
+    return std::cv_status::no_timeout;
+  }
+  if (rc == ETIMEDOUT) {
+    return std::cv_status::timeout;
+  }
+  throw std::system_error(rc, std::generic_category());
+}
+
+} // namespace ceph
diff --git a/src/common/condition_variable_debug.h b/src/common/condition_variable_debug.h
new file mode 100644
index 000000000..0c5d90ac8
--- /dev/null
+++ b/src/common/condition_variable_debug.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <condition_variable>
+#include <ctime>
+#include <pthread.h>
+#include "common/ceph_time.h"
+
+namespace ceph {
+
+namespace mutex_debug_detail {
+  template<bool> class mutex_debug_impl;
+}
+
+// Condition variable built directly on pthread_cond_t for use with
+// mutex_debug. It remembers the mutex used by waiters (waiter_mutex).
+// Copying and assignment are disabled.
+class condition_variable_debug {
+  using mutex_debug = mutex_debug_detail::mutex_debug_impl<false>;
+
+  pthread_cond_t cond;
+  // mutex used by waiters; null until the first wait
+  mutex_debug* waiter_mutex;
+
+  condition_variable_debug&
+  operator=(const condition_variable_debug&) = delete;
+  condition_variable_debug(const condition_variable_debug&) = delete;
+
+public:
+  condition_variable_debug();
+  ~condition_variable_debug();
+  // Block until notified; `lock` supplies the mutex to wait with.
+  void wait(std::unique_lock<mutex_debug>& lock);
+  // Keep waiting until pred() returns true; pred() is checked before the
+  // first wait and after every wake-up.
+  template<class Predicate>
+  void wait(std::unique_lock<mutex_debug>& lock, Predicate pred) {
+    while (!pred()) {
+      wait(lock);
+    }
+  }
+  // Wait until notified or until the time point `when` is reached.
+  // For steady clocks the deadline is translated into a ceph::real_clock
+  // time by adding the remaining duration to real_clock::now(); for other
+  // clocks Clock::to_timespec(when) is used as is.
+  template<class Clock, class Duration>
+  std::cv_status wait_until(
+    std::unique_lock<mutex_debug>& lock,
+    const std::chrono::time_point<Clock, Duration>& when) {
+    if constexpr (Clock::is_steady) {
+      // convert from mono_clock to real_clock
+      auto real_when = ceph::real_clock::now();
+      const auto delta = when - Clock::now();
+      real_when += std::chrono::ceil<typename Clock::duration>(delta);
+      timespec ts = ceph::real_clock::to_timespec(real_when);
+      return _wait_until(lock.mutex(), &ts);
+    } else {
+      timespec ts = Clock::to_timespec(when);
+      return _wait_until(lock.mutex(), &ts);
+    }
+  }
+  // Wait until notified or until `awhile` has elapsed, measured from
+  // ceph::real_clock::now() at the time of the call.
+  template<class Rep, class Period>
+  std::cv_status wait_for(
+    std::unique_lock<mutex_debug>& lock,
+    const std::chrono::duration<Rep, Period>& awhile) {
+    ceph::real_time when{ceph::real_clock::now()};
+    when += awhile;
+    timespec ts = ceph::real_clock::to_timespec(when);
+    return _wait_until(lock.mutex(), &ts);
+  }
+  // Wait until pred() becomes true or `awhile` has elapsed. The deadline is
+  // computed once up front. Returns true if pred() was satisfied, otherwise
+  // the value of pred() evaluated after the timeout.
+  template<class Rep, class Period, class Pred>
+  bool wait_for(
+    std::unique_lock<mutex_debug>& lock,
+    const std::chrono::duration<Rep, Period>& awhile,
+    Pred pred) {
+    ceph::real_time when{ceph::real_clock::now()};
+    when += awhile;
+    timespec ts = ceph::real_clock::to_timespec(when);
+    while (!pred()) {
+      if ( _wait_until(lock.mutex(), &ts) == std::cv_status::timeout) {
+        return pred();
+      }
+    }
+    return true;
+  }
+  void notify_one();
+  // `sloppy` defaults to false; its effect is defined by the out-of-line
+  // implementation.
+  void notify_all(bool sloppy = false);
+private:
+  // Shared implementation of the timed waits; `ts` is an absolute deadline.
+  std::cv_status _wait_until(mutex_debug* mutex, timespec* ts);
+};
+
+} // namespace ceph
diff --git a/src/common/config.cc b/src/common/config.cc
new file mode 100644
index 000000000..c8101587b
--- /dev/null
+++ b/src/common/config.cc
@@ -0,0 +1,1577 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <filesystem>
+#include "common/ceph_argparse.h"
+#include "common/common_init.h"
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "include/str_list.h"
+#include "include/stringify.h"
+#include "osd/osd_types.h"
+#include "common/errno.h"
+#include "common/hostname.h"
+#include "common/dout.h"
+
+/* Don't use standard Ceph logging in this file.
+ * We can't use logging until it's initialized, and a lot of the necessary
+ * initialization happens here.
+ */
+#undef dout
+#undef pdout
+#undef derr
+#undef generic_dout
+
+// see set_mon_vals()
+#define dout_subsys ceph_subsys_monc
+
+namespace fs = std::filesystem;
+
+using std::cerr;
+using std::cout;
+using std::map;
+using std::less;
+using std::list;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::string;
+using std::string_view;
+using std::vector;
+
+using ceph::bufferlist;
+using ceph::decode;
+using ceph::encode;
+using ceph::Formatter;
+
+// Comma separated list of config file locations that get_conffile_paths()
+// falls back to when no explicit list is given and CEPH_CONF is not set.
+// Entries may contain $-prefixed metavariables (e.g. $cluster); FreeBSD and
+// Windows builds append one extra platform-specific location.
+static const char *CEPH_CONF_FILE_DEFAULT = "$data_dir/config,/etc/ceph/$cluster.conf,$home/.ceph/$cluster.conf,$cluster.conf"
+#if defined(__FreeBSD__)
+    ",/usr/local/etc/ceph/$cluster.conf"
+#elif defined(_WIN32)
+    ",$programdata/ceph/$cluster.conf"
+#endif
+    ;
+
+// Two-step stringification: STRINGIFY(x) expands macros in x first and
+// only then turns the result into a string literal via _STR.
+#define _STR(x) #x
+#define STRINGIFY(x) _STR(x)
+
+// Maps a CONF_* level constant to its short textual name.
+// Unknown levels yield "???".
+const char *ceph_conf_level_name(int level)
+{
+  const char *label = "???";
+  switch (level) {
+  case CONF_DEFAULT:            // built-in default
+    label = "default";
+    break;
+  case CONF_MON:                // monitor config database
+    label = "mon";
+    break;
+  case CONF_ENV:                // process environment (CEPH_ARGS)
+    label = "env";
+    break;
+  case CONF_FILE:               // ceph.conf file
+    label = "file";
+    break;
+  case CONF_CMDLINE:            // process command line args
+    label = "cmdline";
+    break;
+  case CONF_OVERRIDE:           // injectargs or 'config set' at runtime
+    label = "override";
+    break;
+  case CONF_FINAL:
+    label = "final";
+    break;
+  default:
+    break;
+  }
+  return label;
+}
+
+// Finds the first file in `filename_list` (entries separated by ';' or
+// ',') that can be opened for reading.
+// On success stores that entry in `result` and returns 0. Otherwise
+// returns -errno of the last failed open(), or -ENOENT if nothing was
+// tried.
+int ceph_resolve_file_search(const std::string& filename_list,
+			     std::string& result)
+{
+  list<string> candidates;
+  get_str_list(filename_list, ";,", candidates);
+
+  int last_error = -ENOENT;
+  for (const auto& candidate : candidates) {
+    const int fd = ::open(candidate.c_str(), O_RDONLY|O_CLOEXEC);
+    if (fd >= 0) {
+      // Only checking that the file can be opened; we don't keep it open.
+      close(fd);
+      result = candidate;
+      return 0;
+    }
+    last_error = -errno;
+  }
+
+  return last_error;
+}
+
+// Renders an option value as text into `*out`.
+// Returns -ENOENT (leaving `*out` untouched) when `v` equals a
+// default-constructed value_t, 0 otherwise.
+static int conf_stringify(const Option::value_t& v, string *out)
+{
+  const bool is_unset = (v == Option::value_t{});
+  if (!is_unset) {
+    *out = Option::to_str(v);
+    return 0;
+  }
+  return -ENOENT;
+}
+
+// Builds the option schema and seeds `values` with defaults.
+//
+// Steps, in order:
+//  1. load the compiled-in ceph_options into `schema` (abort on duplicates)
+//  2. synthesise one "debug_<subsys>" string option per logging subsystem
+//  3. build the legacy name -> ConfigValues member map
+//  4. sanity check the schema (validate_schema)
+//  5. run every string option's default through its validator, aborting on
+//     invalid defaults and storing normalised forms
+//  6. copy the resulting defaults into the legacy members
+//
+// `is_daemon` selects the daemon-specific default where an option has one.
+md_config_t::md_config_t(ConfigValues& values,
+			 const ConfigTracker& tracker,
+			 bool is_daemon)
+  : is_daemon(is_daemon)
+{
+  // Load the compile-time list of Option into
+  // a map so that we can resolve keys quickly.
+  for (const auto &i : ceph_options) {
+    if (schema.count(i.name)) {
+      // We may be instantiated pre-logging, so report this on std::cerr.
+      std::cerr << "Duplicate config key in schema: '" << i.name << "'"
+                << std::endl;
+      ceph_abort();
+    }
+    schema.emplace(i.name, i);
+  }
+
+  // Define the debug_* options as well.
+  subsys_options.reserve(values.subsys.get_num());
+  for (unsigned i = 0; i < values.subsys.get_num(); ++i) {
+    string name = string("debug_") + values.subsys.get_name(i);
+    subsys_options.push_back(
+      Option(name, Option::TYPE_STR, Option::LEVEL_ADVANCED));
+    Option& opt = subsys_options.back();
+    // default is "<log level>/<gather level>" of the subsystem
+    opt.set_default(stringify(values.subsys.get_log_level(i)) + "/" +
+		    stringify(values.subsys.get_gather_level(i)));
+    string desc = string("Debug level for ") + values.subsys.get_name(i);
+    opt.set_description(desc.c_str());
+    opt.set_flag(Option::FLAG_RUNTIME);
+    opt.set_long_description("The value takes the form 'N' or 'N/M' where N and M are values between 0 and 99.  N is the debug level to log (all values below this are included), and M is the level to gather and buffer in memory.  In the event of a crash, the most recent items <= M are dumped to the log file.");
+    opt.set_subsys(i);
+    // Accepts "A" or "A/B" with both numbers in [0, 99]; a lone "A" is
+    // rewritten to "A/A". (Here `m` holds the first number, `n` the second.)
+    opt.set_validator([](std::string *value, std::string *error_message) {
+	int m, n;
+	int r = sscanf(value->c_str(), "%d/%d", &m, &n);
+	if (r >= 1) {
+	  if (m < 0 || m > 99) {
+	    *error_message = "value must be in range [0, 99]";
+	    return -ERANGE;
+	  }
+	  if (r == 2) {
+	    if (n < 0 || n > 99) {
+	      *error_message = "value must be in range [0, 99]";
+	      return -ERANGE;
+	    }
+	  } else {
+	    // normalize to M/N
+	    n = m;
+	    *value = stringify(m) + "/" + stringify(n);
+	  }
+	} else {
+	  *error_message = "value must take the form N or N/M, where N and M are integers";
+	  return -EINVAL;
+	}
+	return 0;
+      });
+  }
+  // Register the debug_* options only after the vector is fully built.
+  for (auto& opt : subsys_options) {
+    schema.emplace(opt.name, opt);
+  }
+
+  // Populate list of legacy_values according to the OPTION() definitions
+  // Note that this is just setting up our map of name->member ptr.  The
+  // default values etc will get loaded in along with new-style data,
+  // as all loads write to both the values map, and the legacy
+  // members if present.
+  legacy_values = {
+#define OPTION(name, type) \
+    {STRINGIFY(name), &ConfigValues::name},
+#define SAFE_OPTION(name, type) OPTION(name, type)
+#include "options/legacy_config_opts.h"
+#undef OPTION
+#undef SAFE_OPTION
+  };
+
+  validate_schema();
+
+  // Validate default values from the schema
+  for (const auto &i : schema) {
+    const Option &opt = i.second;
+    if (opt.type == Option::TYPE_STR) {
+      bool has_daemon_default = (opt.daemon_value != Option::value_t{});
+      Option::value_t default_val;
+      if (is_daemon && has_daemon_default) {
+	default_val = opt.daemon_value;
+      } else {
+	default_val = opt.value;
+      }
+      // We call pre_validate as a sanity check, but also to get any
+      // side effect (value modification) from the validator.
+      // NOTE(review): def_str is dereferenced unconditionally, so this
+      // assumes every TYPE_STR option holds a std::string default - confirm.
+      auto* def_str = std::get_if<std::string>(&default_val);
+      std::string val = *def_str;
+      std::string err;
+      if (opt.pre_validate(&val, &err) != 0) {
+        std::cerr << "Default value " << opt.name << "=" << *def_str << " is "
+                     "invalid: " << err << std::endl;
+
+        // This is the compiled-in default that is failing its own option's
+        // validation, so this is super-invalid and should never make it
+        // past a pull request: crash out.
+        ceph_abort();
+      }
+      if (val != *def_str) {
+	// if the validator normalizes the string into a different form than
+	// what was compiled in, use that.
+	set_val_default(values, tracker, opt.name, val);
+      }
+    }
+  }
+
+  // Copy out values (defaults) into any legacy (C struct member) fields
+  update_legacy_vals(values);
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+md_config_t::~md_config_t() = default;
+
+/**
+ * Consistency check of the schema.  Any violation aborts the process, so
+ * that a broken option table can never survive testing:
+ *  - every see_also entry must name an existing option
+ *  - every legacy field must have a matching schema entry
+ */
+void md_config_t::validate_schema()
+{
+  for (const auto &entry : schema) {
+    const auto &opt = entry.second;
+    for (const auto &ref : opt.see_also) {
+      if (schema.count(ref) != 0) {
+        continue;
+      }
+      std::cerr << "Non-existent see-also key '" << ref
+                << "' on option '" << opt.name << "'" << std::endl;
+      ceph_abort();
+    }
+  }
+
+  for (const auto &legacy : legacy_values) {
+    if (schema.count(legacy.first) != 0) {
+      continue;
+    }
+    std::cerr << "Schema is missing legacy field '" << legacy.first << "'"
+              << std::endl;
+    ceph_abort();
+  }
+}
+
+// Looks up an option by name in the schema.
+// Returns a pointer to the schema entry, or nullptr if the name is unknown.
+const Option *md_config_t::find_option(const std::string_view name) const
+{
+  const auto it = schema.find(name);
+  return it != schema.end() ? &it->second : nullptr;
+}
+
+// Sets `val` as the CONF_DEFAULT level value of option `name`.
+// Asserts that the option exists and that setting the value succeeds.
+void md_config_t::set_val_default(ConfigValues& values,
+				  const ConfigTracker& tracker,
+				  const string_view name, const std::string& val)
+{
+  const Option *opt = find_option(name);
+  ceph_assert(opt);
+  string error;
+  const int rc = _set_val(values, tracker, val, *opt, CONF_DEFAULT, &error);
+  ceph_assert(rc >= 0);
+}
+
+// Applies the key/value set `kv` at the CONF_MON level.
+//
+// For every entry: the optional `config_cb` may consume it; unknown option
+// names are skipped; options flagged FLAG_NO_MON_UPDATE and values that
+// fail to be set are recorded in ignored_mon_values. Afterwards every
+// CONF_MON value whose name is absent from `kv` is removed again.
+// Always returns 0.
+int md_config_t::set_mon_vals(CephContext *cct,
+    ConfigValues& values,
+    const ConfigTracker& tracker,
+    const map<string,string,less<>>& kv,
+    config_callback config_cb)
+{
+  ignored_mon_values.clear();
+
+  if (!config_cb) {
+    ldout(cct, 4) << __func__ << " no callback set" << dendl;
+  }
+
+  for (auto& i : kv) {
+    if (config_cb) {
+      // a true result means the callback took care of this key
+      if (config_cb(i.first, i.second)) {
+	ldout(cct, 4) << __func__ << " callback consumed " << i.first << dendl;
+	continue;
+      }
+      ldout(cct, 4) << __func__ << " callback ignored " << i.first << dendl;
+    }
+    const Option *o = find_option(i.first);
+    if (!o) {
+      ldout(cct,10) << __func__ << " " << i.first << " = " << i.second
+		    << " (unrecognized option)" << dendl;
+      continue;
+    }
+    if (o->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+      ignored_mon_values.emplace(i);
+      continue;
+    }
+    std::string err;
+    int r = _set_val(values, tracker, i.second, *o, CONF_MON, &err);
+    if (r < 0) {
+      ldout(cct, 4) << __func__ << " failed to set " << i.first << " = "
+		    << i.second << ": " << err << dendl;
+      ignored_mon_values.emplace(i);
+    } else if (r == ConfigValues::SET_NO_CHANGE ||
+	       r == ConfigValues::SET_NO_EFFECT) {
+      ldout(cct,20) << __func__ << " " << i.first << " = " << i.second
+		    << " (no change)" << dendl;
+    } else if (r == ConfigValues::SET_HAVE_EFFECT) {
+      ldout(cct,10) << __func__ << " " << i.first << " = " << i.second << dendl;
+    } else {
+      // any other non-negative result is unexpected
+      ceph_abort();
+    }
+  }
+  // Drop CONF_MON values that are no longer part of kv.
+  values.for_each([&] (auto name, auto configs) {
+    auto config = configs.find(CONF_MON);
+    if (config == configs.end()) {
+      return;
+    }
+    if (kv.find(name) != kv.end()) {
+      return;
+    }
+    ldout(cct,10) << __func__ << " " << name
+		  << " cleared (was " << Option::to_str(config->second) << ")"
+		  << dendl;
+    values.rm_val(name, CONF_MON);
+    // if this is a debug option, it needs to propagate to the subsys;
+    // this isn't covered by update_legacy_vals() below.  similarly,
+    // we want to trigger a config notification for these items.
+    // NOTE(review): `o` is dereferenced without a null check; presumably
+    // every stored name is present in the schema - confirm.
+    const Option *o = find_option(name);
+    _refresh(values, *o);
+  });
+  values_bl.clear();
+  update_legacy_vals(values);
+  return 0;
+}
+
+// Reads configuration from the first usable file among the candidate paths
+// produced by get_conffile_paths().
+//
+// Returns -ENOSYS once safe_to_start_threads is set, the parse_buffer()
+// error if a file could be read but parsing failed with something other
+// than -ENOENT, -ENOENT if no file was loaded (conf_path is empty), and 0
+// on success. The message of the most recent failure is kept in
+// parse_error; on success conf_path names the file that was used.
+int md_config_t::parse_config_files(ConfigValues& values,
+				    const ConfigTracker& tracker,
+				    const char *conf_files_str,
+				    std::ostream *warnings,
+				    int flags)
+{
+  if (safe_to_start_threads)
+    return -ENOSYS;
+
+  // Without an explicit file list the cluster name falls back to the
+  // default right away; it is needed to expand the candidate paths.
+  if (values.cluster.empty() && !conf_files_str) {
+    values.cluster = get_cluster_name(nullptr);
+  }
+  // open new conf
+  for (auto& fn : get_conffile_paths(values, conf_files_str, warnings, flags)) {
+    bufferlist bl;
+    std::string error;
+    // unreadable candidate: remember why and try the next one
+    if (bl.read_file(fn.c_str(), &error)) {
+      parse_error = error;
+      continue;
+    }
+    ostringstream oss;
+    int ret = parse_buffer(values, tracker, bl.c_str(), bl.length(), &oss);
+    if (ret == 0) {
+      parse_error.clear();
+      conf_path = fn;
+      break;
+    }
+    parse_error = oss.str();
+    if (ret != -ENOENT) {
+      return ret;
+    }
+  }
+  // it must have been all ENOENTs, that's the only way we got here
+  if (conf_path.empty()) {
+    return -ENOENT;
+  }
+  // derive the cluster name from the file that was actually loaded
+  if (values.cluster.empty()) {
+    values.cluster = get_cluster_name(conf_path.c_str());
+  }
+  update_legacy_vals(values);
+  return 0;
+}
+
+// Parses config file text from `buf` (`len` bytes) and applies every option
+// found in the sections relevant to this entity at the CONF_FILE level.
+// Returns -EINVAL if the text cannot be parsed, 0 otherwise. Options whose
+// value is rejected do not fail the call; they are reported on `warnings`
+// when that stream is provided.
+int
+md_config_t::parse_buffer(ConfigValues& values,
+			  const ConfigTracker& tracker,
+			  const char* buf, size_t len,
+			  std::ostream* warnings)
+{
+  const bool parsed = cf.parse_buffer(string_view{buf, len}, warnings);
+  if (!parsed) {
+    return -EINVAL;
+  }
+  const auto my_sections = get_my_sections(values);
+  for (const auto &entry : schema) {
+    const auto &opt = entry.second;
+    std::string val;
+    if (!_get_val_from_conf_file(my_sections, opt.name, val)) {
+      // the file provides a value for this option
+      std::string error_message;
+      const int rc =
+        _set_val(values, tracker, val, opt, CONF_FILE, &error_message);
+      if (rc < 0 && warnings != nullptr) {
+        *warnings << "parse error setting " << std::quoted(opt.name)
+                  << " to " << std::quoted(val);
+        if (!error_message.empty()) {
+          *warnings << " (" << error_message << ")";
+        }
+        *warnings << '\n';
+      }
+    }
+  }
+  cf.check_old_style_section_names({"mds", "mon", "osd"}, cerr);
+  return 0;
+}
+
+// Produces the ordered list of candidate config file paths.
+//
+// The list comes from `conf_files_str` if given, else from the CEPH_CONF
+// environment variable, else from the built-in default list (unless
+// CINIT_FLAG_NO_DEFAULT_CONFIG_FILE is set, in which case the result is
+// empty). Entries are separated by ';' or ','. Entries mentioning
+// $data_dir are dropped while data_dir_option is empty; all others get
+// their metavariables expanded.
+std::list<std::string>
+md_config_t::get_conffile_paths(const ConfigValues& values,
+				const char *conf_files_str,
+				std::ostream *warnings,
+				int flags) const
+{
+  if (!conf_files_str) {
+    conf_files_str = getenv("CEPH_CONF");
+  }
+  if (!conf_files_str) {
+    if (flags & CINIT_FLAG_NO_DEFAULT_CONFIG_FILE) {
+      return {};
+    }
+    conf_files_str = CEPH_CONF_FILE_DEFAULT;
+  }
+
+  std::list<std::string> paths;
+  get_str_list(conf_files_str, ";,", paths);
+  auto it = paths.begin();
+  while (it != paths.end()) {
+    string& path = *it;
+    const bool wants_data_dir = path.find("$data_dir") != path.npos;
+    if (wants_data_dir && data_dir_option.empty()) {
+      // useless $data_dir item, skip
+      it = paths.erase(it);
+      continue;
+    }
+    early_expand_meta(values, path, warnings);
+    ++it;
+  }
+  return paths;
+}
+
+// Derives the cluster name from a config file path.
+// A file named "<something>.conf" yields "<something>"; a null path or a
+// file name not following the $cluster.conf convention yields "ceph".
+std::string md_config_t::get_cluster_name(const char* conffile)
+{
+  if (conffile) {
+    // If cluster name is not set yet, use the prefix of the
+    // basename of configuration file as cluster name.
+    const fs::path path{conffile};
+    if (path.extension() == ".conf") {
+      return path.stem().string();
+    }
+  }
+  // Either no configuration file was specified or it does not follow
+  // the $cluster.conf convention: fall back to 'ceph'.
+  return "ceph";
+}
+
+// Applies configuration coming from the process environment at the
+// CONF_ENV level:
+//  - CEPH_KEYRING sets "keyring"
+//  - CEPH_LIB sets "erasure_code_dir", "plugin_dir" and "osd_class_dir"
+//  - POD_MEMORY_LIMIT / POD_MEMORY_REQUEST influence "osd_memory_target"
+//    (OSD entities only, see below)
+//  - the variable named by `args_var` (default "CEPH_ARGS") is parsed like
+//    command line arguments
+// Does nothing once safe_to_start_threads is set. Errors reported by
+// individual _set_val() calls are ignored.
+void md_config_t::parse_env(unsigned entity_type,
+			    ConfigValues& values,
+			    const ConfigTracker& tracker,
+			    const char *args_var)
+{
+  if (safe_to_start_threads)
+    return;
+  if (!args_var) {
+    args_var = "CEPH_ARGS";
+  }
+  if (auto s = getenv("CEPH_KEYRING"); s) {
+    string err;
+    const Option *o = find_option("keyring");
+    ceph_assert(o);
+    _set_val(values, tracker, s, *o, CONF_ENV, &err);
+  }
+  if (auto dir = getenv("CEPH_LIB"); dir) {
+    for (auto name : { "erasure_code_dir", "plugin_dir", "osd_class_dir" }) {
+      std::string err;
+      const Option *o = find_option(name);
+      ceph_assert(o);
+      _set_val(values, tracker, dir, *o, CONF_ENV, &err);
+    }
+  }
+
+  // Apply pod memory limits:
+  //
+  // There are two types of resource requests: `limits` and `requests`.
+  //
+  // - Requests: Used by the K8s scheduler to determine on which nodes to
+  //   schedule the pods. This helps spread the pods to different nodes. This
+  //   value should be conservative in order to make sure all the pods are
+  //   schedulable. This corresponds to POD_MEMORY_REQUEST (set by the Rook
+  //   CRD) and is the target memory utilization we try to maintain for daemons
+  //   that respect it.
+  //
+  //   If POD_MEMORY_REQUEST is present, we use it as the target.
+  //
+  // - Limits: At runtime, the container runtime (and Linux) will use the
+  //   limits to see if the pod is using too many resources. In that case, the
+  //   pod will be killed/restarted automatically if the pod goes over the limit.
+  //   This should be higher than what is specified for requests (potentially
+  //   much higher). This corresponds to the cgroup memory limit that will
+  //   trigger the Linux OOM killer.
+  //
+  //   If POD_MEMORY_LIMIT is present, we use it as the /default/ value for
+  //   the target, which means it will only apply if the *_memory_target option
+  //   isn't set via some other path (e.g., POD_MEMORY_REQUEST, or the cluster
+  //   config, or whatever.)
+  //
+  // Here are the documented best practices:
+  //   https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#motivation-for-cpu-requests-and-limits
+  //
+  // When the operator creates the CephCluster CR, it will need to generate the
+  // desired requests and limits. As long as we are conservative in our choice
+  // for requests and generous with the limits we should be in a good place to
+  // get started.
+  //
+  // The support in Rook is already there for applying the limits as seen in
+  // these links.
+  //
+  // Rook docs on the resource requests and limits:
+  //   https://rook.io/docs/rook/v1.0/ceph-cluster-crd.html#cluster-wide-resources-configuration-settings
+  // Example CR settings:
+  //   https://github.com/rook/rook/blob/6d2ef936698593036185aabcb00d1d74f9c7bfc1/cluster/examples/kubernetes/ceph/cluster.yaml#L90
+  //
+  uint64_t pod_limit = 0, pod_request = 0;
+  if (auto pod_lim = getenv("POD_MEMORY_LIMIT"); pod_lim) {
+    // Zero, unparsable and negative values are ignored; a negative number
+    // would otherwise wrap around to an enormous unsigned limit.
+    if (const long long v = atoll(pod_lim);
+	v > 0 && entity_type == CEPH_ENTITY_TYPE_OSD) {
+      double cgroup_ratio = get_val<double>(
+	values, "osd_memory_target_cgroup_limit_ratio");
+      if (cgroup_ratio > 0.0) {
+	pod_limit = v * cgroup_ratio;
+	// set osd_memory_target *default* based on cgroup limit, so that
+	// it can be overridden by any explicit settings elsewhere.
+	set_val_default(values, tracker,
+			"osd_memory_target", stringify(pod_limit));
+      }
+    }
+  }
+  if (auto pod_req = getenv("POD_MEMORY_REQUEST"); pod_req) {
+    // same filtering as for the limit above
+    if (const long long v = atoll(pod_req); v > 0) {
+      pod_request = v;
+    }
+  }
+  if (pod_request && pod_limit) {
+    // If both LIMIT and REQUEST are set, ensure that we use the
+    // min of request and limit*ratio.  This is important
+    // because k8s will set LIMIT == REQUEST if only LIMIT is
+    // specified, and we want to apply the ratio in that case,
+    // even though REQUEST is present.
+    pod_request = std::min<uint64_t>(pod_request, pod_limit);
+  }
+  if (pod_request && entity_type == CEPH_ENTITY_TYPE_OSD) {
+    string err;
+    const Option *o = find_option("osd_memory_target");
+    ceph_assert(o);
+    _set_val(values, tracker, stringify(pod_request), *o, CONF_ENV, &err);
+  }
+
+  if (getenv(args_var)) {
+    vector<const char *> env_args;
+    env_to_vec(env_args, args_var);
+    parse_argv(values, tracker, env_args, CONF_ENV);
+  }
+}
+
+// Writes the configuration as text to `out` (delegates to _show_config()
+// without a Formatter).
+void md_config_t::show_config(const ConfigValues& values,
+			      std::ostream& out) const
+{
+  _show_config(values, &out, nullptr);
+}
+
+// Dumps the configuration through the Formatter `f` (delegates to
+// _show_config() without an output stream).
+void md_config_t::show_config(const ConfigValues& values,
+			      Formatter *f) const
+{
+  _show_config(values, nullptr, f);
+}
+
+// Dumps every option of the schema as an "option" object inside an
+// "options" array section of `f`.
+void md_config_t::config_options(Formatter *f) const
+{
+  f->open_array_section("options");
+  for (const auto& entry : schema) {
+    const auto& opt = entry.second;
+    f->dump_object("option", opt);
+  }
+  f->close_section();
+}
+
+// Shared implementation of show_config(): emits the entity name, the
+// cluster name and then every schema option with its current value.
+// Output goes to `out` as "key = value" lines and/or to `f` as string
+// fields; either sink may be null. Options without a value are emitted
+// with an empty string.
+void md_config_t::_show_config(const ConfigValues& values,
+			       std::ostream *out, Formatter *f) const
+{
+  if (out) {
+    *out << "name = " << values.name << std::endl;
+    *out << "cluster = " << values.cluster << std::endl;
+  }
+  if (f) {
+    f->dump_string("name", stringify(values.name));
+    f->dump_string("cluster", values.cluster);
+  }
+  for (const auto& entry : schema) {
+    const Option &opt = entry.second;
+    string rendered;
+    // the result is deliberately ignored: `rendered` stays empty if unset
+    conf_stringify(_get_val(values, opt), &rendered);
+    if (out) {
+      *out << opt.name << " = " << rendered << std::endl;
+    }
+    if (f) {
+      f->dump_string(opt.name.c_str(), rendered);
+    }
+  }
+}
+
+/**
+ * Parse command-line style arguments into the configuration.
+ *
+ * Refuses to run (returns -ENOSYS) once safe_to_start_threads is set.
+ * Scanning stops at a literal "--", which is deliberately left in `args`.
+ * A handful of well-known flags are handled directly below; every other
+ * argument is handed to parse_option(), and a negative return from it
+ * aborts the parse.
+ *
+ * @param values  configuration values to update
+ * @param tracker observer tracker forwarded to the setters
+ * @param args    argument vector being scanned
+ *                NOTE(review): the ceph_argparse_* helpers presumably erase
+ *                what they consume and keep `i` valid -- confirm
+ * @param level   CONF_* level passed to parse_option() for generic options
+ * @return 0 on success, -ENOSYS if safe_to_start_threads is set, or the
+ *         negative error returned by parse_option()
+ */
+int md_config_t::parse_argv(ConfigValues& values,
+			    const ConfigTracker& tracker,
+			    std::vector<const char*>& args, int level)
+{
+  if (safe_to_start_threads) {
+    return -ENOSYS;
+  }
+
+  // In this function, don't change any parts of the configuration directly.
+  // Instead, use set_val to set them. This will allow us to send the proper
+  // observer notifications later.
+  std::string val;
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (strcmp(*i, "--") == 0) {
+      /* Normally we would use ceph_argparse_double_dash. However, in this
+       * function we *don't* want to remove the double dash, because later
+       * argument parses will still need to see it. */
+      break;
+    }
+    else if (ceph_argparse_flag(args, i, "--show_conf", (char*)NULL)) {
+      // print the `cf` member to stderr and terminate immediately
+      cerr << cf << std::endl;
+      _exit(0);
+    }
+    else if (ceph_argparse_flag(args, i, "--show_config", (char*)NULL)) {
+      // only recorded here; acted upon later by do_argv_commands()
+      do_show_config = true;
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--show_config_value", (char*)NULL)) {
+      // only recorded here; acted upon later by do_argv_commands()
+      do_show_config_value = val;
+    }
+    else if (ceph_argparse_flag(args, i, "--no-mon-config", (char*)NULL)) {
+      values.no_mon_config = true;
+    }
+    else if (ceph_argparse_flag(args, i, "--mon-config", (char*)NULL)) {
+      values.no_mon_config = false;
+    }
+    else if (ceph_argparse_flag(args, i, "--foreground", "-f", (char*)NULL)) {
+      set_val_or_die(values, tracker, "daemonize", "false");
+    }
+    else if (ceph_argparse_flag(args, i, "-d", (char*)NULL)) {
+      // -d: stay in the foreground and send logging to stderr
+      set_val_or_die(values, tracker, "fuse_debug", "true");
+      set_val_or_die(values, tracker, "daemonize", "false");
+      set_val_or_die(values, tracker, "log_file", "");
+      set_val_or_die(values, tracker, "log_to_stderr", "true");
+      set_val_or_die(values, tracker, "err_to_stderr", "true");
+      set_val_or_die(values, tracker, "log_to_syslog", "false");
+    }
+    // Some stuff that we wanted to give universal single-character options for
+    // Careful: you can burn through the alphabet pretty quickly by adding
+    // to this list.
+    else if (ceph_argparse_witharg(args, i, &val, "--monmap", "-M", (char*)NULL)) {
+      set_val_or_die(values, tracker, "monmap", val.c_str());
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--mon_host", "-m", (char*)NULL)) {
+      set_val_or_die(values, tracker, "mon_host", val.c_str());
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--bind", (char*)NULL)) {
+      set_val_or_die(values, tracker, "public_addr", val.c_str());
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--keyfile", "-K", (char*)NULL)) {
+      bufferlist bl;
+      string err;
+      int r;
+      // "-" selects stdin as the source of the key, otherwise `val` is a path
+      if (val == "-") {
+	r = bl.read_fd(STDIN_FILENO, 1024);
+      } else {
+	r = bl.read_file(val.c_str(), &err);
+      }
+      // NOTE: a failed read (r < 0) is silently ignored -- no key is set and
+      // `err` is never reported.
+      if (r >= 0) {
+	string k(bl.c_str(), bl.length());
+	set_val_or_die(values, tracker, "key", k.c_str());
+      }
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--keyring", "-k", (char*)NULL)) {
+      set_val_or_die(values, tracker, "keyring", val.c_str());
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--client_mountpoint", "-r", (char*)NULL)) {
+      set_val_or_die(values, tracker, "client_mountpoint", val.c_str());
+    }
+    else {
+      // generic "--<option>" handling; parse_option() advances `i` itself
+      // when nothing in the schema matches
+      int r = parse_option(values, tracker, args, i, NULL, level);
+      if (r < 0) {
+        return r;
+      }
+    }
+  }
+  // meta expands could have modified anything.  Copy it all out again.
+  update_legacy_vals(values);
+  return 0;
+}
+
+// Act on the --show_config / --show_config_value requests recorded by
+// parse_argv().  Either request terminates the process via _exit(): status 0
+// after printing, status 1 if the requested value cannot be rendered.
+void md_config_t::do_argv_commands(const ConfigValues& values) const
+{
+  if (do_show_config) {
+    _show_config(values, &std::cout, nullptr);
+    _exit(0);
+  }
+
+  if (do_show_config_value.empty()) {
+    return;
+  }
+
+  std::string rendered;
+  const int r = conf_stringify(
+    _get_val(values, do_show_config_value, nullptr, &std::cerr), &rendered);
+  if (r >= 0) {
+    std::cout << rendered << std::endl;
+    _exit(0);
+  }
+
+  std::cerr << "failed to get config option '"
+            << do_show_config_value << "': ";
+  if (r == -ENOENT) {
+    std::cerr << "option not found";
+  } else {
+    std::cerr << cpp_strerror(r);
+  }
+  std::cerr << std::endl;
+  _exit(1);
+}
+
+/**
+ * Try to interpret the argument at `i` as a setting for one schema option.
+ *
+ * For each option in the schema, in order, the following spellings are tried:
+ *   --default-<name> <value>   sets the value at level CONF_DEFAULT
+ *   --<name> / --no-<name>     boolean options only, set at `level`
+ *   --<name> <value>           every other option type, set at `level`
+ * The first spelling that matches ends the search.  If nothing in the schema
+ * matches, the argument is skipped by advancing `i`.
+ *
+ * @param values  configuration values to update
+ * @param tracker observer tracker forwarded to _set_val()
+ * @param args    argument vector being scanned
+ *                NOTE(review): the ceph_argparse_* helpers presumably erase
+ *                matched arguments from `args` -- confirm
+ * @param i       current position in `args`
+ * @param oss     if non-null, parse errors are reported here (worded for
+ *                injectargs); otherwise they go to cerr
+ * @param level   CONF_* level used for non "--default-" settings
+ * @return 0 when the value was set or nothing matched, otherwise the
+ *         negative error code
+ */
+int md_config_t::parse_option(ConfigValues& values,
+			      const ConfigTracker& tracker,
+			      std::vector<const char*>& args,
+			      std::vector<const char*>::iterator& i,
+			      ostream *oss,
+			      int level)
+{
+  int ret = 0;
+  // number of schema options tried without a match; equals schema.size()
+  // after the loop only if no option matched at all
+  size_t o = 0;
+  std::string val;
+
+  std::string option_name;
+  std::string error_message;
+  o = 0;
+  for (const auto& opt_iter: schema) {
+    const Option &opt = opt_iter.second;
+    ostringstream err;
+    std::string as_option("--");
+    as_option += opt.name;
+    // remembered for the error report below
+    option_name = opt.name;
+    if (ceph_argparse_witharg(
+	  args, i, &val, err,
+	  string(string("--default-") + opt.name).c_str(), (char*)NULL)) {
+      if (!err.str().empty()) {
+        error_message = err.str();
+	ret = -EINVAL;
+	break;
+      }
+      ret = _set_val(values, tracker,  val, opt, CONF_DEFAULT, &error_message);
+      break;
+    } else if (opt.type == Option::TYPE_BOOL) {
+      int res;
+      if (ceph_argparse_binary_flag(args, i, &res, oss, as_option.c_str(),
+				    (char*)NULL)) {
+	// res: 0 -> false, 1 -> true, anything else is passed through as the
+	// result code
+	if (res == 0)
+	  ret = _set_val(values, tracker, "false", opt, level, &error_message);
+	else if (res == 1)
+	  ret = _set_val(values, tracker, "true", opt, level, &error_message);
+	else
+	  ret = res;
+	break;
+      } else {
+	std::string no("--no-");
+	no += opt.name;
+	if (ceph_argparse_flag(args, i, no.c_str(), (char*)NULL)) {
+	  ret = _set_val(values, tracker, "false", opt, level, &error_message);
+	  break;
+	}
+      }
+    } else if (ceph_argparse_witharg(args, i, &val, err,
+                                     as_option.c_str(), (char*)NULL)) {
+      if (!err.str().empty()) {
+        error_message = err.str();
+	ret = -EINVAL;
+	break;
+      }
+      ret = _set_val(values, tracker,  val, opt, level, &error_message);
+      break;
+    }
+    // only reached when this option did not match (every match breaks above)
+    ++o;
+  }
+
+  if (ret < 0 || !error_message.empty()) {
+    ceph_assert(!option_name.empty());
+    if (oss) {
+      *oss << "Parse error setting " << option_name << " to '"
+           << val << "' using injectargs";
+      if (!error_message.empty()) {
+        *oss << " (" << error_message << ")";
+      }
+      *oss << ".\n";
+    } else {
+      cerr << "parse error setting '" << option_name << "' to '"
+	   << val << "'";
+      if (!error_message.empty()) {
+        cerr << " (" << error_message << ")";
+      }
+      cerr << "\n" << std::endl;
+    }
+  }
+
+  if (o == schema.size()) {
+    // ignore
+    ++i;
+  }
+  // non-negative results from _set_val() are collapsed to 0
+  return ret >= 0 ? 0 : ret;
+}
+
+// Run parse_option() at level CONF_OVERRIDE over `args` until the cursor
+// reaches the end.  Parsing continues past failures; the return value is 0
+// if every call succeeded, otherwise the error of the last failing call.
+int md_config_t::parse_injectargs(ConfigValues& values,
+                                  const ConfigTracker& tracker,
+                                  std::vector<const char*>& args,
+                                  std::ostream *oss)
+{
+  int last_error = 0;
+  auto cursor = args.begin();
+  while (cursor != args.end()) {
+    const int r = parse_option(values, tracker, args, cursor, oss,
+                               CONF_OVERRIDE);
+    if (r < 0) {
+      last_error = r;
+    }
+  }
+  return last_error;
+}
+
+// Raise the safe_to_start_threads flag.  While it is set, parse_argv()
+// returns -ENOSYS and _set_val() applies its runtime-update check.
+void md_config_t::set_safe_to_start_threads()
+{
+  this->safe_to_start_threads = true;
+}
+
+// Lower the safe_to_start_threads flag again.
+void md_config_t::_clear_safe_to_start_threads()
+{
+  this->safe_to_start_threads = false;
+}
+
+/**
+ * Split `s` on spaces and apply the resulting arguments as runtime
+ * overrides through parse_injectargs().
+ *
+ * Any argument still present in the vector afterwards is treated as
+ * unparsed: it is listed on `oss` (when provided) and the call fails with
+ * -EINVAL.  Legacy value fields are refreshed before returning.
+ *
+ * @param values  configuration values to update
+ * @param tracker observer tracker forwarded to the setters
+ * @param s       space-separated argument string
+ * @param oss     optional sink for diagnostics; may be null
+ * @return the result of parse_injectargs(), or -EINVAL if arguments were
+ *         left over
+ */
+int md_config_t::injectargs(ConfigValues& values,
+                            const ConfigTracker& tracker,
+                            const std::string& s, std::ostream *oss)
+{
+  // Mutable copy of the input that is tokenized in place.  A std::string is
+  // used instead of a variable-length array, which is not standard C++ and
+  // puts an input-sized buffer on the stack.
+  std::string buf(s);
+  std::vector<const char*> nargs;
+  char *p = buf.data();
+  while (*p) {
+    nargs.push_back(p);
+    while (*p && *p != ' ') p++;
+    if (!*p)
+      break;
+    // terminate this token, then skip the run of separating spaces
+    *p++ = 0;
+    while (*p && *p == ' ') p++;
+  }
+
+  int ret = parse_injectargs(values, tracker, nargs, oss);
+  if (!nargs.empty()) {
+    // parse_option() accepts a null stream, so it must not be dereferenced
+    // unconditionally here either
+    if (oss) {
+      *oss << " failed to parse arguments: ";
+      std::string prefix;
+      for (const char *arg : nargs) {
+        *oss << prefix << arg;
+        prefix = ",";
+      }
+      *oss << "\n";
+    }
+    ret = -EINVAL;
+  }
+  update_legacy_vals(values);
+  return ret;
+}
+
+// Set `key` to `val` via set_val() and assert that it succeeded.  On
+// failure the collected error text is written to stderr before the
+// assertion fires.
+void md_config_t::set_val_or_die(ConfigValues& values,
+                                 const ConfigTracker& tracker,
+                                 const std::string_view key,
+                                 const std::string &val)
+{
+  std::stringstream err;
+  const int ret = set_val(values, tracker, key, val, &err);
+  const bool failed = (ret != 0);
+  if (failed) {
+    std::cerr << "set_val_or_die(" << key << "): " << err.str();
+  }
+  ceph_assert(ret == 0);
+}
+
+/**
+ * Set option `key` to `val` at level CONF_OVERRIDE.
+ *
+ * @param key    option name; normalized with ConfFile::normalize_key_name()
+ * @param val    new value as text; a null pointer is rejected
+ * @param err_ss optional stream that receives either a confirmation
+ *               ("Set <name> to <value>") or the failure reason
+ * @return 0 on success, -EINVAL for an empty key or null value, -ENOENT if
+ *         the option is not in the schema, or the error from _set_val()
+ */
+int md_config_t::set_val(ConfigValues& values,
+                         const ConfigTracker& tracker,
+                         const std::string_view key, const char *val,
+                         std::stringstream *err_ss)
+{
+  if (key.empty()) {
+    if (err_ss) *err_ss << "No key specified";
+    return -EINVAL;
+  }
+  if (!val) {
+    return -EINVAL;
+  }
+
+  const std::string requested(val);
+  const std::string normalized(ConfFile::normalize_key_name(key));
+
+  const auto found = schema.find(normalized);
+  if (found == schema.end()) {
+    if (err_ss) *err_ss << "Configuration option not found: '" << key << "'";
+    return -ENOENT;
+  }
+
+  const Option &opt = found->second;
+  std::string error_message;
+  const int r = _set_val(values, tracker, requested, opt, CONF_OVERRIDE,
+                         &error_message);
+  if (r < 0) {
+    if (err_ss) *err_ss << error_message;
+    return r;
+  }
+  if (err_ss) *err_ss << "Set " << opt.name << " to " << requested;
+  return 0;
+}
+
+// Remove the CONF_OVERRIDE-level value of `key`; see _rm_val() for the
+// return codes.
+int md_config_t::rm_val(ConfigValues& values, const std::string_view key)
+{
+  const int override_level = CONF_OVERRIDE;
+  return _rm_val(values, key, override_level);
+}
+
+// Copy the encoded defaults into `*bl`.  The encoding is built on first use
+// and cached in defaults_bl: an option count followed by (name, value)
+// string pairs, where the value is the CONF_DEFAULT-level setting if one
+// exists and the schema default otherwise.
+void md_config_t::get_defaults_bl(const ConfigValues& values,
+                                  bufferlist *bl)
+{
+  const bool cache_is_empty = (defaults_bl.length() == 0);
+  if (cache_is_empty) {
+    uint32_t count = 0;
+    bufferlist payload;
+    for (auto it = schema.begin(); it != schema.end(); ++it) {
+      const Option &opt = it->second;
+      ++count;
+      encode(opt.name, payload);
+      auto [value, found] = values.get_value(opt.name, CONF_DEFAULT);
+      if (!found) {
+        std::string rendered;
+        conf_stringify(_get_val_default(opt), &rendered);
+        encode(rendered, payload);
+      } else {
+        encode(Option::to_str(value), payload);
+      }
+    }
+    // the count goes first, the entries follow
+    encode(count, defaults_bl);
+    defaults_bl.claim_append(payload);
+  }
+  *bl = defaults_bl;
+}
+
+// Hand out the encoded current values.  The encoding is rebuilt (and
+// values_bl_version bumped) only while the values_bl cache is empty.
+// `*bl` and `*got_version` are written only when `have_version` differs
+// from the cached version.
+void md_config_t::get_config_bl(
+  const ConfigValues& values,
+  uint64_t have_version,
+  bufferlist *bl,
+  uint64_t *got_version)
+{
+  // "fsid" and "host" are left out of the encoding
+  const auto is_excluded = [](const auto& key) {
+    return key == "fsid" || key == "host";
+  };
+
+  if (values_bl.length() == 0) {
+    uint32_t count = 0;
+    bufferlist body;
+    values.for_each([&](auto& name, auto& configs) {
+      if (is_excluded(name)) {
+        return;
+      }
+      ++count;
+      encode(name, body);
+      encode((uint32_t)configs.size(), body);
+      for (auto& entry : configs) {
+        encode(entry.first, body);
+        encode(Option::to_str(entry.second), body);
+      }
+    });
+    // make sure overridden items appear, and include the default value
+    for (auto& ignored : ignored_mon_values) {
+      if (values.contains(ignored.first) || is_excluded(ignored.first)) {
+        continue;
+      }
+      const Option *opt = find_option(ignored.first);
+      if (!opt) {
+        continue;
+      }
+      ++count;
+      encode(ignored.first, body);
+      encode((uint32_t)1, body);
+      encode((int32_t)CONF_DEFAULT, body);
+      std::string rendered;
+      conf_stringify(_get_val_default(*opt), &rendered);
+      encode(rendered, body);
+    }
+    encode(count, values_bl);
+    values_bl.claim_append(body);
+    encode(ignored_mon_values, values_bl);
+    ++values_bl_version;
+  }
+
+  if (have_version == values_bl_version) {
+    return;
+  }
+  *bl = values_bl;
+  *got_version = values_bl_version;
+}
+
+// Return the default value of `key` rendered as text, or std::nullopt when
+// the option is unknown or its default cannot be stringified.
+std::optional<std::string> md_config_t::get_val_default(std::string_view key)
+{
+  const Option *opt = find_option(key);
+  if (!opt) {
+    return std::nullopt;
+  }
+  std::string val;
+  if (conf_stringify(_get_val_default(*opt), &val) != 0) {
+    return std::nullopt;
+  }
+  return std::make_optional(std::move(val));
+}
+
+// C-string flavour of get_val(): normalizes `key` and forwards to
+// _get_val_cstr(), which documents the `buf` / `len` contract.
+int md_config_t::get_val(const ConfigValues& values,
+                         const std::string_view key, char **buf, int len) const
+{
+  const std::string normalized(ConfFile::normalize_key_name(key));
+  return _get_val_cstr(values, normalized, buf, len);
+}
+
+// Fetch the value of `key` and render it into `*val`; returns whatever
+// conf_stringify() returns.
+int md_config_t::get_val(
+  const ConfigValues& values,
+  const std::string_view key,
+  std::string *val) const
+{
+  const Option::value_t current = get_val_generic(values, key);
+  return conf_stringify(current, val);
+}
+
+// Public entry point returning the value of `key` as an Option::value_t;
+// a thin wrapper around the key-based _get_val().
+Option::value_t md_config_t::get_val_generic(
+  const ConfigValues& values,
+  const std::string_view key) const
+{
+  Option::value_t result = _get_val(values, key);
+  return result;
+}
+
+// Look up `key` (after name normalization) and return its meta-expanded
+// value.  An empty key or a name that is not in the schema yields a
+// default-constructed value_t.
+Option::value_t md_config_t::_get_val(
+  const ConfigValues& values,
+  const std::string_view key,
+  expand_stack_t *stack,
+  std::ostream *err) const
+{
+  if (key.empty()) {
+    return Option::value_t{};
+  }
+
+  // In key names, leading and trailing whitespace are not significant.
+  const std::string normalized(ConfFile::normalize_key_name(key));
+
+  const Option *opt = find_option(normalized);
+  // a null result means this is not a valid config option
+  return opt ? _get_val(values, *opt, stack, err) : Option::value_t{};
+}
+
+// Return the value of option `o` with metavariables expanded.  When the
+// caller supplies no expansion stack, a fresh local one is used for this
+// call.
+Option::value_t md_config_t::_get_val(
+  const ConfigValues& values,
+  const Option& o,
+  expand_stack_t *stack,
+  std::ostream *err) const
+{
+  expand_stack_t local_stack;
+  expand_stack_t *const active_stack = stack ? stack : &local_stack;
+  const Option::value_t raw = _get_val_nometa(values, o);
+  return _expand_meta(values, raw, &o, active_stack, err);
+}
+
+// Return the stored value of `o` without meta expansion, falling back to
+// the option's default when `values` holds nothing for it.
+Option::value_t md_config_t::_get_val_nometa(const ConfigValues& values,
+                                             const Option& o) const
+{
+  auto [value, found] = values.get_value(o.name, -1);
+  if (!found) {
+    return _get_val_default(o);
+  }
+  return value;
+}
+
+// Pick the default for `o`: the daemon-specific default when this is a
+// daemon and such a default is set (i.e. differs from an empty value_t),
+// the regular default otherwise.
+const Option::value_t& md_config_t::_get_val_default(const Option& o) const
+{
+  const bool has_daemon_default = (o.daemon_value != Option::value_t{});
+  return (is_daemon && has_daemon_default) ? o.daemon_value : o.value;
+}
+
+// Expand metavariables in the free-standing string `val` in place.  No
+// owning Option is passed to _expand_meta(); diagnostics go to `err`.
+void md_config_t::early_expand_meta(
+  const ConfigValues& values,
+  std::string &val,
+  std::ostream *err) const
+{
+  expand_stack_t stack;
+  const Option::value_t raw(val);
+  const Option::value_t expanded =
+    _expand_meta(values, raw, nullptr, &stack, err);
+  conf_stringify(expanded, &val);
+}
+
+// Re-run _refresh() for every option queued in may_reexpand_meta.  The
+// queue is swapped out first, so the return value reports whether new
+// entries were queued during the refresh.
+bool md_config_t::finalize_reexpand_meta(ConfigValues& values,
+                                         const ConfigTracker& tracker)
+{
+  std::vector<std::string> pending;
+  std::swap(pending, may_reexpand_meta);
+
+  for (const auto& name : pending) {
+    // always refresh the options if they are in the may_reexpand_meta
+    // map, because the options may have already been expanded with old
+    // meta.
+    const auto &opt_iter = schema.find(name);
+    ceph_assert(opt_iter != schema.end());
+    _refresh(values, opt_iter->second);
+  }
+
+  const bool queued_again = !may_reexpand_meta.empty();
+  return queued_again;
+}
+
+/**
+ * Expand $var / ${var} metavariables in a string value.
+ *
+ * Non-string values, strings without '$' and calls without a stack are
+ * returned unchanged.  Variable names consist of [a-z_].  The built-in
+ * names type, cluster, name, host, num, id, pid, cctid, home and
+ * programdata are substituted directly; any other name is looked up as a
+ * config option ("data_dir" is first mapped to data_dir_option) and
+ * expanded recursively.  An unknown name is copied through verbatim.
+ *
+ * @param values configuration values used for the substitutions
+ * @param in     value to expand
+ * @param o      option that owns `in`, or nullptr for a free-standing
+ *               string; used for loop detection and $pid re-expansion
+ * @param stack  options currently being expanded, used to detect cycles
+ * @param err    optional stream for the expansion-loop diagnostic
+ * @return the expanded value; when a substitution loop is detected the
+ *         result is "$<name>" of the option that closed the loop
+ */
+Option::value_t md_config_t::_expand_meta(
+  const ConfigValues& values,
+  const Option::value_t& in,
+  const Option *o,
+  expand_stack_t *stack,
+  std::ostream *err) const
+{
+  //cout << __func__ << " in '" << in << "' stack " << stack << std::endl;
+  if (!stack) {
+    return in;
+  }
+  const auto str = std::get_if<std::string>(&in);
+  if (!str) {
+    // strings only!
+    return in;
+  }
+
+  auto pos = str->find('$');
+  if (pos == std::string::npos) {
+    // no substitutions!
+    return in;
+  }
+
+  if (o) {
+    stack->push_back(make_pair(o, &in));
+  }
+  string out;
+  decltype(pos) last_pos = 0;
+  while (pos != std::string::npos) {
+    ceph_assert((*str)[pos] == '$');
+    if (pos > last_pos) {
+      out += str->substr(last_pos, pos - last_pos);
+    }
+
+    // try to parse the variable name into var, either \$\{(.+)\} or
+    // \$[a-z\_]+
+    const char *valid_chars = "abcdefghijklmnopqrstuvwxyz_";
+    string var;
+    size_t endpos = 0;
+    if ((*str)[pos+1] == '{') {
+      // ...${foo_bar}...
+      endpos = str->find_first_not_of(valid_chars, pos + 2);
+      if (endpos != std::string::npos &&
+          (*str)[endpos] == '}') {
+        var = str->substr(pos + 2, endpos - pos - 2);
+        endpos++;
+      }
+    } else {
+      // ...$foo...
+      endpos = str->find_first_not_of(valid_chars, pos + 1);
+      if (endpos != std::string::npos)
+        var = str->substr(pos + 1, endpos - pos - 1);
+      else
+        var = str->substr(pos + 1);
+    }
+    last_pos = endpos;
+
+    if (!var.size()) {
+      out += '$';
+    } else {
+      //cout << " found var " << var << std::endl;
+      // special metavariable?
+      if (var == "type") {
+        out += values.name.get_type_name();
+      } else if (var == "cluster") {
+        out += values.cluster;
+      } else if (var == "name") {
+        out += values.name.to_cstr();
+      } else if (var == "host") {
+        if (values.host == "") {
+          out += ceph_get_short_hostname();
+        } else {
+          out += values.host;
+        }
+      } else if (var == "num") {
+        out += values.name.get_id().c_str();
+      } else if (var == "id") {
+        out += values.name.get_id();
+      } else if (var == "pid") {
+        char *_pid = getenv("PID");
+        if (_pid) {
+          out += _pid;
+        } else {
+          out += stringify(getpid());
+        }
+        // queue the owning option so finalize_reexpand_meta() refreshes it
+        if (o) {
+          may_reexpand_meta.push_back(o->name);
+        }
+      } else if (var == "cctid") {
+        out += stringify((unsigned long long)this);
+      } else if (var == "home") {
+        // append like every other metavariable; assigning here would throw
+        // away everything expanded before "$home"
+        const char *home = getenv("HOME");
+        if (home) {
+          out += home;
+        }
+      } else if (var == "programdata") {
+        const char *program_data = getenv("ProgramData");
+        if (program_data) {
+          out += program_data;
+        }
+      } else {
+        if (var == "data_dir") {
+          var = data_dir_option;
+        }
+        // named distinctly from the parameter `o`, which is the option
+        // that owns the string being expanded
+        const Option *ref = find_option(var);
+        if (!ref) {
+          out += str->substr(pos, endpos - pos);
+        } else {
+          auto match = std::find_if(
+            stack->begin(), stack->end(),
+            [ref](pair<const Option *,const Option::value_t*>& item) {
+              return item.first == ref;
+            });
+          if (match != stack->end()) {
+            // substitution loop; break the cycle
+            if (err) {
+              *err << "variable expansion loop at " << var << "="
+                   << Option::to_str(*match->second) << "\n"
+                   << "expansion stack:\n";
+              for (auto i = stack->rbegin(); i != stack->rend(); ++i) {
+                *err << i->first->name << "="
+                     << Option::to_str(*i->second) << "\n";
+              }
+            }
+            // undo our own push so the caller's stack stays balanced; a
+            // leftover entry would make later expansions report loops that
+            // do not exist
+            if (o) {
+              stack->pop_back();
+            }
+            return Option::value_t(std::string("$") + ref->name);
+          } else {
+            // recursively evaluate!
+            string n;
+            conf_stringify(_get_val(values, *ref, stack, err), &n);
+            out += n;
+          }
+        }
+      }
+    }
+    pos = str->find('$', last_pos);
+  }
+  if (last_pos != std::string::npos) {
+    out += str->substr(last_pos);
+  }
+  if (o) {
+    stack->pop_back();
+  }
+
+  return Option::value_t(out);
+}
+
+/**
+ * Copy the value of `key` into a C string.
+ *
+ * With len == -1 a buffer is malloc()ed and stored in `*buf`.  Otherwise
+ * the value is written into the existing `*buf` with snprintf() bounded
+ * by `len`.
+ *
+ * @return 0 on success, -EINVAL for an empty key, -ENOMEM if the
+ *         allocation fails, -ENAMETOOLONG if the value (plus terminator)
+ *         does not fit in `len`, -ENOENT if the value cannot be rendered
+ */
+int md_config_t::_get_val_cstr(
+  const ConfigValues& values,
+  const std::string& key, char **buf, int len) const
+{
+  if (key.empty()) {
+    return -EINVAL;
+  }
+
+  std::string val;
+  if (conf_stringify(_get_val(values, key), &val) != 0) {
+    // couldn't find a configuration option with key 'k'
+    return -ENOENT;
+  }
+
+  // bytes needed including the terminating NUL
+  const int needed = val.length() + 1;
+  if (len == -1) {
+    *buf = (char*)malloc(needed);
+    if (!*buf) {
+      return -ENOMEM;
+    }
+    strncpy(*buf, val.c_str(), needed);
+    return 0;
+  }
+
+  snprintf(*buf, len, "%s", val.c_str());
+  if (needed > len) {
+    return -ENAMETOOLONG;
+  }
+  return 0;
+}
+
+// Replace the contents of `*keys` with every option name in the schema.
+// Boolean options additionally contribute a "no_"-prefixed name.
+void md_config_t::get_all_keys(std::vector<std::string> *keys) const {
+  const std::string negative_flag_prefix("no_");
+
+  keys->clear();
+  keys->reserve(schema.size());
+  for (auto it = schema.begin(); it != schema.end(); ++it) {
+    const Option &opt = it->second;
+    keys->push_back(opt.name);
+    if (opt.type != Option::TYPE_BOOL) {
+      continue;
+    }
+    keys->push_back(negative_flag_prefix + opt.name);
+  }
+}
+
+/* Section order matters.  The vector is sorted from highest to lowest
+ * priority: lookups stop at the first section that has the key, and the
+ * last entry is consulted only when every earlier one had nothing.  That
+ * last entry should always be the global section.
+ */
+std::vector <std::string>
+md_config_t::get_my_sections(const ConfigValues& values) const
+{
+  std::vector<std::string> sections;
+  sections.reserve(3);
+  sections.emplace_back(values.name.to_str());
+  sections.emplace_back(values.name.get_type_name().data());
+  sections.emplace_back("global");
+  return sections;
+}
+
+// Append the name of every section of the parsed conf file (`cf`) to
+// `sections`.  Existing entries are kept.  Always returns 0.
+int md_config_t::get_all_sections(std::vector <std::string> &sections) const
+{
+  // bind by reference: a by-value binding copies each section while
+  // iterating even though only the name is used
+  for (const auto& [section_name, section] : cf) {
+    sections.push_back(section_name);
+    std::ignore = section;
+  }
+  return 0;
+}
+
+// Read `key` from the conf file, searching `sections` in order, and store
+// the result in `out`.  When `emeta` is true, metavariables in the result
+// are expanded.  Returns 0 on success or the negative error from
+// _get_val_from_conf_file().
+int md_config_t::get_val_from_conf_file(
+  const ConfigValues& values,
+  const std::vector <std::string> &sections,
+  const std::string_view key,
+  std::string &out,
+  bool emeta) const
+{
+  const int r = _get_val_from_conf_file(sections, key, out);
+  if (r < 0) {
+    return r;
+  }
+  if (!emeta) {
+    return 0;
+  }
+
+  expand_stack_t stack;
+  const Option::value_t raw(out);
+  const auto expanded = _expand_meta(values, raw, nullptr, &stack, nullptr);
+  conf_stringify(expanded, &out);
+  return 0;
+}
+
+// Try cf.read() for `key` in each section in turn.  The first result that
+// is not -ENOENT (success or a different error) is returned; -ENOENT is
+// returned when no section has the key.
+int md_config_t::_get_val_from_conf_file(
+  const std::vector <std::string> &sections,
+  const std::string_view key,
+  std::string &out) const
+{
+  for (const auto &section : sections) {
+    const int ret = cf.read(section, key, out);
+    if (ret != -ENOENT) {
+      // covers both the success case (0) and any other error
+      return ret;
+    }
+  }
+  return -ENOENT;
+}
+
+/**
+ * Parse `raw_val` for option `opt` and store it at `level`.
+ *
+ * Once safe_to_start_threads is set, an option that cannot be updated at
+ * runtime and has no observer tracking it is accepted only if the new
+ * value equals the current (non-expanded) one.
+ *
+ * @param error_message must be non-null; receives the failure reason
+ * @return the ConfigValues::SET_* result of the store on success, the
+ *         negative error from Option::parse_value(), or -EPERM when the
+ *         change is not allowed at runtime
+ */
+int md_config_t::_set_val(
+  ConfigValues& values,
+  const ConfigTracker& observers,
+  const std::string &raw_val,
+  const Option &opt,
+  int level,
+  std::string *error_message)
+{
+  Option::value_t new_value;
+  ceph_assert(error_message);
+  const int r = opt.parse_value(raw_val, &new_value, error_message);
+  if (r < 0) {
+    return r;
+  }
+
+  // unsafe runtime change?
+  const bool runtime_locked = !opt.can_update_at_runtime() &&
+                              safe_to_start_threads &&
+                              !observers.is_tracking(opt.name);
+  // accept value if it is not actually a change
+  if (runtime_locked && new_value != _get_val_nometa(values, opt)) {
+    *error_message = string("Configuration option '") + opt.name +
+      "' may not be modified at runtime";
+    return -EPERM;
+  }
+
+  // Apply the value to its entry in the `values` map
+  const auto result = values.set_value(opt.name, std::move(new_value), level);
+  const bool has_effect = (result == ConfigValues::SET_HAVE_EFFECT);
+  if (has_effect || result == ConfigValues::SET_NO_EFFECT) {
+    // the cached encoding of the values is stale now
+    values_bl.clear();
+  }
+  if (has_effect) {
+    _refresh(values, opt);
+  }
+  return result;
+}
+
+// Propagate the current value of `opt`: update its legacy field if it has
+// one, then either apply it as a logging setting (options with a subsys)
+// or record the option name in values.changed.
+void md_config_t::_refresh(ConfigValues& values, const Option& opt)
+{
+  // Apply the value to its legacy field, if it has one
+  if (auto legacy = legacy_values.find(std::string(opt.name));
+      legacy != legacy_values.end()) {
+    update_legacy_val(values, opt, legacy->second);
+  }
+
+  if (opt.subsys < 0) {
+    // normal option, advertise the change.
+    values.changed.insert(opt.name);
+    return;
+  }
+
+  // a debug_* option: hand the effective value to the logging setup
+  std::string actual_val;
+  conf_stringify(_get_val(values, opt), &actual_val);
+  values.set_logging(opt.subsys, actual_val.c_str());
+}
+
+// Remove the value stored for `key` at `level`.  Returns -EINVAL when the
+// key is not in the schema, the negative error from ConfigValues::rm_val()
+// on failure, and 0 otherwise.  The option is refreshed when the removal
+// had an effect, and the cached values encoding is dropped.
+int md_config_t::_rm_val(ConfigValues& values,
+                         const std::string_view key,
+                         int level)
+{
+  const bool known = (schema.count(key) != 0);
+  if (!known) {
+    return -EINVAL;
+  }
+
+  const auto outcome = values.rm_val(std::string{key}, level);
+  if (outcome < 0) {
+    return outcome;
+  }
+  if (outcome == ConfigValues::SET_HAVE_EFFECT) {
+    _refresh(values, *find_option(key));
+  }
+  values_bl.clear();
+  return 0;
+}
+
+namespace {
+/**
+ * Visitor that reads a value_t as an integer of type Size: size_t
+ * alternatives yield their `value`, a Size alternative yields itself, and
+ * every other alternative yields -1.
+ */
+template<typename Size>
+struct get_size_visitor
+{
+  get_size_visitor() {}
+
+  Size operator()(const Size& v) const {
+    return v;
+  }
+  Size operator()(const Option::size_t& sz) const {
+    return static_cast<Size>(sz.value);
+  }
+  template<typename T>
+  Size operator()(const T&) const {
+    return -1;
+  }
+};
+
+/**
+ * Handles assigning from a variant-of-types to a variant-of-pointers-to-types
+ */
+class assign_visitor
+{
+  ConfigValues *conf;
+  Option::value_t val;
+
+  // shared body of the two integer overloads: store the value as read
+  // through get_size_visitor<T>
+  template <typename T>
+  void assign_size(T ConfigValues::* ptr) const
+  {
+    T &member = conf->*ptr;
+    member = std::visit(get_size_visitor<T>{}, val);
+  }
+
+  public:
+
+  assign_visitor(ConfigValues *conf_, Option::value_t val_)
+    : conf(conf_), val(val_)
+  {}
+
+  // generic case: the variant must currently hold exactly a T
+  template <typename T>
+  void operator()(T ConfigValues::* ptr) const
+  {
+    T &member = conf->*ptr;
+    member = std::get<T>(val);
+  }
+  void operator()(uint64_t ConfigValues::* ptr) const
+  {
+    assign_size<uint64_t>(ptr);
+  }
+  void operator()(int64_t ConfigValues::* ptr) const
+  {
+    assign_size<int64_t>(ptr);
+  }
+};
+} // anonymous namespace
+
+// Refresh every legacy field listed in legacy_values from the current
+// configuration.
+void md_config_t::update_legacy_vals(ConfigValues& values)
+{
+  for (auto it = legacy_values.begin(); it != legacy_values.end(); ++it) {
+    const Option &option = schema.at(it->first);
+    update_legacy_val(values, option, it->second);
+  }
+}
+
+// Store the current (meta-expanded) value of `opt` into the ConfigValues
+// member selected by `member_ptr`.
+void md_config_t::update_legacy_val(ConfigValues& values,
+                                    const Option &opt,
+                                    md_config_t::member_ptr_t member_ptr)
+{
+  const assign_visitor assigner(&values, _get_val(values, opt));
+  std::visit(assigner, member_ptr);
+}
+
+// Dump one value to `f` under the name of its config level.  bool, int64_t,
+// uint64_t and double use the matching typed Formatter call; every other
+// alternative is written as its Option::to_str() text.
+static void dump(Formatter *f, int level, Option::value_t in)
+{
+  const char *label = ceph_conf_level_name(level);
+  if (std::holds_alternative<bool>(in)) {
+    f->dump_bool(label, std::get<bool>(in));
+  } else if (std::holds_alternative<int64_t>(in)) {
+    f->dump_int(label, std::get<int64_t>(in));
+  } else if (std::holds_alternative<uint64_t>(in)) {
+    f->dump_unsigned(label, std::get<uint64_t>(in));
+  } else if (std::holds_alternative<double>(in)) {
+    f->dump_float(label, std::get<double>(in));
+  } else {
+    f->dump_stream(label) << Option::to_str(in);
+  }
+}
+
+// For every option that has at least one explicitly stored value, dump an
+// object section holding the compiled-in default (unless a CONF_DEFAULT
+// override is stored), each stored level, and the final effective value.
+// The `name` parameter is not used by this implementation.
+void md_config_t::diff(
+  const ConfigValues& values,
+  Formatter *f,
+  string name) const
+{
+  values.for_each([this, f, &values] (auto& opt_name, auto& configs) {
+    if (configs.empty()) {
+      return;
+    }
+    f->open_object_section(std::string{opt_name}.c_str());
+    const Option *opt = find_option(opt_name);
+    const bool default_overridden =
+      (configs.begin()->first == CONF_DEFAULT);
+    if (!default_overridden) {
+      // show compiled-in default only if an override default wasn't provided
+      dump(f, CONF_DEFAULT, _get_val_default(*opt));
+    }
+    for (auto& level_and_value : configs) {
+      dump(f, level_and_value.first, level_and_value.second);
+    }
+    dump(f, CONF_FINAL, _get_val(values, *opt));
+    f->close_section();
+  });
+}
+
+// Forward the stored parse_error to the free function of the same name.
+void md_config_t::complain_about_parse_error(CephContext *cct)
+{
+  const auto& stored_error = parse_error;
+  ::complain_about_parse_error(cct, stored_error);
+}
diff --git a/src/common/config.h b/src/common/config.h
new file mode 100644
index 000000000..bafac6631
--- /dev/null
+++ b/src/common/config.h
@@ -0,0 +1,374 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFIG_H
+#define CEPH_CONFIG_H
+
+#include <map>
+#include <variant>
+#include <boost/container/small_vector.hpp>
+#include "common/ConfUtils.h"
+#include "common/code_environment.h"
+#include "log/SubsystemMap.h"
+#include "common/options.h"
+#include "common/subsys_types.h"
+#include "common/config_tracker.h"
+#include "common/config_values.h"
+#include "include/common_fwd.h"
+
+// Levels at which a configuration value can be recorded.
+// ConfigValues::set_value() treats a numerically greater level as the
+// higher-priority one; ceph_conf_level_name() yields the name used when a
+// level is dumped.
+enum {
+  CONF_DEFAULT,
+  CONF_MON,
+  CONF_FILE,
+  CONF_ENV,
+  CONF_CMDLINE,
+  CONF_OVERRIDE,
+  CONF_FINAL
+};
+
+extern const char *ceph_conf_level_name(int level);
+
+/** This class represents the current Ceph configuration.
+ *
+ * For Ceph daemons, this is the daemon configuration.  Log levels, caching
+ * settings, btrfs settings, and so forth can all be found here.  For libcephfs
+ * and librados users, this is the configuration associated with their context.
+ *
+ * For information about how this class is loaded from a configuration file,
+ * see common/ConfUtils.
+ *
+ * ACCESS
+ *
+ * There are 3 ways to read the ceph context-- the old way and two new ways.
+ * In the old way, code would simply read the public variables of the
+ * configuration, without taking a lock. In the new way #1, code registers a
+ * configuration observer which receives callbacks when a value changes. These
+ * callbacks take place under the md_config_t lock. Alternatively one can use
+ * get_val(const char *name) method to safely get a copy of the value.
+ *
+ * To prevent serious problems resulting from thread-safety issues, we disallow
+ * changing std::string configuration values after
+ * md_config_t::safe_to_start_threads becomes true. You can still
+ * change integer or floating point values, and the option declared with
+ * SAFE_OPTION macro. Notice the latter options can not be read directly
+ * (conf->foo), one should use either observers or get_val() method
+ * (conf->get_val("foo")).
+ *
+ * FIXME: really we shouldn't allow changing integer or floating point values
+ * while another thread is reading them, either.
+ */
+struct md_config_t {
+public:
+  typedef std::variant<int64_t ConfigValues::*,
+                       uint64_t ConfigValues::*,
+                       std::string ConfigValues::*,
+                       double ConfigValues::*,
+                       bool ConfigValues::*,
+                       entity_addr_t ConfigValues::*,
+                       entity_addrvec_t ConfigValues::*,
+                       uuid_d ConfigValues::*> member_ptr_t;
+
+  // For use when intercepting configuration updates
+  typedef std::function<bool(
+      const std::string &k, const std::string &v)> config_callback;
+
+  /// true if we are a daemon (as per CephContext::code_env)
+  const bool is_daemon;
+
+  /*
+   * Mapping from legacy config option names to class members
+   */
+  std::map<std::string_view, member_ptr_t> legacy_values;
+
+  /**
+   * The configuration schema, in the form of Option objects describing
+   * possible settings.
+   */
+  std::map<std::string_view, const Option&> schema;
+
+  /// values from mon that we failed to set
+  std::map<std::string,std::string> ignored_mon_values;
+
+  /// original raw values saved that may need to re-expand at certain time
+  mutable std::vector<std::string> may_reexpand_meta;
+
+  /// encoded, cached copy of values + ignored_mon_values
+  ceph::bufferlist values_bl;
+
+  /// version for values_bl; increments each time there is a change
+  uint64_t values_bl_version = 0;
+
+  /// encoded copy of defaults (map<string,string>)
+  ceph::bufferlist defaults_bl;
+
+  // Create a new md_config_t structure.
+  explicit md_config_t(ConfigValues& values,
+		       const ConfigTracker& tracker,
+		       bool is_daemon=false);
+  ~md_config_t();
+
+  // Parse a config file
+  int parse_config_files(ConfigValues& values, const ConfigTracker& tracker,
+			 const char *conf_files,
+			 std::ostream *warnings, int flags);
+  int parse_buffer(ConfigValues& values, const ConfigTracker& tracker,
+		   const char* buf, size_t len,
+		   std::ostream *warnings);
+  void update_legacy_vals(ConfigValues& values);
+  // Absorb config settings from the environment
+  void parse_env(unsigned entity_type,
+		 ConfigValues& values, const ConfigTracker& tracker,
+		 const char *env_var = "CEPH_ARGS");
+
+  // Absorb config settings from argv
+  int parse_argv(ConfigValues& values, const ConfigTracker& tracker,
+		 std::vector<const char*>& args, int level=CONF_CMDLINE);
+
+  // do any commands we got from argv (--show-config, --show-config-val)
+  void do_argv_commands(const ConfigValues& values) const;
+
+  bool _internal_field(const std::string& k);
+
+  void set_safe_to_start_threads();
+  void _clear_safe_to_start_threads();  // this is only used by the unit test
+
+  /// Look up an option in the schema
+  const Option *find_option(const std::string_view name) const;
+
+  /// Set a default value
+  void set_val_default(ConfigValues& values,
+		       const ConfigTracker& tracker,
+		       const std::string_view key, const std::string &val);
+
+  /// Set values received from the mon
+  int set_mon_vals(CephContext *cct,
+		   ConfigValues& values,
+		   const ConfigTracker& tracker,
+		   const std::map<std::string,std::string, std::less<>>& kv,
+		   config_callback config_cb);
+
+  // Called by the Ceph daemons to make configuration changes at runtime
+  int injectargs(ConfigValues& values,
+		 const ConfigTracker& tracker,
+		 const std::string &s,
+		 std::ostream *oss);
+
+  // Set a configuration value, or crash
+  // Metavariables will be expanded.
+  void set_val_or_die(ConfigValues& values, const ConfigTracker& tracker,
+		      const std::string_view key, const std::string &val);
+
+  // Set a configuration value.
+  // Metavariables will be expanded.
+  int set_val(ConfigValues& values, const ConfigTracker& tracker,
+	      const std::string_view key, const char *val,
+              std::stringstream *err_ss=nullptr);
+  int set_val(ConfigValues& values, const ConfigTracker& tracker,
+	      const std::string_view key, const std::string& s,
+              std::stringstream *err_ss=nullptr) {
+    return set_val(values, tracker, key, s.c_str(), err_ss);
+  }
+
+  /// clear override value
+  int rm_val(ConfigValues& values, const std::string_view key);
+
+  /// get encoded map<string,map<int32_t,string>> of entire config
+  void get_config_bl(const ConfigValues& values,
+		     uint64_t have_version,
+		     ceph::buffer::list *bl,
+		     uint64_t *got_version);
+
+  /// get encoded map<string,string> of compiled-in defaults
+  void get_defaults_bl(const ConfigValues& values, ceph::buffer::list *bl);
+
+  /// Get the default value of a configuration option
+  std::optional<std::string> get_val_default(std::string_view key);
+
+  // Get a configuration value.
+  // No metavariables will be returned (they will have already been expanded)
+  int get_val(const ConfigValues& values, const std::string_view key, char **buf, int len) const;
+  int get_val(const ConfigValues& values, const std::string_view key, std::string *val) const;
+  template<typename T> const T get_val(const ConfigValues& values, const std::string_view key) const;
+  // Look up @p key, extract its value as type T and invoke
+  // cb(value, args...), returning whatever the callback returns.
+  // The value is handed over as const T& and only lives for the duration of
+  // the call.  std::get<T> throws std::bad_variant_access when the option
+  // does not hold a T.
+  // The return type is spelled with std::invoke_result_t because
+  // std::result_of is deprecated in C++17 and removed in C++20.
+  template<typename T, typename Callback, typename...Args>
+  auto with_val(const ConfigValues& values, const std::string_view key,
+		Callback&& cb, Args&&... args) const ->
+    std::invoke_result_t<Callback, const T&, Args...> {
+    return std::forward<Callback>(cb)(
+      std::get<T>(this->get_val_generic(values, key)),
+      std::forward<Args>(args)...);
+  }
+
+  void get_all_keys(std::vector<std::string> *keys) const;
+
+  // Return a list of all the sections that the current entity is a member of.
+  std::vector<std::string> get_my_sections(const ConfigValues& values) const;
+
+  // Return a list of all sections
+  int get_all_sections(std::vector <std::string> &sections) const;
+
+  // Get a value from the configuration file that we read earlier.
+  // Metavariables will be expanded if emeta is true.
+  int get_val_from_conf_file(const ConfigValues& values,
+		   const std::vector <std::string> &sections,
+		   const std::string_view key, std::string &out, bool emeta) const;
+
+  /// dump all config values to a stream
+  void show_config(const ConfigValues& values, std::ostream& out) const;
+  /// dump all config values to a formatter
+  void show_config(const ConfigValues& values, ceph::Formatter *f) const;
+
+  /// dump all config settings to a formatter
+  void config_options(ceph::Formatter *f) const;
+
+  /// dump config diff from default, conf, mon, etc.
+  void diff(const ConfigValues& values,
+	    ceph::Formatter *f,
+	    std::string name = {}) const;
+
+  /// print/log warnings/errors from parsing the config
+  void complain_about_parse_error(CephContext *cct);
+
+private:
+  // we use this to avoid variable expansion loops
+  typedef boost::container::small_vector<std::pair<const Option*,
+						   const Option::value_t*>,
+					 4> expand_stack_t;
+
+  void validate_schema();
+  void validate_default_settings();
+
+  Option::value_t get_val_generic(const ConfigValues& values,
+				  const std::string_view key) const;
+  int _get_val_cstr(const ConfigValues& values,
+		    const std::string& key, char **buf, int len) const;
+  Option::value_t _get_val(const ConfigValues& values,
+			   const std::string_view key,
+			   expand_stack_t *stack=0,
+			   std::ostream *err=0) const;
+  Option::value_t _get_val(const ConfigValues& values,
+			   const Option& o,
+			   expand_stack_t *stack=0,
+			   std::ostream *err=0) const;
+  const Option::value_t& _get_val_default(const Option& o) const;
+  Option::value_t _get_val_nometa(const ConfigValues& values,
+				  const Option& o) const;
+
+  int _rm_val(ConfigValues& values, const std::string_view key, int level);
+
+  void _refresh(ConfigValues& values, const Option& opt);
+
+  void _show_config(const ConfigValues& values,
+		    std::ostream *out, ceph::Formatter *f) const;
+
+  int _get_val_from_conf_file(const std::vector<std::string> &sections,
+			      const std::string_view key, std::string &out) const;
+
+  int parse_option(ConfigValues& values,
+		   const ConfigTracker& tracker,
+		   std::vector<const char*>& args,
+		   std::vector<const char*>::iterator& i,
+		   std::ostream *oss,
+		   int level);
+  int parse_injectargs(ConfigValues& values,
+		       const ConfigTracker& tracker,
+		       std::vector<const char*>& args,
+		       std::ostream *oss);
+
+  // @returns negative number for an error, otherwise a
+  //          @c ConfigValues::set_value_result_t is returned.
+  int _set_val(
+    ConfigValues& values,
+    const ConfigTracker& tracker,
+    const std::string &val,
+    const Option &opt,
+    int level,  // CONF_*
+    std::string *error_message);
+
+  template <typename T>
+  void assign_member(member_ptr_t ptr, const Option::value_t &val);
+
+
+  void update_legacy_val(ConfigValues& values,
+			 const Option &opt,
+			 member_ptr_t member);
+
+  Option::value_t _expand_meta(
+    const ConfigValues& values,
+    const Option::value_t& in,
+    const Option *o,
+    expand_stack_t *stack,
+    std::ostream *err) const;
+
+public:  // for global_init
+  void early_expand_meta(const ConfigValues& values,
+			 std::string &val,
+			 std::ostream *oss) const;
+
+  // for those who want to re-expand special metavariables, e.g. $pid
+  bool finalize_reexpand_meta(ConfigValues& values,
+			      const ConfigTracker& tracker);
+
+  std::list<std::string> get_conffile_paths(const ConfigValues& values,
+					    const char *conf_files,
+					    std::ostream *warnings,
+					    int flags) const;
+
+  const std::string& get_conf_path() const {
+    return conf_path;
+  }
+private:
+  static std::string get_cluster_name(const char* conffile_path);
+  // The configuration file we read (held by value; presumably left empty if we haven't read one -- TODO confirm).
+  ConfFile cf;
+  std::string conf_path;
+public:
+  std::string parse_error;
+private:
+
+  // This will be set to true when it is safe to start threads.
+  // Once it is true, it will never change.
+  bool safe_to_start_threads = false;
+
+  bool do_show_config = false;
+  std::string do_show_config_value;
+
+  std::vector<Option> subsys_options;
+
+public:
+  std::string data_dir_option;  ///< data_dir config option, if any
+
+public:
+  // Default min_size for a pool of the given @p size: the configured
+  // osd_pool_default_min_size capped at @p size, or size - size/2 when the
+  // option is 0.
+  unsigned get_osd_pool_default_min_size(const ConfigValues& values,
+                                         uint8_t size) const {
+    // Keep the full 64-bit option value.  Narrowing it to uint8_t first
+    // would turn e.g. 256 into 0 and silently pick the fallback instead of
+    // capping at size.
+    const uint64_t min_size =
+      get_val<uint64_t>(values, "osd_pool_default_min_size");
+    return min_size ? std::min<uint64_t>(min_size, size) : (size - size / 2);
+  }
+
+  friend class test_md_config_t;
+};
+
+// Typed lookup: fetch the generic (variant) value stored for @p key and
+// return its alternative of type T.  std::get<T> throws when a different
+// alternative is held.
+template<typename T>
+const T md_config_t::get_val(const ConfigValues& values,
+			     const std::string_view key) const {
+  Option::value_t generic = this->get_val_generic(values, key);
+  return std::get<T>(std::move(generic));
+}
+
+// Streaming an empty (std::monostate) config value prints a fixed
+// placeholder string.
+inline std::ostream& operator<<(std::ostream& o, const std::monostate&) {
+  o << "INVALID_CONFIG_VALUE";
+  return o;
+}
+
+int ceph_resolve_file_search(const std::string& filename_list,
+			     std::string& result);
+
+#endif
diff --git a/src/common/config_cacher.h b/src/common/config_cacher.h
new file mode 100644
index 000000000..a84bad08e
--- /dev/null
+++ b/src/common/config_cacher.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFIG_CACHER_H
+#define CEPH_CONFIG_CACHER_H
+
+#include "common/config_obs.h"
+#include "common/config.h"
+
+// Caches the value of one config option of type ValueT.  The object
+// registers itself as an observer of @p conf for that option, refreshes the
+// cache whenever the option is reported as changed, and serves reads from a
+// std::atomic.
+template <typename ValueT>
+class md_config_cacher_t : public md_config_obs_t {
+  ConfigProxy& conf;
+  const char* const option_name;
+  std::atomic<ValueT> value_cache;
+  // Null-terminated key table returned by get_tracked_conf_keys().  It has
+  // to be a per-object member: a function-local static is initialized only
+  // once per ValueT instantiation, so every later cacher of the same type
+  // would report the first cacher's option name.  It is mutable because the
+  // observer interface returns a non-const `const char**` from a const
+  // method.
+  mutable const char* tracked_keys[2];
+
+  const char** get_tracked_conf_keys() const override {
+    return tracked_keys;
+  }
+
+  // Reload the cached value when our option is among the changed keys.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string>& changed) override {
+    if (changed.count(option_name)) {
+      value_cache.store(conf.get_val<ValueT>(option_name));
+    }
+  }
+
+public:
+  // Start observing @p option_name on @p conf and seed the cache with its
+  // current value.
+  md_config_cacher_t(ConfigProxy& conf,
+                     const char* const option_name)
+    : conf(conf),
+      option_name(option_name),
+      tracked_keys{option_name, nullptr} {
+    conf.add_observer(this);
+    std::atomic_init(&value_cache,
+                     conf.get_val<ValueT>(option_name));
+  }
+
+  // Stop observing; the observer must not outlive its registration.
+  ~md_config_cacher_t() {
+    conf.remove_observer(this);
+  }
+
+  // Read the cached value.
+  operator ValueT() const {
+    return value_cache.load();
+  }
+};
+
+#endif // CEPH_CONFIG_CACHER_H
+
diff --git a/src/common/config_fwd.h b/src/common/config_fwd.h
new file mode 100644
index 000000000..f29a1324c
--- /dev/null
+++ b/src/common/config_fwd.h
@@ -0,0 +1,9 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+#include "include/common_fwd.h"
+
+namespace TOPNSPC::common {
+  class ConfigProxy;
+}
+using TOPNSPC::common::ConfigProxy;
diff --git a/src/common/config_obs.h b/src/common/config_obs.h
new file mode 100644
index 000000000..20d12ad83
--- /dev/null
+++ b/src/common/config_obs.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFIG_OBS_H
+#define CEPH_CONFIG_OBS_H
+
+#include <set>
+#include <string>
+
+#include "common/config_fwd.h"
+
+namespace ceph {
+/** @brief Base class for configuration observers.
+ * Use this as a base class for your object if it has to respond to configuration changes,
+ * for example by updating some values or modifying its behavior.
+ * Subscribe for configuration changes by calling the add_observer() method of the
+ * config proxy (e.g. ceph::common::ConfigProxy) and unsubscribe using its
+ * remove_observer().
+ */
+template<class ConfigProxy>
+class md_config_obs_impl {
+public:
+  virtual ~md_config_obs_impl() {}
+  /** @brief Get a table of strings specifying the configuration keys in which the object is interested.
+   * This is called when the object is subscribed to configuration changes with add_observer().
+   * The table must be terminated by a null pointer: ObserverMgr::add_observer() walks it
+   * until it reaches one.
+   * The returned table should not be freed until the observer is removed with remove_observer().
+   * Note that it is not possible to change the set of tracked keys without re-subscribing. */
+  virtual const char** get_tracked_conf_keys() const = 0;
+  /// React to a configuration change; @p changed holds the names of the
+  /// changed keys this call is about.
+  virtual void handle_conf_change(const ConfigProxy& conf,
+				  const std::set <std::string> &changed) = 0;
+  /// Unused for now; the default implementation does nothing.
+  virtual void handle_subsys_change(const ConfigProxy& conf,
+				    const std::set<int>& changed) { }
+};
+}
+
+using md_config_obs_t = ceph::md_config_obs_impl<ConfigProxy>;
+
+#endif
diff --git a/src/common/config_obs_mgr.h b/src/common/config_obs_mgr.h
new file mode 100644
index 000000000..06b3cf934
--- /dev/null
+++ b/src/common/config_obs_mgr.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "common/config_tracker.h"
+
+class ConfigValues;
+
+// @c ObserverMgr manages a set of config observers which are interested in
+// the changes of settings at runtime.  It also implements @c ConfigTracker,
+// so it can be asked whether any observer tracks a given option.
+template<class ConfigObs>
+class ObserverMgr : public ConfigTracker {
+  // Maps configuration options to the observers listening for them; one
+  // entry per (option, observer) pair.
+  using obs_map_t = std::multimap<std::string, ConfigObs*>;
+  obs_map_t observers;
+
+public:
+  // Reverse mapping: observer -> set of option names.
+  typedef std::map<ConfigObs*, std::set<std::string>> rev_obs_map;
+  // Callback receiving one (observer, option name) pair.
+  typedef std::function<void(ConfigObs*, const std::string&)> config_gather_cb;
+
+  // Adds a new observer to this configuration. You can do this at any time,
+  // but it will only receive notifications for the changes that happen after
+  // you attach it, obviously.
+  //
+  // Most developers will probably attach their observers after global_init,
+  // but before anyone can call injectargs.
+  //
+  // The caller is responsible for allocating observers.
+  void add_observer(ConfigObs* observer);
+
+  // Remove an observer from this configuration.
+  // This doesn't delete the observer! If you allocated it with new(),
+  // you need to delete it yourself.
+  // This function will assert if you try to remove an observer that isn't
+  // there.
+  void remove_observer(ConfigObs* observer);
+  // invoke callback once for every (observer, tracked key) pair
+  void for_each_observer(config_gather_cb callback);
+  // invoke callback for every observer tracking a key in the provided change
+  // set; if oss is non-null, a summary of each changed key is written to it
+  template<class ConfigProxyT>
+  void for_each_change(const std::set<std::string>& changes,
+                       ConfigProxyT& proxy,
+                       config_gather_cb callback, std::ostream *oss);
+  // true if at least one observer is registered for the option `name`
+  bool is_tracking(const std::string& name) const override;
+};
+
+// we could put the implementations in a .cc file, and only instantiate the
+// used template specializations explicitly, but that forces us to involve
+// unused headers and libraries at compile-time. for instance, to
+// instantiate ObserverMgr for seastar, we would need to include seastar
+// headers to get the necessary types in place, but that would force us to link
+// the non-seastar binaries against seastar libraries. so, to avoid pulling
+// in unused dependencies at the expense of increasing compiling time, we put
+// the implementation in the header file.
+// Register @p observer under every key of the null-terminated table that its
+// get_tracked_conf_keys() returns.
+template<class ConfigObs>
+void ObserverMgr<ConfigObs>::add_observer(ConfigObs* observer)
+{
+  const char **key = observer->get_tracked_conf_keys();
+  while (*key) {
+    observers.emplace(*key, observer);
+    ++key;
+  }
+}
+
+// Drop every (key, observer) entry that refers to @p observer.  Asserts
+// that at least one such entry existed.
+template<class ConfigObs>
+void ObserverMgr<ConfigObs>::remove_observer(ConfigObs* observer)
+{
+  [[maybe_unused]] bool found_obs = false;
+  auto entry = observers.begin();
+  while (entry != observers.end()) {
+    if (entry->second != observer) {
+      ++entry;
+      continue;
+    }
+    // erase() hands back the iterator following the removed entry
+    entry = observers.erase(entry);
+    found_obs = true;
+  }
+  ceph_assert(found_obs);
+}
+
+// Call @p callback(observer, key) once for every registered
+// (key, observer) pair.
+template<class ConfigObs>
+void ObserverMgr<ConfigObs>::for_each_observer(config_gather_cb callback)
+{
+  for (auto entry = observers.cbegin(); entry != observers.cend(); ++entry) {
+    callback(entry->second, entry->first);
+  }
+}
+
+// For each key in @p changes, call @p callback(observer, key) for every
+// observer registered under that key.  When @p oss is non-null and
+// proxy.get_val() succeeds for the key, "<key> = '<value>' " is written to
+// it, followed by a note if nobody observes the key.
+template<class ConfigObs>
+template<class ConfigProxyT>
+void ObserverMgr<ConfigObs>::for_each_change(const std::set<std::string>& changes,
+                                             ConfigProxyT& proxy,
+                                             config_gather_cb callback, std::ostream *oss)
+{
+  std::string current;
+  for (const auto& changed_key : changes) {
+    const auto range = observers.equal_range(changed_key);
+    if (oss != nullptr && !proxy.get_val(changed_key, &current)) {
+      *oss << changed_key << " = '" << current << "' ";
+      const bool unobserved = (range.first == range.second);
+      if (unobserved) {
+        *oss << "(not observed, change may require restart) ";
+      }
+    }
+    for (auto obs = range.first; obs != range.second; ++obs) {
+      callback(obs->second, changed_key);
+    }
+  }
+}
+
+// True when at least one observer is registered for the option @p name.
+template<class ConfigObs>
+bool ObserverMgr<ConfigObs>::is_tracking(const std::string& name) const
+{
+  return observers.find(name) != observers.end();
+}
diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h
new file mode 100644
index 000000000..02c670f60
--- /dev/null
+++ b/src/common/config_proxy.h
@@ -0,0 +1,352 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <type_traits>
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "common/config_obs_mgr.h"
+#include "common/ceph_mutex.h"
+
+// @c ConfigProxy is a facade of multiple config related classes. it exposes
+// the legacy settings with arrow operator, and the new-style config with its
+// member methods.
+namespace ceph::common {
+class ConfigProxy {
+  /**
+   * The current values of all settings described by the schema
+   */
+  ConfigValues values;
+  using md_config_obs_t = ceph::md_config_obs_impl<ConfigProxy>;
+  ObserverMgr<md_config_obs_t> obs_mgr;
+  md_config_t config;
+  /** A lock that protects the md_config_t internals. It is
+   * recursive, for simplicity.
+   * It is best if this lock comes first in the lock hierarchy. We will
+   * hold this lock when calling configuration observers.  */
+  mutable ceph::recursive_mutex lock =
+    ceph::make_recursive_mutex("ConfigProxy::lock");
+
+  // Counts the calls currently in flight for one observer and lets close()
+  // block until that count is back at zero.
+  class CallGate {
+  private:
+    uint32_t call_count = 0;
+    ceph::mutex lock;
+    ceph::condition_variable cond;
+  public:
+    CallGate()
+      : lock(ceph::make_mutex("call::gate::lock")) {
+    }
+
+    // note one more call in flight
+    void enter() {
+      std::lock_guard guard{lock};
+      call_count += 1;
+    }
+    // note one call finished; wake waiters when it was the last one
+    void leave() {
+      std::lock_guard guard{lock};
+      ceph_assert(call_count > 0);
+      call_count -= 1;
+      if (call_count == 0) {
+        cond.notify_all();
+      }
+    }
+    // block until no call is in flight
+    void close() {
+      std::unique_lock guard{lock};
+      while (call_count > 0) {
+        cond.wait(guard);
+      }
+    }
+  };
+
+  // Enter the call gate of @p obs; asserts that obs has a gate.
+  void call_gate_enter(md_config_obs_t *obs) {
+    const auto gate = obs_call_gate.find(obs);
+    ceph_assert(gate != obs_call_gate.end());
+    gate->second->enter();
+  }
+  // Leave the call gate of @p obs; asserts that obs has a gate.
+  void call_gate_leave(md_config_obs_t *obs) {
+    const auto gate = obs_call_gate.find(obs);
+    ceph_assert(gate != obs_call_gate.end());
+    gate->second->leave();
+  }
+  // Wait until the call gate of @p obs has no call in flight; asserts that
+  // obs has a gate.
+  void call_gate_close(md_config_obs_t *obs) {
+    const auto gate = obs_call_gate.find(obs);
+    ceph_assert(gate != obs_call_gate.end());
+    gate->second->close();
+  }
+
+  using rev_obs_map_t = ObserverMgr<md_config_obs_t>::rev_obs_map;
+  typedef std::unique_ptr<CallGate> CallGateRef;
+
+  std::map<md_config_obs_t*, CallGateRef> obs_call_gate;
+
+  // Deliver handle_conf_change() to every observer in @p rev_obs with
+  // @p locker released, then re-take it and leave each observer's call gate
+  // (the gates are entered in map_observer_changes()).
+  void call_observers(std::unique_lock<ceph::recursive_mutex>& locker,
+                      rev_obs_map_t& rev_obs) {
+    // observers are notified outside of lock
+    locker.unlock();
+    for (auto& entry : rev_obs) {
+      entry.first->handle_conf_change(*this, entry.second);
+    }
+    locker.lock();
+
+    for (auto& entry : rev_obs) {
+      call_gate_leave(entry.first);
+    }
+  }
+
+  // Record in @p rev_obs that @p obs must be told about @p key.  The first
+  // time an observer is added to the map its call gate is entered.
+  void map_observer_changes(md_config_obs_t *obs, const std::string &key,
+                            rev_obs_map_t *rev_obs) {
+    ceph_assert(ceph_mutex_is_locked(lock));
+
+    const auto inserted = rev_obs->try_emplace(obs);
+    inserted.first->second.insert(key);
+    if (!inserted.second) {
+      return;
+    }
+    // this needs to be done under lock as once this lock is
+    // dropped (before calling observers) a remove_observer()
+    // can sneak in and cause havoc.
+    call_gate_enter(obs);
+  }
+
+public:
+  explicit ConfigProxy(bool is_daemon)
+    : config{values, obs_mgr, is_daemon}
+  {}
+  ConfigProxy(const ConfigProxy &config_proxy)
+    : values(config_proxy.get_config_values()),
+      config{values, obs_mgr, config_proxy.config.is_daemon}
+  {}
+  const ConfigValues* operator->() const noexcept {
+    return &values;
+  }
+  ConfigValues* operator->() noexcept {
+    return &values;
+  }
+  ConfigValues get_config_values() const {
+    std::lock_guard l{lock};
+    return values;
+  }
+  void set_config_values(const ConfigValues& val) {
+#ifndef WITH_SEASTAR
+    std::lock_guard l{lock};
+#endif
+    values = val;
+  }
+  int get_val(const std::string_view key, char** buf, int len) const {
+    std::lock_guard l{lock};
+    return config.get_val(values, key, buf, len);
+  }
+  int get_val(const std::string_view key, std::string *val) const {
+    std::lock_guard l{lock};
+    return config.get_val(values, key, val);
+  }
+  template<typename T>
+  const T get_val(const std::string_view key) const {
+    std::lock_guard l{lock};
+    return config.template get_val<T>(values, key);
+  }
+  template<typename T, typename Callback, typename...Args>
+  auto with_val(const std::string_view key, Callback&& cb, Args&&... args) const {
+    std::lock_guard l{lock};
+    return config.template with_val<T>(values, key,
+				       std::forward<Callback>(cb),
+				       std::forward<Args>(args)...);
+  }
+  void config_options(ceph::Formatter *f) const {
+    config.config_options(f);
+  }
+  const decltype(md_config_t::schema)& get_schema() const {
+    return config.schema;
+  }
+  // Schema entry for @p key, or nullptr when the schema has no such option.
+  const Option* get_schema(const std::string_view key) const {
+    const auto found = config.schema.find(key);
+    return (found != config.schema.end()) ? &found->second : nullptr;
+  }
+  const Option *find_option(const std::string& name) const {
+    return config.find_option(name);
+  }
+  void diff(ceph::Formatter *f, const std::string& name = {}) const {
+    std::lock_guard l{lock};
+    return config.diff(values, f, name);
+  }
+  std::vector<std::string> get_my_sections() const {
+    std::lock_guard l{lock};
+    return config.get_my_sections(values);
+  }
+  int get_all_sections(std::vector<std::string>& sections) const {
+    std::lock_guard l{lock};
+    return config.get_all_sections(sections);
+  }
+  int get_val_from_conf_file(const std::vector<std::string>& sections,
+			     const std::string_view key, std::string& out,
+			     bool emeta) const {
+    std::lock_guard l{lock};
+    return config.get_val_from_conf_file(values,
+					 sections, key, out, emeta);
+  }
+  // Default min_size for a pool of the given @p size; see
+  // md_config_t::get_osd_pool_default_min_size().
+  unsigned get_osd_pool_default_min_size(uint8_t size) const {
+    // this reads an option out of `values`, so hold the config lock like
+    // the other value accessors do (the lock is recursive)
+    std::lock_guard l{lock};
+    return config.get_osd_pool_default_min_size(values, size);
+  }
+  void early_expand_meta(std::string &val,
+			 std::ostream *oss) const {
+    std::lock_guard l{lock};
+    return config.early_expand_meta(values, val, oss);
+  }
+  // For those who want to re-expand special metavariables, e.g. $pid.
+  // When md_config_t::finalize_reexpand_meta() reports true, the pending
+  // changes are gathered and the affected observers are notified.
+  void finalize_reexpand_meta() {
+    std::unique_lock locker(lock);
+    rev_obs_map_t rev_obs;
+    const bool reexpanded = config.finalize_reexpand_meta(values, obs_mgr);
+    if (reexpanded) {
+      _gather_changes(values.changed, &rev_obs, nullptr);
+    }
+
+    call_observers(locker, rev_obs);
+  }
+  void add_observer(md_config_obs_t* obs) {
+    std::lock_guard l(lock);
+    obs_mgr.add_observer(obs);
+    obs_call_gate.emplace(obs, std::make_unique<CallGate>());
+  }
+  // Unregister @p obs and wait until no handle_conf_change() call for it is
+  // still in flight.  Asserts if obs was never added.
+  void remove_observer(md_config_obs_t* obs) {
+    std::unique_lock locker(lock);
+    // take it out of the observer table first, so that no new notification
+    // round can enter its gate while we wait below
+    obs_mgr.remove_observer(obs);
+    auto p = obs_call_gate.find(obs);
+    ceph_assert(p != obs_call_gate.end());
+    CallGate *gate = p->second.get();
+    // Wait with the config lock released: call_observers() has to re-take
+    // that lock before it can leave the gate, so closing the gate while
+    // holding the lock deadlocks against an in-flight notification.
+    locker.unlock();
+    gate->close();
+    locker.lock();
+    obs_call_gate.erase(obs);
+  }
+  void call_all_observers() {
+    std::unique_lock locker(lock);
+    rev_obs_map_t rev_obs;
+    obs_mgr.for_each_observer(
+      [this, &rev_obs](md_config_obs_t *obs, const std::string &key) {
+        map_observer_changes(obs, key, &rev_obs);
+      });
+
+    call_observers(locker, rev_obs);
+  }
+  void set_safe_to_start_threads() {
+    config.set_safe_to_start_threads();
+  }
+  void _clear_safe_to_start_threads() {
+    config._clear_safe_to_start_threads();
+  }
+  void show_config(std::ostream& out) {
+    std::lock_guard l{lock};
+    config.show_config(values, out);
+  }
+  void show_config(ceph::Formatter *f) {
+    std::lock_guard l{lock};
+    config.show_config(values, f);
+  }
+  void config_options(ceph::Formatter *f) {
+    std::lock_guard l{lock};
+    config.config_options(f);
+  }
+  int rm_val(const std::string_view key) {
+    std::lock_guard l{lock};
+    return config.rm_val(values, key);
+  }
+  // Make any pending observer callbacks.  Pending changes are only gathered
+  // once a cluster name has been assigned; before that they stay queued in
+  // values.changed.  If @p oss is non-null a summary of the gathered changes
+  // is written to it.
+  void apply_changes(std::ostream* oss) {
+    std::unique_lock locker(lock);
+    rev_obs_map_t rev_obs;
+
+    const bool cluster_assigned = !values.cluster.empty();
+    if (cluster_assigned) {
+      // meta expands could have modified anything.  Copy it all out again.
+      _gather_changes(values.changed, &rev_obs, oss);
+    }
+
+    call_observers(locker, rev_obs);
+  }
+  // Translate the changed keys in @p changes into per-observer key sets in
+  // @p rev_obs (entering each affected observer's call gate), then empty
+  // @p changes.
+  void _gather_changes(std::set<std::string> &changes,
+                       rev_obs_map_t *rev_obs, std::ostream* oss) {
+    auto record = [this, rev_obs](md_config_obs_t *obs,
+                                  const std::string &key) {
+      map_observer_changes(obs, key, rev_obs);
+    };
+    obs_mgr.for_each_change(changes, *this, record, oss);
+    changes.clear();
+  }
+  int set_val(const std::string_view key, const std::string& s,
+              std::stringstream* err_ss=nullptr) {
+    std::lock_guard l{lock};
+    return config.set_val(values, obs_mgr, key, s, err_ss);
+  }
+  void set_val_default(const std::string_view key, const std::string& val) {
+    std::lock_guard l{lock};
+    config.set_val_default(values, obs_mgr, key, val);
+  }
+  void set_val_or_die(const std::string_view key, const std::string& val) {
+    std::lock_guard l{lock};
+    config.set_val_or_die(values, obs_mgr, key, val);
+  }
+  int set_mon_vals(CephContext *cct,
+		   const std::map<std::string,std::string,std::less<>>& kv,
+		   md_config_t::config_callback config_cb) {
+    std::unique_lock locker(lock);
+    int ret = config.set_mon_vals(cct, values, obs_mgr, kv, config_cb);
+
+    rev_obs_map_t rev_obs;
+    _gather_changes(values.changed, &rev_obs, nullptr);
+
+    call_observers(locker, rev_obs);
+    return ret;
+  }
+  int injectargs(const std::string &s, std::ostream *oss) {
+    std::unique_lock locker(lock);
+    int ret = config.injectargs(values, obs_mgr, s, oss);
+
+    rev_obs_map_t rev_obs;
+    _gather_changes(values.changed, &rev_obs, oss);
+
+    call_observers(locker, rev_obs);
+    return ret;
+  }
+  void parse_env(unsigned entity_type,
+		 const char *env_var = "CEPH_ARGS") {
+    std::lock_guard l{lock};
+    config.parse_env(entity_type, values, obs_mgr, env_var);
+  }
+  int parse_argv(std::vector<const char*>& args, int level=CONF_CMDLINE) {
+    std::lock_guard l{lock};
+    return config.parse_argv(values, obs_mgr, args, level);
+  }
+  int parse_config_files(const char *conf_files,
+			 std::ostream *warnings, int flags) {
+    std::lock_guard l{lock};
+    return config.parse_config_files(values, obs_mgr,
+				     conf_files, warnings, flags);
+  }
+  bool has_parse_error() const {
+    return !config.parse_error.empty();
+  }
+  std::string get_parse_error() {
+    return config.parse_error;
+  }
+  void complain_about_parse_error(CephContext *cct) {
+    return config.complain_about_parse_error(cct);
+  }
+  void do_argv_commands() const {
+    std::lock_guard l{lock};
+    config.do_argv_commands(values);
+  }
+  void get_config_bl(uint64_t have_version,
+		     ceph::buffer::list *bl,
+		     uint64_t *got_version) {
+    std::lock_guard l{lock};
+    config.get_config_bl(values, have_version, bl, got_version);
+  }
+  void get_defaults_bl(ceph::buffer::list *bl) {
+    std::lock_guard l{lock};
+    config.get_defaults_bl(values, bl);
+  }
+  const std::string& get_conf_path() const {
+    return config.get_conf_path();
+  }
+  std::optional<std::string> get_val_default(std::string_view key) {
+    return config.get_val_default(key);
+  }
+};
+
+}
diff --git a/src/common/config_tracker.h b/src/common/config_tracker.h
new file mode 100644
index 000000000..783e6f2ee
--- /dev/null
+++ b/src/common/config_tracker.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <string>
+
+// @c ConfigTracker is queried to see whether any added observer is tracking
+// one or more changed settings.
+//
+// This interface was introduced to decouple @c md_config_t from any
+// instantiation of @c ObserverMgr: all the former needs is @c is_tracking(),
+// and making ObserverMgr a template parameter of md_config_t's methods would
+// only complicate the header dependencies and slow down compilation.
+class ConfigTracker {
+public:
+  virtual ~ConfigTracker() = default;  // allows destruction via a ConfigTracker pointer
+  virtual bool is_tracking(const std::string& name) const = 0;  // implemented by subclasses
+};
diff --git a/src/common/config_values.cc b/src/common/config_values.cc
new file mode 100644
index 000000000..f4a0a1959
--- /dev/null
+++ b/src/common/config_values.cc
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#include "config_values.h"
+
+#include "config.h"
+#if WITH_SEASTAR
+#include "crimson/common/log.h"
+#endif
+
+// Store @p new_value for @p key at priority @p level.  The result tells the
+// caller whether anything changed and, if so, whether the new value is the
+// one in effect or is shadowed by a value stored at a higher level.
+ConfigValues::set_value_result_t
+ConfigValues::set_value(const std::string_view key,
+                        Option::value_t&& new_value,
+                        int level)
+{
+  const auto entry = values.find(key);
+  if (entry == values.end()) {
+    // first value ever recorded for this key
+    values[key][level] = std::move(new_value);
+    return SET_HAVE_EFFECT;
+  }
+  auto& by_level = entry->second;
+  if (const auto slot = by_level.find(level); slot == by_level.end()) {
+    by_level[level] = std::move(new_value);
+  } else if (new_value == slot->second) {
+    return SET_NO_CHANGE;
+  } else {
+    slot->second = std::move(new_value);
+  }
+  // a value stored at a higher level keeps taking precedence over this one
+  const bool shadowed = by_level.rbegin()->first > level;
+  return shadowed ? SET_NO_EFFECT : SET_HAVE_EFFECT;
+}
+
+// Drop the value for @p key at @p level.  Returns -ENOENT if there is none, else
+// SET_HAVE_EFFECT if it was the highest-level entry or SET_NO_EFFECT if not.
+int ConfigValues::rm_val(const std::string_view key, int level)
+{
+  const auto entry = values.find(key);
+  if (entry == values.end()) {
+    return -ENOENT;
+  }
+  auto& by_level = entry->second;
+  const auto slot = by_level.find(level);
+  if (slot == by_level.end()) {
+    return -ENOENT;
+  }
+  // must be evaluated before erase() invalidates `slot`
+  const bool was_top = (slot->first == by_level.rbegin()->first);
+  by_level.erase(slot);
+  return was_top ? SET_HAVE_EFFECT : SET_NO_EFFECT;
+}
+
+// A negative @p level picks the highest-level value, otherwise the level must
+// match exactly; `.second` of the result is false when nothing was found.
+std::pair<Option::value_t, bool>
+ConfigValues::get_value(const std::string_view name, int level) const
+{
+  if (const auto entry = values.find(name); entry != values.end()) {
+    const auto& by_level = entry->second;
+    if (level < 0 && !by_level.empty()) {
+      return {by_level.rbegin()->second, true};  // highest priority (see CONF_*)
+    } else if (const auto exact = by_level.find(level); exact != by_level.end()) {
+      return {exact->second, true};
+    }
+  }
+  return {Option::value_t{}, false};
+}
+
+// Parse @p val as "<log>[/<gather>]" and apply both levels to subsystem
+// @p which; a missing gather level defaults to the log level.
+void ConfigValues::set_logging(int which, const char* val)
+{
+  int log_level, gather_level;
+  const int matched = sscanf(val, "%d/%d", &log_level, &gather_level);
+  if (matched < 1) {
+    return;  // nothing parseable: leave the subsystem untouched
+  }
+  subsys.set_log_level(which, log_level);
+  subsys.set_gather_level(which, matched < 2 ? log_level : gather_level);
+#if WITH_SEASTAR
+  crimson::get_logger(which).set_level(crimson::to_log_level(log_level));
+#endif
+}
+
+bool ConfigValues::contains(const std::string_view key) const
+{
+  return values.find(key) != values.end();  // true iff `values` has an entry for key
+}
diff --git a/src/common/config_values.h b/src/common/config_values.h
new file mode 100644
index 000000000..6347709cc
--- /dev/null
+++ b/src/common/config_values.h
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+#include "common/entity_name.h"
+#include "common/options.h"
+#include "log/SubsystemMap.h"
+#include "msg/msg_types.h"
+
+// @c ConfigValues maps config option names to their per-level values and also
+// holds the debug logging settings and a few "unnamed" settings such as the
+// daemon's entity name.  Keys of @c values are non-owning string views.
+class ConfigValues {
+  using values_t = std::map<std::string_view, std::map<int32_t,Option::value_t>>;
+  values_t values;
+  // for populating md_config_impl::legacy_values in ctor
+  friend struct md_config_t;
+
+public:
+  EntityName name;
+  /// cluster name
+  std::string cluster;
+  ceph::logging::SubsystemMap subsys;
+  bool no_mon_config = false;
+  // Set of configuration options that have changed since the last apply_changes
+  using changed_set_t = std::set<std::string>;
+  changed_set_t changed;
+
+// This macro block defines plain data members of this class
+// corresponding to the definitions in legacy_config_opts.h.
+// These members are consumed by code that was written before
+// the new options.cc infrastructure: all newer code should
+// consume options via explicit get() rather than these members.
+#define OPTION_OPT_INT(name) int64_t name;
+#define OPTION_OPT_LONGLONG(name) int64_t name;
+#define OPTION_OPT_STR(name) std::string name;
+#define OPTION_OPT_DOUBLE(name) double name;
+#define OPTION_OPT_FLOAT(name) double name;
+#define OPTION_OPT_BOOL(name) bool name;
+#define OPTION_OPT_ADDR(name) entity_addr_t name;
+#define OPTION_OPT_ADDRVEC(name) entity_addrvec_t name;
+#define OPTION_OPT_U32(name) uint64_t name;
+#define OPTION_OPT_U64(name) uint64_t name;
+#define OPTION_OPT_UUID(name) uuid_d name;
+#define OPTION_OPT_SIZE(name) uint64_t name;
+#define OPTION(name, ty)       \
+  public:                      \
+    OPTION_##ty(name)
+#define SAFE_OPTION(name, ty)       \
+  protected:                        \
+    OPTION_##ty(name)
+#include "common/options/legacy_config_opts.h"
+#undef OPTION_OPT_INT
+#undef OPTION_OPT_LONGLONG
+#undef OPTION_OPT_STR
+#undef OPTION_OPT_DOUBLE
+#undef OPTION_OPT_FLOAT
+#undef OPTION_OPT_BOOL
+#undef OPTION_OPT_ADDR
+#undef OPTION_OPT_ADDRVEC
+#undef OPTION_OPT_U32
+#undef OPTION_OPT_U64
+#undef OPTION_OPT_UUID
+#undef OPTION_OPT_SIZE
+#undef OPTION
+#undef SAFE_OPTION
+
+public:
+  enum set_value_result_t {
+    SET_NO_CHANGE,
+    SET_NO_EFFECT,
+    SET_HAVE_EFFECT,
+  };
+  /// @return SET_NO_CHANGE if @p level already held an equal value,
+  ///         SET_NO_EFFECT if stored but shadowed by a higher level,
+  ///         SET_HAVE_EFFECT otherwise
+  set_value_result_t set_value(std::string_view key,
+                               Option::value_t&& value,
+                               int level);
+  int rm_val(const std::string_view key, int level);  // -ENOENT or a set_value_result_t
+  void set_logging(int which, const char* val);  // val is "<log>[/<gather>]"
+  /**
+   * @param level the level of the setting, -1 for the one with the
+   *              highest-priority
+   */
+  std::pair<Option::value_t, bool> get_value(const std::string_view name,
+                                             int level) const;
+  template<typename Func> void for_each(Func&& func) const {
+    for (const auto& [name,configs] : values) {
+      func(name, configs);
+    }
+  }
+  bool contains(const std::string_view key) const;
+};
diff --git a/src/common/containers.h b/src/common/containers.h
new file mode 100644
index 000000000..c0aa83544
--- /dev/null
+++ b/src/common/containers.h
@@ -0,0 +1,195 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+//
+// Ceph - scalable distributed file system
+//
+// Copyright (C) 2018 Red Hat, Inc.
+//
+// This is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1, as published by the Free Software
+// Foundation.  See file COPYING.
+//
+
+#ifndef CEPH_COMMON_CONTAINERS_H
+#define CEPH_COMMON_CONTAINERS_H
+
+#include <cstdint>
+#include <type_traits>
+
+namespace ceph::containers {
+
+// tiny_vector – CPU friendly, small_vector-like container for mutexes,
+// atomics and other non-movable things.
+//
+// The purpose of the container is to store arbitrary number of objects
+// with absolutely minimal requirements regarding constructibility
+// and assignability while minimizing memory indirection.
+// There is no obligation for MoveConstructibility, CopyConstructibility,
+// MoveAssignability, CopyAssignability nor even DefaultConstructibility
+// which allows to handle std::mutexes, std::atomics or any type embedding
+// them.
+//
+// Few requirements translate into tiny interface. The container isn't
+// Copy- nor MoveConstructible. Although it does offer random access
+// iterator, insertion in the middle is not allowed. The maximum number
+// of elements must be known at run-time. This shouldn't be an issue in
+// the intended use case: sharding.
+//
+// For the special case of no internal slots (InternalCapacity eq 0),
+// tiny_vector doesn't require moving any elements (changing pointers
+// is enough), and thus should be MoveConstructible.
+//
+// Alternatives:
+//  1. std::vector<boost::optional<ValueT>> initialized with the known
+//     size and emplace_backed(). boost::optional inside provides
+//     the DefaultConstructibility. Imposes extra memory indirection.
+//  2. boost::container::small_vector + boost::optional always
+//     requires MoveConstructibility.
+//  3. boost::container::static_vector fed via emplace_back().
+//     Good for performance but enforces upper limit on elements count.
+//     For sharding this means we can't handle arbitrary number of
+//     shards (weird configs).
+//  4. std::unique_ptr<ValueT>: extra indirection together with memory
+//     fragmentation.
+
+template<typename Value, std::size_t InternalCapacity = 0>
+class tiny_vector {
+  // NOTE: to avoid false sharing consider aligning to cache line
+  using storage_unit_t = std::aligned_storage_t<sizeof(Value), alignof(Value)>;
+  std::size_t _size = 0;                 // number of constructed elements
+  storage_unit_t* const data = nullptr;  // `internal` or a heap array
+  storage_unit_t internal[InternalCapacity];
+
+public:
+  typedef std::size_t size_type;
+  typedef std::add_lvalue_reference_t<Value> reference;
+  typedef std::add_const_t<reference> const_reference;  // NOTE: same as `reference`
+  typedef std::add_pointer_t<Value> pointer;
+
+  // emplacer is the piece of weirdness that comes from handling
+  // unmovable-and-uncopyable things. Such types can only be created
+  // in-place, perfectly forwarding the necessary data to the
+  // constructor; abstracting that is the exact purpose of emplacer.
+  //
+  // The usage scenario is:
+  //   1. The tiny_vector's ctor is provided with a) maximum number
+  //      of instances and b) a callable taking emplacer.
+  //   2. The callable can (but isn't obliged to!) use emplacer to
+  //      construct an instance without knowing at which address
+  //      in memory it will be put. Callable is also supplied with
+  //      an unique integer from the range <0, maximum number of
+  //      instances).
+  //   3. If callable decides to instantiate, it calls ::emplace
+  //      of emplacer passing all arguments required by the type
+  //      hold in tiny_vector.
+  //
+  // Example:
+  // ```
+  //   static constexpr std::size_t num_internally_allocated_slots = 32;
+  //   tiny_vector<T, num_internally_allocated_slots> mytinyvec {
+  //     num_of_instances,
+  //     [](const size_t i, auto emplacer) {
+  //       emplacer.emplace(argument_for_T_ctor);
+  //     }
+  //   };
+  // ```
+  //
+  // For the sake of supporting the ceph::make_mutex() family of
+  // factories, which relies on C++17's guaranteed copy elision,
+  // the emplacer provides `data()` to retrieve the location for
+  // constructing the instance with placement-new. This is handy
+  // as the `emplace()` depends on perfect forwarding, and thus
+  // interferes with the elision for cases like:
+  // ```
+  //   emplacer.emplace(ceph::make_mutex("mtx-name"));
+  // ```
+  // See: https://stackoverflow.com/a/52498826
+
+  class emplacer {
+    friend class tiny_vector;
+
+    tiny_vector* parent;  // cleared once a slot has been handed out
+    emplacer(tiny_vector* const parent) : parent(parent) {}
+
+  public:
+    // Hands out the next free slot for placement-new. Call at most once per
+    // emplacer: `parent` is reset, so a second call dereferences nullptr.
+    void* data() {
+      void* const ret = &parent->data[parent->_size++];
+      parent = nullptr;
+      return ret;
+    }
+
+    // Constructs a Value in the next slot; a no-op once a slot was taken.
+    template<class... Args>
+    void emplace(Args&&... args) {
+      if (parent) {
+        new (data()) Value(std::forward<Args>(args)...);
+      }
+    }
+  };
+
+  template<typename F>
+  tiny_vector(const std::size_t count, F&& f)
+    : data(count <= InternalCapacity ? internal
+                                     : new storage_unit_t[count]) {
+    for (std::size_t i = 0; i < count; ++i) {
+      // caller MAY emplace up to `count` elements but it IS NOT obliged to
+      // do so. The emplacer guarantees that the limit is never exceeded.
+      f(i, emplacer(this));
+    }
+  }
+
+  // Copying would leave two objects destroying the same elements (and
+  // freeing or mis-freeing the same storage), so it is disabled outright.
+  tiny_vector(const tiny_vector&) = delete;
+  tiny_vector& operator=(const tiny_vector&) = delete;
+
+  ~tiny_vector() {
+    for (auto& elem : *this) {
+      elem.~Value();
+    }
+
+    const auto data_addr = reinterpret_cast<std::uintptr_t>(data);
+    const auto this_addr = reinterpret_cast<std::uintptr_t>(this);
+    if (data_addr < this_addr ||
+        data_addr >= this_addr + sizeof(*this)) {
+      delete[] data;
+    }
+  }
+
+  reference       operator[](size_type pos) {
+    return reinterpret_cast<reference>(data[pos]);
+  }
+  const_reference operator[](size_type pos) const {
+    return reinterpret_cast<const_reference>(data[pos]);
+  }
+
+  size_type size() const { return _size; }
+
+  pointer begin() {
+    return reinterpret_cast<pointer>(&data[0]);
+  }
+  pointer end() {
+    return reinterpret_cast<pointer>(&data[_size]);
+  }
+
+  const pointer begin() const {
+    return reinterpret_cast<pointer>(&data[0]);
+  }
+  const pointer end() const {
+    return reinterpret_cast<pointer>(&data[_size]);
+  }
+
+  const pointer cbegin() const {
+    return reinterpret_cast<pointer>(&data[0]);
+  }
+  const pointer cend() const {
+    return reinterpret_cast<pointer>(&data[_size]);
+  }
+};
+
+} // namespace ceph::containers
+
+#endif // CEPH_COMMON_CONTAINERS_H
diff --git a/src/common/convenience.h b/src/common/convenience.h
new file mode 100644
index 000000000..db803c59d
--- /dev/null
+++ b/src/common/convenience.h
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <mutex>
+#include <memory>
+#include <optional>
+#include <shared_mutex>
+#include <type_traits>
+#include <utility>
+
+#include <boost/optional.hpp>
+
+#ifndef CEPH_COMMON_CONVENIENCE_H
+#define CEPH_COMMON_CONVENIENCE_H
+
+namespace ceph {
+// boost::optional is wonderful! Unfortunately it lacks a function for
+// the thing you would most obviously want to do with it: apply a
+// function to its contents.
+
+// There are two obvious candidates. The first is a function that
+// takes a function and an optional value and returns an optional
+// value, either holding the return value of the function or holding
+// nothing.
+//
+// I'd considered making more overloads for mutable lvalue
+// references, but those are going a bit beyond likely use cases.
+//
+// Apply f to the contained value, if any; an empty optional stays empty.
+template<typename T, typename F>
+auto maybe_do(const boost::optional<T>& t, F&& f) ->
+  boost::optional<std::invoke_result_t<F, std::decay_t<T>>>
+{
+  if (!t)
+    return boost::none;
+  return { std::forward<F>(f)(*t) };
+}
+
+// The other obvious function takes an optional but returns an
+// ‘unwrapped’ value, either the result of evaluating the function or
+// a provided alternate value.
+//
+// Returns f(*t) when t holds a value and the alternate value u otherwise.
+template<typename T, typename F, typename U>
+auto maybe_do_or(const boost::optional<T>& t, F&& f, U&& u) ->
+  std::invoke_result_t<F, std::decay_t<T>>
+{
+  static_assert(std::is_convertible_v<U, std::invoke_result_t<F, std::decay_t<T>>>,
+		"Alternate value must be convertible to function return type.");
+  if (t)
+    return std::forward<F>(f)(*t);
+  return std::forward<U>(u);
+}
+
+
+// Same thing but for std::optional
+
+// Apply f to the contained value, if any; an empty optional stays empty.
+template<typename T, typename F>
+auto maybe_do(const std::optional<T>& t, F&& f) ->
+  std::optional<std::invoke_result_t<F, std::decay_t<T>>>
+{
+  if (!t)
+    return std::nullopt;
+  return { std::forward<F>(f)(*t) };
+}
+
+// The other obvious function takes an optional but returns an
+// ‘unwrapped’ value, either the result of evaluating the function or
+// a provided alternate value.
+//
+// Returns f(*t) when t holds a value and the alternate value u otherwise.
+template<typename T, typename F, typename U>
+auto maybe_do_or(const std::optional<T>& t, F&& f, U&& u) ->
+  std::invoke_result_t<F, std::decay_t<T>>
+{
+  static_assert(std::is_convertible_v<U, std::invoke_result_t<F, std::decay_t<T>>>,
+		"Alternate value must be convertible to function return type.");
+  if (t)
+    return std::forward<F>(f)(*t);
+  return std::forward<U>(u);
+}
+
+namespace _convenience {  // index-sequence expanders behind ceph::for_each()
+template<typename... Elems, typename Fn, std::size_t... Idx>
+inline void for_each_helper(const std::tuple<Elems...>& tup, const Fn& fn,
+			    std::index_sequence<Idx...>) {
+  (static_cast<void>(fn(std::get<Idx>(tup))), ...);  // one call per element, in order
+}
+template<typename... Elems, typename Fn, std::size_t... Idx>
+inline void for_each_helper(std::tuple<Elems...>& tup, const Fn& fn,
+			    std::index_sequence<Idx...>) {
+  (static_cast<void>(fn(std::get<Idx>(tup))), ...);  // mutable tuple, const callable
+}
+template<typename... Elems, typename Fn, std::size_t... Idx>
+inline void for_each_helper(const std::tuple<Elems...>& tup, Fn& fn,
+			    std::index_sequence<Idx...>) {
+  (static_cast<void>(fn(std::get<Idx>(tup))), ...);  // const tuple, mutable callable
+}
+template<typename... Elems, typename Fn, std::size_t... Idx>
+inline void for_each_helper(std::tuple<Elems...>& tup, Fn& fn,
+			    std::index_sequence<Idx...>) {
+  (static_cast<void>(fn(std::get<Idx>(tup))), ...);  // both mutable
+}
+}
+
+template<typename... Ts, typename F>
+inline void for_each(const std::tuple<Ts...>& t, const F& f) {
+  _convenience::for_each_helper(t, f, std::make_index_sequence<sizeof...(Ts)>{});
+}  // overload: const tuple, const callable
+template<typename... Ts, typename F>
+inline void for_each(std::tuple<Ts...>& t, const F& f) {
+  _convenience::for_each_helper(t, f, std::make_index_sequence<sizeof...(Ts)>{});
+}  // overload: mutable tuple, const callable
+template<typename... Ts, typename F>
+inline void for_each(const std::tuple<Ts...>& t, F& f) {
+  _convenience::for_each_helper(t, f, std::make_index_sequence<sizeof...(Ts)>{});
+}  // overload: const tuple, mutable callable
+template<typename... Ts, typename F>
+inline void for_each(std::tuple<Ts...>& t, F& f) {
+  _convenience::for_each_helper(t, f, std::make_index_sequence<sizeof...(Ts)>{});
+}  // overload: mutable tuple, mutable callable
+}
+#endif // CEPH_COMMON_CONVENIENCE_H
diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc
new file mode 100644
index 000000000..e4a77ae99
--- /dev/null
+++ b/src/common/crc32c.cc
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/crc32c.h"
+#include "arch/probe.h"
+#include "arch/intel.h"
+#include "arch/arm.h"
+#include "arch/ppc.h"
+#include "common/sctp_crc32.h"
+#include "common/crc32c_intel_fast.h"
+#include "common/crc32c_aarch64.h"
+#include "common/crc32c_ppc.h"
+
+/* Choose the best CRC32C implementation for the CPU this process runs on:
+ * an architecture-specific routine when the probed feature flag is set,
+ * otherwise ceph_crc32c_sctp. */
+ceph_crc32c_func_t ceph_choose_crc32(void)
+{
+  // make sure we've probed cpu features; this might depend on the
+  // link order of this file relative to arch/probe.cc.
+  ceph_arch_probe();
+
+  // if the CPU supports it, *and* the fast version is compiled in,
+  // use that.
+#if defined(__i386__) || defined(__x86_64__)
+  if (ceph_arch_intel_sse42 && ceph_crc32c_intel_fast_exists()) {
+    return ceph_crc32c_intel_fast;
+  }
+#elif defined(__arm__) || defined(__aarch64__)
+# if defined(HAVE_ARMV8_CRC)
+  if (ceph_arch_aarch64_crc32){
+    return ceph_crc32c_aarch64;
+  }
+# endif
+#elif defined(__powerpc__) || defined(__ppc__)
+  if (ceph_arch_ppc_crc32) {
+    return ceph_crc32c_ppc;
+  }
+#endif
+  // default: fallback when no accelerated routine was selected above
+  return ceph_crc32c_sctp;
+}
+
+/*
+ * Global function pointer (external linkage, despite the "static" nickname).
+ *
+ * This is a bit of a no-no for shared libraries, but we don't care.
+ * It is effectively constant for the executing process as the value
+ * depends on the CPU architecture.
+ *
+ * It is set by dynamic initialization, i.e. by calling ceph_choose_crc32().
+ */
+ceph_crc32c_func_t ceph_crc32c_func = ceph_choose_crc32();
+
+
+/*
+ * See: http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
+ * This goes one logical step further: the CRC computation is split into jumps
+ * of length 1, 2, 4, 8, ...; each jump is applied per input bit, then xor-ed.
+ *
+ * This function is unused. It is here to show how crc_turbo_table was obtained.
+ */
+void create_turbo_table(uint32_t table[32][32])
+{
+  // row 0: ceph_crc32c_sctp() applied to each single-bit crc value, length 1
+  for (int bit = 0; bit < 32; ++bit) {
+    table[0][bit] = ceph_crc32c_sctp(1UL << bit, nullptr, 1);
+  }
+  // row r composes row r-1 with itself
+  for (int range = 1; range < 32; ++range) {
+    const uint32_t* const prev = table[range - 1];
+    for (int bit = 0; bit < 32; ++bit) {
+      uint32_t composed = 0;
+      for (int b = 0; b < 32; ++b) {
+        if ((prev[bit] >> b) & 1u) {
+          composed ^= prev[b];
+        }
+      }
+      table[range][bit] = composed;
+    }
+  }
+}
+
+static uint32_t crc_turbo_table[32][32] =
+{
+    {0xf26b8303, 0xe13b70f7, 0xc79a971f, 0x8ad958cf, 0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78,
+     0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+     0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000, 0x00008000,
+     0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000},
+    {0x13a29877, 0x274530ee, 0x4e8a61dc, 0x9d14c3b8, 0x3fc5f181, 0x7f8be302, 0xff17c604, 0xfbc3faf9,
+     0xf26b8303, 0xe13b70f7, 0xc79a971f, 0x8ad958cf, 0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78,
+     0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+     0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000, 0x00008000},
+    {0xdd45aab8, 0xbf672381, 0x7b2231f3, 0xf64463e6, 0xe964b13d, 0xd725148b, 0xaba65fe7, 0x52a0c93f,
+     0xa541927e, 0x4f6f520d, 0x9edea41a, 0x38513ec5, 0x70a27d8a, 0xe144fb14, 0xc76580d9, 0x8b277743,
+     0x13a29877, 0x274530ee, 0x4e8a61dc, 0x9d14c3b8, 0x3fc5f181, 0x7f8be302, 0xff17c604, 0xfbc3faf9,
+     0xf26b8303, 0xe13b70f7, 0xc79a971f, 0x8ad958cf, 0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78},
+    {0x493c7d27, 0x9278fa4e, 0x211d826d, 0x423b04da, 0x847609b4, 0x0d006599, 0x1a00cb32, 0x34019664,
+     0x68032cc8, 0xd0065990, 0xa5e0c5d1, 0x4e2dfd53, 0x9c5bfaa6, 0x3d5b83bd, 0x7ab7077a, 0xf56e0ef4,
+     0xef306b19, 0xdb8ca0c3, 0xb2f53777, 0x6006181f, 0xc00c303e, 0x85f4168d, 0x0e045beb, 0x1c08b7d6,
+     0x38116fac, 0x7022df58, 0xe045beb0, 0xc5670b91, 0x8f2261d3, 0x1ba8b557, 0x37516aae, 0x6ea2d55c},
+    {0xf20c0dfe, 0xe1f46d0d, 0xc604aceb, 0x89e52f27, 0x162628bf, 0x2c4c517e, 0x5898a2fc, 0xb13145f8,
+     0x678efd01, 0xcf1dfa02, 0x9bd782f5, 0x3243731b, 0x6486e636, 0xc90dcc6c, 0x97f7ee29, 0x2a03aaa3,
+     0x54075546, 0xa80eaa8c, 0x55f123e9, 0xabe247d2, 0x5228f955, 0xa451f2aa, 0x4d4f93a5, 0x9a9f274a,
+     0x30d23865, 0x61a470ca, 0xc348e194, 0x837db5d9, 0x03171d43, 0x062e3a86, 0x0c5c750c, 0x18b8ea18},
+    {0x3da6d0cb, 0x7b4da196, 0xf69b432c, 0xe8daf0a9, 0xd45997a3, 0xad5f59b7, 0x5f52c59f, 0xbea58b3e,
+     0x78a7608d, 0xf14ec11a, 0xe771f4c5, 0xcb0f9f7b, 0x93f34807, 0x220ae6ff, 0x4415cdfe, 0x882b9bfc,
+     0x15bb4109, 0x2b768212, 0x56ed0424, 0xadda0848, 0x5e586661, 0xbcb0ccc2, 0x7c8def75, 0xf91bdeea,
+     0xf7dbcb25, 0xea5be0bb, 0xd15bb787, 0xa75b19ff, 0x4b5a450f, 0x96b48a1e, 0x288562cd, 0x510ac59a},
+    {0x740eef02, 0xe81dde04, 0xd5d7caf9, 0xae43e303, 0x596bb0f7, 0xb2d761ee, 0x6042b52d, 0xc0856a5a,
+     0x84e6a245, 0x0c21327b, 0x184264f6, 0x3084c9ec, 0x610993d8, 0xc21327b0, 0x81ca3991, 0x067805d3,
+     0x0cf00ba6, 0x19e0174c, 0x33c02e98, 0x67805d30, 0xcf00ba60, 0x9bed0231, 0x32367293, 0x646ce526,
+     0xc8d9ca4c, 0x945fe269, 0x2d53b223, 0x5aa76446, 0xb54ec88c, 0x6f71e7e9, 0xdee3cfd2, 0xb82be955},
+    {0x6992cea2, 0xd3259d44, 0xa3a74c79, 0x42a2ee03, 0x8545dc06, 0x0f67cefd, 0x1ecf9dfa, 0x3d9f3bf4,
+     0x7b3e77e8, 0xf67cefd0, 0xe915a951, 0xd7c72453, 0xaa623e57, 0x51280a5f, 0xa25014be, 0x414c5f8d,
+     0x8298bf1a, 0x00dd08c5, 0x01ba118a, 0x03742314, 0x06e84628, 0x0dd08c50, 0x1ba118a0, 0x37423140,
+     0x6e846280, 0xdd08c500, 0xbffdfcf1, 0x7a178f13, 0xf42f1e26, 0xedb24abd, 0xde88e38b, 0xb8fdb1e7},
+    {0xdcb17aa4, 0xbc8e83b9, 0x7cf17183, 0xf9e2e306, 0xf629b0fd, 0xe9bf170b, 0xd69258e7, 0xa8c8c73f,
+     0x547df88f, 0xa8fbf11e, 0x541b94cd, 0xa837299a, 0x558225c5, 0xab044b8a, 0x53e4e1e5, 0xa7c9c3ca,
+     0x4a7ff165, 0x94ffe2ca, 0x2c13b365, 0x582766ca, 0xb04ecd94, 0x6571edd9, 0xcae3dbb2, 0x902bc195,
+     0x25bbf5db, 0x4b77ebb6, 0x96efd76c, 0x2833d829, 0x5067b052, 0xa0cf60a4, 0x4472b7b9, 0x88e56f72},
+    {0xbd6f81f8, 0x7f337501, 0xfe66ea02, 0xf921a2f5, 0xf7af331b, 0xeab210c7, 0xd088577f, 0xa4fcd80f,
+     0x4c15c6ef, 0x982b8dde, 0x35bb6d4d, 0x6b76da9a, 0xd6edb534, 0xa8371c99, 0x55824fc3, 0xab049f86,
+     0x53e549fd, 0xa7ca93fa, 0x4a795105, 0x94f2a20a, 0x2c0932e5, 0x581265ca, 0xb024cb94, 0x65a5e1d9,
+     0xcb4bc3b2, 0x937bf195, 0x231b95db, 0x46372bb6, 0x8c6e576c, 0x1d30d829, 0x3a61b052, 0x74c360a4},
+    {0xfe314258, 0xf98ef241, 0xf6f19273, 0xe80f5217, 0xd5f2d2df, 0xae09d34f, 0x59ffd06f, 0xb3ffa0de,
+     0x6213374d, 0xc4266e9a, 0x8da0abc5, 0x1ead217b, 0x3d5a42f6, 0x7ab485ec, 0xf5690bd8, 0xef3e6141,
+     0xdb90b473, 0xb2cd1e17, 0x60764adf, 0xc0ec95be, 0x84355d8d, 0x0d86cdeb, 0x1b0d9bd6, 0x361b37ac,
+     0x6c366f58, 0xd86cdeb0, 0xb535cb91, 0x6f87e1d3, 0xdf0fc3a6, 0xbbf3f1bd, 0x720b958b, 0xe4172b16},
+    {0xf7506984, 0xeb4ca5f9, 0xd3753d03, 0xa3060cf7, 0x43e06f1f, 0x87c0de3e, 0x0a6dca8d, 0x14db951a,
+     0x29b72a34, 0x536e5468, 0xa6dca8d0, 0x48552751, 0x90aa4ea2, 0x24b8ebb5, 0x4971d76a, 0x92e3aed4,
+     0x202b2b59, 0x405656b2, 0x80acad64, 0x04b52c39, 0x096a5872, 0x12d4b0e4, 0x25a961c8, 0x4b52c390,
+     0x96a58720, 0x28a778b1, 0x514ef162, 0xa29de2c4, 0x40d7b379, 0x81af66f2, 0x06b2bb15, 0x0d65762a},
+    {0xc2a5b65e, 0x80a71a4d, 0x04a2426b, 0x094484d6, 0x128909ac, 0x25121358, 0x4a2426b0, 0x94484d60,
+     0x2d7cec31, 0x5af9d862, 0xb5f3b0c4, 0x6e0b1779, 0xdc162ef2, 0xbdc02b15, 0x7e6c20db, 0xfcd841b6,
+     0xfc5cf59d, 0xfd559dcb, 0xff474d67, 0xfb62ec3f, 0xf329ae8f, 0xe3bf2bef, 0xc292212f, 0x80c834af,
+     0x047c1faf, 0x08f83f5e, 0x11f07ebc, 0x23e0fd78, 0x47c1faf0, 0x8f83f5e0, 0x1aeb9d31, 0x35d73a62},
+    {0xe040e0ac, 0xc56db7a9, 0x8f3719a3, 0x1b8245b7, 0x37048b6e, 0x6e0916dc, 0xdc122db8, 0xbdc82d81,
+     0x7e7c2df3, 0xfcf85be6, 0xfc1cc13d, 0xfdd5f48b, 0xfe479fe7, 0xf963493f, 0xf72ae48f, 0xebb9bfef,
+     0xd29f092f, 0xa0d264af, 0x4448bfaf, 0x88917f5e, 0x14ce884d, 0x299d109a, 0x533a2134, 0xa6744268,
+     0x4904f221, 0x9209e442, 0x21ffbe75, 0x43ff7cea, 0x87fef9d4, 0x0a118559, 0x14230ab2, 0x28461564},
+    {0xc7cacead, 0x8a79ebab, 0x111fa1a7, 0x223f434e, 0x447e869c, 0x88fd0d38, 0x14166c81, 0x282cd902,
+     0x5059b204, 0xa0b36408, 0x448abee1, 0x89157dc2, 0x17c68d75, 0x2f8d1aea, 0x5f1a35d4, 0xbe346ba8,
+     0x7984a1a1, 0xf3094342, 0xe3fef075, 0xc211961b, 0x81cf5ac7, 0x0672c37f, 0x0ce586fe, 0x19cb0dfc,
+     0x33961bf8, 0x672c37f0, 0xce586fe0, 0x995ca931, 0x37552493, 0x6eaa4926, 0xdd54924c, 0xbf455269},
+    {0x04fcdcbf, 0x09f9b97e, 0x13f372fc, 0x27e6e5f8, 0x4fcdcbf0, 0x9f9b97e0, 0x3adb5931, 0x75b6b262,
+     0xeb6d64c4, 0xd336bf79, 0xa3810803, 0x42ee66f7, 0x85dccdee, 0x0e55ed2d, 0x1cabda5a, 0x3957b4b4,
+     0x72af6968, 0xe55ed2d0, 0xcf51d351, 0x9b4fd053, 0x3373d657, 0x66e7acae, 0xcdcf595c, 0x9e72c449,
+     0x3909fe63, 0x7213fcc6, 0xe427f98c, 0xcda385e9, 0x9eab7d23, 0x38ba8cb7, 0x7175196e, 0xe2ea32dc},
+    {0x6bafcc21, 0xd75f9842, 0xab534675, 0x534afa1b, 0xa695f436, 0x48c79e9d, 0x918f3d3a, 0x26f20c85,
+     0x4de4190a, 0x9bc83214, 0x327c12d9, 0x64f825b2, 0xc9f04b64, 0x960ce039, 0x29f5b683, 0x53eb6d06,
+     0xa7d6da0c, 0x4a41c2e9, 0x948385d2, 0x2ceb7d55, 0x59d6faaa, 0xb3adf554, 0x62b79c59, 0xc56f38b2,
+     0x8f320795, 0x1b8879db, 0x3710f3b6, 0x6e21e76c, 0xdc43ced8, 0xbd6beb41, 0x7f3ba073, 0xfe7740e6},
+    {0x140441c6, 0x2808838c, 0x50110718, 0xa0220e30, 0x45a86a91, 0x8b50d522, 0x134ddcb5, 0x269bb96a,
+     0x4d3772d4, 0x9a6ee5a8, 0x3131bda1, 0x62637b42, 0xc4c6f684, 0x8c619bf9, 0x1d2f4103, 0x3a5e8206,
+     0x74bd040c, 0xe97a0818, 0xd71866c1, 0xabdcbb73, 0x52550017, 0xa4aa002e, 0x4cb876ad, 0x9970ed5a,
+     0x370dac45, 0x6e1b588a, 0xdc36b114, 0xbd8114d9, 0x7eee5f43, 0xfddcbe86, 0xfe550bfd, 0xf946610b},
+    {0x68175a0a, 0xd02eb414, 0xa5b11ed9, 0x4e8e4b43, 0x9d1c9686, 0x3fd55bfd, 0x7faab7fa, 0xff556ff4,
+     0xfb46a919, 0xf36124c3, 0xe32e3f77, 0xc3b0081f, 0x828c66cf, 0x00f4bb6f, 0x01e976de, 0x03d2edbc,
+     0x07a5db78, 0x0f4bb6f0, 0x1e976de0, 0x3d2edbc0, 0x7a5db780, 0xf4bb6f00, 0xec9aa8f1, 0xdcd92713,
+     0xbc5e38d7, 0x7d50075f, 0xfaa00ebe, 0xf0ac6b8d, 0xe4b4a1eb, 0xcc853527, 0x9ce61cbf, 0x3c204f8f},
+    {0xe1ff3667, 0xc6121a3f, 0x89c8428f, 0x167cf3ef, 0x2cf9e7de, 0x59f3cfbc, 0xb3e79f78, 0x62234801,
+     0xc4469002, 0x8d6156f5, 0x1f2edb1b, 0x3e5db636, 0x7cbb6c6c, 0xf976d8d8, 0xf701c741, 0xebeff873,
+     0xd2338617, 0xa18b7adf, 0x46fa834f, 0x8df5069e, 0x1e067bcd, 0x3c0cf79a, 0x7819ef34, 0xf033de68,
+     0xe58bca21, 0xcefbe2b3, 0x981bb397, 0x35db11df, 0x6bb623be, 0xd76c477c, 0xab34f809, 0x538586e3},
+    {0x8b7230ec, 0x13081729, 0x26102e52, 0x4c205ca4, 0x9840b948, 0x356d0461, 0x6ada08c2, 0xd5b41184,
+     0xae8455f9, 0x58e4dd03, 0xb1c9ba06, 0x667f02fd, 0xccfe05fa, 0x9c107d05, 0x3dcc8cfb, 0x7b9919f6,
+     0xf73233ec, 0xeb881129, 0xd2fc54a3, 0xa014dfb7, 0x45c5c99f, 0x8b8b933e, 0x12fb508d, 0x25f6a11a,
+     0x4bed4234, 0x97da8468, 0x2a597e21, 0x54b2fc42, 0xa965f884, 0x572787f9, 0xae4f0ff2, 0x59726915},
+    {0x56175f20, 0xac2ebe40, 0x5db10a71, 0xbb6214e2, 0x73285f35, 0xe650be6a, 0xc94d0a25, 0x977662bb,
+     0x2b00b387, 0x5601670e, 0xac02ce1c, 0x5de9eac9, 0xbbd3d592, 0x724bddd5, 0xe497bbaa, 0xccc301a5,
+     0x9c6a75bb, 0x3d389d87, 0x7a713b0e, 0xf4e2761c, 0xec289ac9, 0xddbd4363, 0xbe96f037, 0x78c1969f,
+     0xf1832d3e, 0xe6ea2c8d, 0xc8382feb, 0x959c2927, 0x2ed424bf, 0x5da8497e, 0xbb5092fc, 0x734d5309},
+    {0xb9a3dcd0, 0x76abcf51, 0xed579ea2, 0xdf434bb5, 0xbb6ae19b, 0x7339b5c7, 0xe6736b8e, 0xc90aa1ed,
+     0x97f9352b, 0x2a1e1ca7, 0x543c394e, 0xa878729c, 0x551c93c9, 0xaa392792, 0x519e39d5, 0xa33c73aa,
+     0x439491a5, 0x8729234a, 0x0bbe3065, 0x177c60ca, 0x2ef8c194, 0x5df18328, 0xbbe30650, 0x722a7a51,
+     0xe454f4a2, 0xcd459fb5, 0x9f67499b, 0x3b22e5c7, 0x7645cb8e, 0xec8b971c, 0xdcfb58c9, 0xbc1ac763},
+    {0xdd2d789e, 0xbfb687cd, 0x7a81796b, 0xf502f2d6, 0xefe9935d, 0xda3f504b, 0xb192d667, 0x66c9da3f,
+     0xcd93b47e, 0x9ecb1e0d, 0x387a4aeb, 0x70f495d6, 0xe1e92bac, 0xc63e21a9, 0x899035a3, 0x16cc1db7,
+     0x2d983b6e, 0x5b3076dc, 0xb660edb8, 0x692dad81, 0xd25b5b02, 0xa15ac0f5, 0x4759f71b, 0x8eb3ee36,
+     0x188baa9d, 0x3117553a, 0x622eaa74, 0xc45d54e8, 0x8d56df21, 0x1f41c8b3, 0x3e839166, 0x7d0722cc},
+    {0x44036c4a, 0x8806d894, 0x15e1c7d9, 0x2bc38fb2, 0x57871f64, 0xaf0e3ec8, 0x5bf00b61, 0xb7e016c2,
+     0x6a2c5b75, 0xd458b6ea, 0xad5d1b25, 0x5f5640bb, 0xbeac8176, 0x78b5741d, 0xf16ae83a, 0xe739a685,
+     0xcb9f3bfb, 0x92d20107, 0x204874ff, 0x4090e9fe, 0x8121d3fc, 0x07afd109, 0x0f5fa212, 0x1ebf4424,
+     0x3d7e8848, 0x7afd1090, 0xf5fa2120, 0xee1834b1, 0xd9dc1f93, 0xb65449d7, 0x6944e55f, 0xd289cabe},
+    {0x4612657d, 0x8c24cafa, 0x1da5e305, 0x3b4bc60a, 0x76978c14, 0xed2f1828, 0xdfb246a1, 0xba88fbb3,
+     0x70fd8197, 0xe1fb032e, 0xc61a70ad, 0x89d897ab, 0x165d59a7, 0x2cbab34e, 0x5975669c, 0xb2eacd38,
+     0x6039ec81, 0xc073d902, 0x850bc4f5, 0x0ffbff1b, 0x1ff7fe36, 0x3feffc6c, 0x7fdff8d8, 0xffbff1b0,
+     0xfa939591, 0xf0cb5dd3, 0xe47acd57, 0xcd19ec5f, 0x9fdfae4f, 0x3a532a6f, 0x74a654de, 0xe94ca9bc},
+    {0x584d5569, 0xb09aaad2, 0x64d92355, 0xc9b246aa, 0x9688fba5, 0x28fd81bb, 0x51fb0376, 0xa3f606ec,
+     0x42007b29, 0x8400f652, 0x0ded9a55, 0x1bdb34aa, 0x37b66954, 0x6f6cd2a8, 0xded9a550, 0xb85f3c51,
+     0x75520e53, 0xeaa41ca6, 0xd0a44fbd, 0xa4a4e98b, 0x4ca5a5e7, 0x994b4bce, 0x377ae16d, 0x6ef5c2da,
+     0xddeb85b4, 0xbe3b7d99, 0x799a8dc3, 0xf3351b86, 0xe38641fd, 0xc2e0f50b, 0x802d9ce7, 0x05b74f3f},
+    {0xe8cd33e2, 0xd4761135, 0xad00549b, 0x5fecdfc7, 0xbfd9bf8e, 0x7a5f09ed, 0xf4be13da, 0xec905145,
+     0xdcccd47b, 0xbc75de07, 0x7d07caff, 0xfa0f95fe, 0xf1f35d0d, 0xe60acceb, 0xc9f9ef27, 0x961fa8bf,
+     0x29d3278f, 0x53a64f1e, 0xa74c9e3c, 0x4b754a89, 0x96ea9512, 0x28395cd5, 0x5072b9aa, 0xa0e57354,
+     0x44269059, 0x884d20b2, 0x15763795, 0x2aec6f2a, 0x55d8de54, 0xabb1bca8, 0x528f0fa1, 0xa51e1f42},
+    {0x82f63b78, 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040,
+     0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000,
+     0x00008000, 0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000,
+     0x00800000, 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000},
+    {0x417b1dbc, 0x82f63b78, 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
+     0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000,
+     0x00004000, 0x00008000, 0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000,
+     0x00400000, 0x00800000, 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000},
+    {0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78, 0x00000001, 0x00000002, 0x00000004, 0x00000008,
+     0x00000010, 0x00000020, 0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
+     0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000, 0x00040000, 0x00080000,
+     0x00100000, 0x00200000, 0x00400000, 0x00800000, 0x01000000, 0x02000000, 0x04000000, 0x08000000},
+    {0xf26b8303, 0xe13b70f7, 0xc79a971f, 0x8ad958cf, 0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78,
+     0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+     0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000, 0x00008000,
+     0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000}
+};
+
+// Update @p crc for a run of @p len zero bytes (what the name promises).  The
+// length is consumed in binary: each set bit of len / 16 selects one
+// crc_turbo_table row; the remaining len % 16 bytes go through ceph_crc32c().
+uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned len)
+{
+  const unsigned tail = len & 15;
+  int row = 4;  // bit k of len / 16 uses table row k + 4
+  for (unsigned blocks = len >> 4; blocks != 0; blocks >>= 1, ++row) {
+    if ((blocks & 1) == 0) {
+      continue;
+    }
+    uint32_t folded = 0;
+    const uint32_t* entry = crc_turbo_table[row];
+    // xor together the row entries selected by the set bits of crc
+    for (; crc != 0; crc >>= 1, ++entry) {
+      const uint32_t mask = 0u - (crc & 1);  // all ones if the bit is set, else 0
+      folded ^= mask & *entry;
+    }
+    crc = folded;
+  }
+  if (tail > 0) {
+    crc = ceph_crc32c(crc, nullptr, tail);
+  }
+  return crc;
+}
diff --git a/src/common/crc32c_aarch64.c b/src/common/crc32c_aarch64.c
new file mode 100644
index 000000000..99e588399
--- /dev/null
+++ b/src/common/crc32c_aarch64.c
@@ -0,0 +1,274 @@
+#include "acconfig.h"
+#include "include/int_types.h"
+#include "common/crc32c_aarch64.h"
+#include "arch/arm.h"
+
+#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS /* no ACLE intrinsics: use inline assembly */
+/* Request crc extension capabilities from the assembler */
+asm(".arch_extension crc");
+
+#ifdef HAVE_ARMV8_CRYPTO
+/* Request crypto extension capabilities from the assembler */
+asm(".arch_extension crypto");
+#endif
+/* Fold one 64/32/16/8-bit value into crc with a single crc32c{x,w,h,b} instruction. */
+#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+/* One 8-byte step on each of crc1, crc2, crc0 (in that order), reading the 64-bit word at index 42*k + ITR of `buffer`; uses the caller's buffer/crc0/crc1/crc2. */
+#define CRC32C3X8(ITR) \
+	__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
+	__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
+	__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
+/* One 8-byte step on crc0 only, with the zero register (xzr) as the data word. */
+#define CRC32C3X8_ZERO \
+	__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
+
+#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+#include <arm_acle.h>
+#include <arm_neon.h>
+/* Same macro set as the inline-assembly branch, written with the __crc32c* intrinsics. */
+#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
+#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
+#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
+#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
+/* One 8-byte step on each of crc1, crc2, crc0, reading the 64-bit word at index 42*k + ITR of `buffer`. */
+#define CRC32C3X8(ITR) \
+	crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
+	crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
+	crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+/* One 8-byte step on crc0 only, with a zero data word. */
+#define CRC32C3X8_ZERO \
+	crc0 = __crc32cd(crc0, (const uint64_t)0);
+
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+/* Seven CRC32C3X8 steps at word indices ITR*7 .. ITR*7+6; wrapped in do/while(0) so it acts as one statement. */
+#define CRC32C7X3X8(ITR) do {\
+	CRC32C3X8((ITR)*7+0) \
+	CRC32C3X8((ITR)*7+1) \
+	CRC32C3X8((ITR)*7+2) \
+	CRC32C3X8((ITR)*7+3) \
+	CRC32C3X8((ITR)*7+4) \
+	CRC32C3X8((ITR)*7+5) \
+	CRC32C3X8((ITR)*7+6) \
+	} while(0)
+/* Seven CRC32C3X8_ZERO steps: the zero-data counterpart, which only advances crc0. */
+#define CRC32C7X3X8_ZERO do {\
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	} while(0)
+/* Four PRFM PLDL1KEEP hints, 64 bytes apart, from buffer + PREF_OFFSET + ITR*64; the "I" constraint requires a constant offset. */
+#define PREF4X64L1(PREF_OFFSET, ITR) \
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+/* Sixteen PLDL1KEEP hints: 16 * 64 = 1024 bytes starting at buffer + PREF_OFFSET. */
+#define PREF1KL1(PREF_OFFSET) \
+	PREF4X64L1((PREF_OFFSET), 0) \
+	PREF4X64L1((PREF_OFFSET), 4) \
+	PREF4X64L1((PREF_OFFSET), 8) \
+	PREF4X64L1((PREF_OFFSET), 12)
+/* Same layout as PREF4X64L1, issued with the PLDL2KEEP hint instead. */
+#define PREF4X64L2(PREF_OFFSET, ITR) \
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+/* Same layout as PREF1KL1 (1024 bytes from buffer + PREF_OFFSET), issued with the PLDL2KEEP hint. */
+#define PREF1KL2(PREF_OFFSET) \
+	PREF4X64L2((PREF_OFFSET), 0) \
+	PREF4X64L2((PREF_OFFSET), 4) \
+	PREF4X64L2((PREF_OFFSET), 8) \
+	PREF4X64L2((PREF_OFFSET), 12)
+
+/* CRC32C over len bytes at buffer, continuing from crc; a NULL buffer stands for len zero bytes.  Returns the updated crc. */
+uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	int64_t length = len;	/* signed, so the "(length -= N) >= 0" loop tests can go negative */
+	uint32_t crc0, crc1, crc2;
+
+	if (buffer) {
+#ifdef HAVE_ARMV8_CRYPTO
+	        if (ceph_arch_aarch64_pmull) {	/* defined outside this file; presumably runtime PMULL detection -- confirm */
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+		/* Calculate reflected crc with PMULL Instruction */
+		const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+		uint64_t t0, t1;
+
+		/* crc done "by 3" for fixed input block size of 1024 bytes (8 + 3*42*8 + 8) */
+		while ((length -= 1024) >= 0) {
+			/* Prefetch data for following block to avoid cache miss */
+			PREF1KL2(1024*3);
+			/* Do first 8 bytes here for better pipelining */
+			crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
+			crc1 = 0;
+			crc2 = 0;
+			buffer += sizeof(uint64_t);
+
+			/* Process block inline
+			Process crc0 last to avoid dependency with above */
+			CRC32C7X3X8(0);
+			CRC32C7X3X8(1);
+			CRC32C7X3X8(2);
+			CRC32C7X3X8(3);
+			CRC32C7X3X8(4);
+			CRC32C7X3X8(5);
+			/* step over the 3 * 42 words the three streams just consumed */
+			buffer += 42*3*sizeof(uint64_t);
+			/* Prefetch data for following block to avoid cache miss */
+			PREF1KL1(1024);
+
+			/* Merge crc0 and crc1 into crc2
+			   crc1 multiply by K2
+			   crc0 multiply by K1 */
+			/* crc2 absorbs the block's last 8 bytes while the two products are reduced and XORed in */
+			t1 = (uint64_t)vmull_p64(crc1, k2);
+			t0 = (uint64_t)vmull_p64(crc0, k1);
+			crc = __crc32cd(crc2, *(const uint64_t *)buffer);
+			crc1 = __crc32cd(0, t1);
+			crc ^= crc1;
+			crc0 = __crc32cd(0, t0);
+			crc ^= crc0;
+
+			buffer += sizeof(uint64_t);
+		}
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+		__asm__("mov    x16,            #0xf38a         \n\t"
+			"movk   x16,            #0xe417, lsl 16 \n\t"
+			"mov    v1.2d[0],       x16             \n\t"
+			"mov    x16,            #0x8014         \n\t"
+			"movk   x16,            #0x8f15, lsl 16 \n\t"
+			"mov    v0.2d[0],       x16             \n\t"
+			:::"x16","v0","v1");	/* v1 = 0xe417f38a, v0 = 0x8f158014 */
+		/* NOTE(review): the pmull asm in the loop reads v0/v1 loaded above, but no operand links the statements -- confirm the compiler cannot reuse v0/v1 in between */
+		while ((length -= 1024) >= 0) {
+			PREF1KL2(1024*3);
+			__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
+				:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
+			crc1 = 0;
+			crc2 = 0;
+			buffer += sizeof(uint64_t);
+
+			CRC32C7X3X8(0);
+			CRC32C7X3X8(1);
+			CRC32C7X3X8(2);
+			CRC32C7X3X8(3);
+			CRC32C7X3X8(4);
+			CRC32C7X3X8(5);
+
+			buffer += 42*3*sizeof(uint64_t);
+			PREF1KL1(1024);
+			__asm__("mov            v2.2d[0],       %x[c1]          \n\t"
+				"pmull          v2.1q,          v2.1d,  v0.1d   \n\t"
+				"mov            v3.2d[0],       %x[c0]          \n\t"
+				"pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
+				"crc32cx        %w[c],          %w[c2], %x[v]   \n\t"
+				"mov            %x[c1],         v2.2d[0]        \n\t"
+				"crc32cx        %w[c1],         wzr,    %x[c1]  \n\t"
+				"eor            %w[c],          %w[c],  %w[c1]  \n\t"
+				"mov            %x[c0],         v3.2d[0]        \n\t"
+				"crc32cx        %w[c0],         wzr,    %x[c0]  \n\t"
+				"eor            %w[c],          %w[c],  %w[c0]  \n\t"
+				:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
+				:[v]"r"(*((const uint64_t *)buffer))
+				:"v0","v1","v2","v3");
+			buffer += sizeof(uint64_t);
+		}
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+		/* the loop exits with length negative; adding 1024 back gives the leftover byte count */
+		if(!(length += 1024))
+			return crc;
+	        }
+#endif /* HAVE_ARMV8_CRYPTO */
+		while ((length -= sizeof(uint64_t)) >= 0) {
+			CRC32CX(crc, *(const uint64_t *)buffer);
+			buffer += sizeof(uint64_t);
+		}
+		/* length is now (leftover - 8); its low three bits still equal the leftover byte count */
+		/* The following is more efficient than the straight loop */
+		if (length & sizeof(uint32_t)) {
+			CRC32CW(crc, *(const uint32_t *)buffer);
+			buffer += sizeof(uint32_t);
+		}
+		if (length & sizeof(uint16_t)) {
+			CRC32CH(crc, *(const uint16_t *)buffer);
+			buffer += sizeof(uint16_t);
+		}
+		if (length & sizeof(uint8_t))
+			CRC32CB(crc, *buffer);
+	} else {	/* NULL buffer: same steps with zero as the data */
+#ifdef HAVE_ARMV8_CRYPTO
+	        if (ceph_arch_aarch64_pmull) {
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+		const poly64_t k1 = 0xe417f38a;
+		uint64_t t0;
+
+		while ((length -= 1024) >= 0) {
+			crc0 = __crc32cd(crc, 0);
+			/* only crc0 is carried through the block; crc1 and crc2 are unused on this path */
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+
+			/* Merge crc0 into crc: crc0 multiply by K1 */
+
+			t0 = (uint64_t)vmull_p64(crc0, k1);
+			crc = __crc32cd(0, t0);
+		}
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+		__asm__("mov    x16,            #0xf38a         \n\t"
+			"movk   x16,            #0xe417, lsl 16 \n\t"
+			"mov    v1.2d[0],       x16             \n\t"
+			:::"x16","v1");	/* v1 = 0xe417f38a */
+
+		while ((length -= 1024) >= 0) {
+			__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
+				:[c0]"=r"(crc0):[c]"r"(crc));
+
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+
+			__asm__("mov            v3.2d[0],       %x[c0]          \n\t"
+				"pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
+				"mov            %x[c0],         v3.2d[0]        \n\t"
+				"crc32cx        %w[c],          wzr,    %x[c0]  \n\t"
+				:[c]"=r"(crc), [c0]"+r"(crc0)	/* the asm overwrites c0, so it is a read-write operand */
+				:
+				:"v1","v3");
+		}
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+		if(!(length += 1024))
+			return crc;
+	        }
+#endif /* HAVE_ARMV8_CRYPTO */
+		while ((length -= sizeof(uint64_t)) >= 0)
+			CRC32CX(crc, 0);
+
+		/* The following is more efficient than the straight loop */
+		if (length & sizeof(uint32_t))
+			CRC32CW(crc, 0);
+
+		if (length & sizeof(uint16_t))
+			CRC32CH(crc, 0);
+
+		if (length & sizeof(uint8_t))
+			CRC32CB(crc, 0);
+	}
+	return crc;
+}
diff --git a/src/common/crc32c_aarch64.h b/src/common/crc32c_aarch64.h
new file mode 100644
index 000000000..51f0542fe
--- /dev/null
+++ b/src/common/crc32c_aarch64.h
@@ -0,0 +1,28 @@
+#ifndef CEPH_COMMON_CRC32C_AARCH64_H
+#define CEPH_COMMON_CRC32C_AARCH64_H
+
+#include "acconfig.h"
+#include "arch/arm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_ARMV8_CRC
+
+extern uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#else
+/* Built without HAVE_ARMV8_CRC: stub that ignores its arguments and always returns 0. */
+static inline uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	return 0;
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_intel_baseline.c b/src/common/crc32c_intel_baseline.c
new file mode 100644
index 000000000..2862f6272
--- /dev/null
+++ b/src/common/crc32c_intel_baseline.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2012-2013 Intel Corporation All Rights Reserved.
+ * All rights reserved.
+ *
+ * http://opensource.org/licenses/BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * * Neither the name of the Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "include/int_types.h"
+
+#define MAX_ITER	8
+/* 256-entry lookup table; ceph_crc32c_intel_baseline indexes it with (low crc byte XOR data byte). */
+unsigned long crc32_table_iscsi_base[256] = {
+	0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 
+	0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, 
+	0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 
+	0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, 
+	0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 
+	0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, 
+	0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 
+	0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, 
+	0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 
+	0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, 
+	0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 
+	0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, 
+	0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 
+	0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, 
+	0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 
+	0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, 
+	0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 
+	0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, 
+	0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 
+	0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, 
+	0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 
+	0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, 
+	0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 
+	0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, 
+	0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 
+	0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, 
+	0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 
+	0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, 
+	0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 
+	0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, 
+	0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 
+	0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, 
+	0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 
+	0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, 
+	0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 
+	0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, 
+	0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 
+	0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, 
+	0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 
+	0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, 
+	0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 
+	0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, 
+	0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 
+	0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, 
+	0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 
+	0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, 
+	0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 
+	0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, 
+	0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 
+	0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, 
+	0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 
+	0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, 
+	0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 
+	0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, 
+	0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 
+	0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, 
+	0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 
+	0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, 
+	0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 
+	0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, 
+	0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 
+	0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, 
+	0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 
+	0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351, 
+};
+
+
+/* iSCSI CRC baseline: byte-at-a-time lookup in crc32_table_iscsi_base. */
+uint32_t ceph_crc32c_intel_baseline(uint32_t crc_init2, unsigned char const *buffer, unsigned len)
+{
+	/*
+	 * crc_init2 is the running crc to continue from, buffer the data
+	 * (may be NULL), len the byte count.  The updated crc is returned.
+	 */
+	unsigned int acc = crc_init2;
+	unsigned int remaining = len;
+
+	if (!buffer) {
+		/* No data pointer: run len steps without a data byte, so the
+		 * table index is just the low byte of the running crc. */
+		for (; remaining != 0; --remaining)
+			acc = (acc >> 8) ^ crc32_table_iscsi_base[acc & 0x000000FF];
+		return acc;
+	}
+
+	/* Normal case: one lookup per input byte, indexed by the low crc
+	 * byte XORed with that data byte. */
+	for (unsigned int idx = 0; idx < remaining; ++idx)
+		acc = (acc >> 8) ^ crc32_table_iscsi_base[(acc & 0x000000FF) ^ buffer[idx]];
+
+	return acc;
+}
diff --git a/src/common/crc32c_intel_baseline.h b/src/common/crc32c_intel_baseline.h
new file mode 100644
index 000000000..e463575e2
--- /dev/null
+++ b/src/common/crc32c_intel_baseline.h
@@ -0,0 +1,16 @@
+#ifndef CEPH_COMMON_CRC32C_INTEL_BASELINE_H
+#define CEPH_COMMON_CRC32C_INTEL_BASELINE_H
+
+#include "include/int_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t ceph_crc32c_intel_baseline(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c
new file mode 100644
index 000000000..28bd93416
--- /dev/null
+++ b/src/common/crc32c_intel_fast.c
@@ -0,0 +1,51 @@
+#include "acconfig.h"
+#include "common/crc32c_intel_baseline.h"
+
+extern unsigned int crc32_iscsi_00(unsigned char const *buffer, uint64_t len, uint64_t crc) asm("crc32_iscsi_00");
+extern unsigned int crc32_iscsi_zero_00(unsigned char const *buffer, uint64_t len, uint64_t crc) asm("crc32_iscsi_zero_00");
+
+#ifdef HAVE_NASM_X64
+
+uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	/* A NULL buffer is handed to the dedicated zero-data routine. */
+	if (!buffer)
+		return crc32_iscsi_zero_00(buffer, len, crc);
+
+	/*
+	 * crc32_iscsi_00 works in full words and would read beyond
+	 * buffer+len, which valgrind complains about.  Avoid that: short
+	 * inputs go entirely through the byte-wise baseline, and longer
+	 * ones are trimmed so the assembly stops where (buffer + len) is
+	 * 8-byte aligned, with the baseline finishing the last bytes.
+	 */
+	if (len < 16)
+		return ceph_crc32c_intel_baseline(crc, buffer, len);
+
+	const unsigned tail = ((unsigned long)buffer + len) & 7;
+	const unsigned bulk = len - tail;
+	const uint32_t head_crc = crc32_iscsi_00(buffer, bulk, crc);
+
+	if (tail == 0)
+		return head_crc;
+	return ceph_crc32c_intel_baseline(head_crc, buffer + bulk, tail);
+}
+/* HAVE_NASM_X64 build: reports (1) that the assembly-backed ceph_crc32c_intel_fast is compiled in. */
+int ceph_crc32c_intel_fast_exists(void)
+{
+	return 1;
+}
+
+#else
+/* Build without HAVE_NASM_X64: reports (0) that no fast implementation is compiled in. */
+int ceph_crc32c_intel_fast_exists(void)
+{
+	return 0;
+}
+/* Placeholder for builds without HAVE_NASM_X64: ignores its arguments and returns 0, which is not a checksum. */
+uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	return 0;
+}
+
+#endif
diff --git a/src/common/crc32c_intel_fast.h b/src/common/crc32c_intel_fast.h
new file mode 100644
index 000000000..26a444f60
--- /dev/null
+++ b/src/common/crc32c_intel_fast.h
@@ -0,0 +1,28 @@
+#ifndef CEPH_COMMON_CRC32C_INTEL_FAST_H
+#define CEPH_COMMON_CRC32C_INTEL_FAST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* is the fast version compiled in */
+extern int ceph_crc32c_intel_fast_exists(void);
+
+#ifdef __x86_64__
+
+extern uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#else
+/* Not __x86_64__: inline stub that ignores its arguments and always returns 0. */
+static inline uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	return 0;
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_intel_fast_zero_asm.s b/src/common/crc32c_intel_fast_zero_asm.s
new file mode 100644
index 000000000..216ecf639
--- /dev/null
+++ b/src/common/crc32c_intel_fast_zero_asm.s
@@ -0,0 +1,657 @@
+;
+; Copyright 2012-2013 Intel Corporation All Rights Reserved.
+; All rights reserved.
+;
+; http://opensource.org/licenses/BSD-3-Clause
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following
+; conditions are met:
+;
+; * Redistributions of source code must retain the above copyright
+;   notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+;   notice, this list of conditions and the following disclaimer in
+;   the documentation and/or other materials provided with the
+;   distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+;   contributors may be used to endorse or promote products derived
+;   from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+; FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+; COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+; STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+; OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+
+; Function to compute iscsi CRC32 with table-based recombination
+; crc done "by 3" with block sizes 1920, 960, 480, 240
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+default rel
+
+; crcB3 MACRO to implement crc32 on 3 %%bSize-byte blocks; every step is fed the bufptmp register, not memory
+%macro  crcB3 3
+%define %%bSize   %1    ; 1/3 of buffer size
+%define %%td2     %2    ; table offset for crc0 (2/3 of buffer)
+%define %%td1     %3    ; table offset for crc1 (1/3 of buffer)
+
+%IF %%bSize=640
+	sub     len, %%bSize*3
+	js      %%crcB3_end           ;; jump to next level if 3*blockSize > len
+%ELSE
+	cmp     len, %%bSize*3
+	jnae    %%crcB3_end           ;; jump to next level if 3*blockSize > len
+%ENDIF
+	;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;;
+%%crcB3_loop:
+					;; rax = crc0 = initial crc
+	xor     rbx, rbx                ;; rbx = crc1 = 0;
+	xor     r10, r10                ;; r10 = crc2 = 0;
+	; NOTE(review): rbx/r10 start at 0 and only absorb bufptmp; if that is always 0 they look like they stay 0 -- confirm these two streams are needed
+ %assign i 0
+ %rep %%bSize/8 - 1
+	crc32   rax, bufptmp  ;; update crc0
+	crc32   rbx, bufptmp  ;; update crc1
+	crc32   r10, bufptmp  ;; update crc2
+	%assign i (i+8)
+ %endrep
+	crc32   rax, bufptmp  ;; update crc0
+	crc32   rbx, bufptmp  ;; update crc1
+; SKIP  ;crc32  r10, bufptmp  ;; update crc2
+	; crc2 is one step short on purpose: its last step is the crc32 r10, r9 after the merge
+	; merge in crc0: one table lookup per byte of eax, shifted by 0/8/16/24 bits and XORed into r9
+	movzx   bufp_dw, al
+	mov     r9d, [crc_init + bufp*4 + %%td2]
+	movzx   bufp_dw, ah
+	shr     eax, 16
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	shl     r11, 8
+	xor     r9, r11
+
+	movzx   bufp_dw, al
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	movzx   bufp_dw, ah
+	shl     r11, 16
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	shl     r11, 24
+	xor     r9, r11
+
+	; merge in crc1: the same four lookups on ebx, using the td1 table offset
+
+	movzx   bufp_dw, bl
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	movzx   bufp_dw, bh
+	shr     ebx, 16
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	shl     r11, 8
+	xor     r9, r11
+
+	movzx   bufp_dw, bl
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	movzx   bufp_dw, bh
+	shl     r11, 16
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	shl     r11, 24
+	xor     r9, r11
+
+	; xor     r9, [bufptmp+i + 2*%%bSize]
+	crc32   r10, r9
+	mov     rax, r10
+
+	; add     bufptmp, %%bSize*3      ;; move to next block
+	sub     len, %%bSize*3
+%IF %%bSize=640
+	jns     %%crcB3_loop
+%ENDIF
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%crcB3_end:
+%IF %%bSize=640
+	add     len, %%bSize*3
+%ENDIF
+	je      do_return               ;; return if remaining data is zero
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define ABI_IS_AMD64
+%elifidn __OUTPUT_FORMAT__, macho64
+%define ABI_IS_AMD64
+%endif
+
+;;; ISCSI CRC 32 Implementation with crc32 Instruction
+
+;;; unsigned int crc32_iscsi_zero_00(unsigned char * buffer, uint64_t len, uint64_t crc_init);
+;;; Without ABI_IS_AMD64 (with it: bufp = rdi, len = rsi, crc_init = rdx):
+;;;        *buf = rcx
+;;;         len = rdx
+;;;    crc_init = r8
+;;; The buffer pointer is never dereferenced: every crc32 step is fed a zeroed register.
+
+global  crc32_iscsi_zero_00:function
+crc32_iscsi_zero_00:
+
+%ifdef ABI_IS_AMD64
+%define bufp            rdi
+%define bufp_dw         edi
+%define bufp_w          di
+%define bufp_b          dil
+%define bufptmp         rcx
+%define block_0         rcx
+%define block_1         r8
+%define block_2         r11
+%define len             rsi
+%define len_dw          esi
+%define len_w           si
+%define len_b           sil
+%define crc_init        rdx
+%define crc_init_dw     edx
+%else
+%define bufp            rcx
+%define bufp_dw         ecx
+%define bufp_w          cx
+%define bufp_b          cl
+%define bufptmp         rdi
+%define block_0         rdi
+%define block_1         rsi
+%define block_2         r11
+%define len             rdx
+%define len_dw          edx
+%define len_w           dx
+%define len_b           dl
+%define crc_init        r8
+%define crc_init_dw     r8d
+%endif
+
+
+	push    rdi
+	push    rbx
+
+	mov     rax, crc_init           ;; rax = crc_init;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; no need for alignment: no data is read, bufptmp is zeroed and used as the data word
+	xor bufptmp, bufptmp
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+proc_block:
+	cmp     len, 240
+	jb      bit8
+
+	lea     crc_init, [mul_table_72]  ;; load table base address (the crc itself is already in rax)
+
+	crcB3   640, 0x1000, 0x0c00     ; 640*3 = 1920 (Tables 1280, 640)
+	crcB3   320, 0x0c00, 0x0800     ; 320*3 =  960 (Tables  640, 320)
+	crcB3   160, 0x0800, 0x0400     ; 160*3 =  480 (Tables  320, 160)
+	crcB3    80, 0x0400, 0x0000     ;  80*3 =  240 (Tables  160,  80)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (the low 8 bits of len hold the count)
+
+bit8:
+	shl     len_b, 1                ;; shift-out MSB (bit-7)
+	jnc     bit7                    ;; skip to bit7 if bit-7 == 0
+ %assign i 0
+ %rep 16
+	crc32   rax, bufptmp        ;; compute crc32 of 8-byte data
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+
+bit7:
+	shl     len_b, 1                ;; shift-out MSB (bit-6)
+	jnc     bit6                    ;; skip to bit6 if bit-6 == 0
+ %assign i 0
+ %rep 8
+	crc32   rax, bufptmp        ;; compute crc32 of 8-byte data
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	; add     bufptmp, 64             ;; buf +=64; (next 64 bytes)
+bit6:
+	shl     len_b, 1                ;; shift-out MSB (bit-5)
+	jnc     bit5                    ;; skip to bit5 if bit-5 == 0
+ %assign i 0
+ %rep 4
+	crc32   rax, bufptmp        ;;    compute crc32 of 8-byte data
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	; add     bufptmp, 32             ;; buf +=32; (next 32 bytes)
+bit5:
+	shl     len_b, 1                ;; shift-out MSB (bit-4)
+	jnc     bit4                    ;; skip to bit4 if bit-4 == 0
+ %assign i 0
+ %rep 2
+	crc32   rax, bufptmp        ;;    compute crc32 of 8-byte data
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	; add     bufptmp, 16             ;; buf +=16; (next 16 bytes)
+bit4:
+	shl     len_b, 1                ;; shift-out MSB (bit-3)
+	jnc     bit3                    ;; skip to bit3 if bit-3 == 0
+	crc32   rax, bufptmp          ;; compute crc32 of 8-byte data
+	je      do_return               ;; return if remaining data is zero
+	; add     bufptmp, 8              ;; buf +=8; (next 8 bytes)
+bit3:
+	mov     rbx, bufptmp          ;; rbx = bufptmp (zeroed above): stands in for the last data bytes
+	shl     len_b, 1                ;; shift-out MSB (bit-2)
+	jnc     bit2                    ;; skip to bit2 if bit-2 == 0
+	crc32   eax, ebx                ;; compute crc32 of 4-byte data
+	je      do_return               ;; return if remaining data is zero
+	shr     rbx, 32                 ;; get next 3 bytes
+bit2:
+	shl     len_b, 1                ;; shift-out MSB (bit-1)
+	jnc     bit1                    ;; skip to bit1 if bit-1 == 0
+	crc32   eax, bx                 ;; compute crc32 of 2-byte data
+	je      do_return               ;; return if remaining data is zero
+	shr     rbx, 16                 ;; next byte
+bit1:
+	test    len_b,len_b
+	je      do_return
+	crc32   eax, bl                 ;; compute crc32 of 1-byte data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+do_return:
+
+	pop     rbx
+	pop     rdi
+	ret
+; NOTE(review): nothing visible in this routine jumps to less_than_8 -- it looks like unreachable leftover code; confirm before relying on it
+less_than_8:
+	xor bufp, bufp
+	test    len,4
+	jz      less_than_4
+	crc32   eax, bufp_dw
+	add     bufptmp,4
+less_than_4:
+	test    len,2
+	jz      less_than_2
+	crc32   eax, bufp_w
+	add     bufptmp,2
+less_than_2:
+	test    len,1
+	jz      do_return
+	crc32   rax, bufp_b
+	pop     rbx
+	pop     bufptmp
+	ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272
+
+section .data
+align   8
+mul_table_72:	; 256 dwords (0x400 bytes); crcB3 reads [mul_table_72 + byte*4 + table offset]
+DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba
+DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2
+DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb
+DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3
+DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9
+DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91
+DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788
+DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0
+DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad
+DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5
+DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec
+DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4
+DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de
+DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86
+DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f
+DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7
+DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394
+DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc
+DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5
+DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d
+DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7
+DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf
+DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6
+DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe
+DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183
+DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb
+DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2
+DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a
+DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0
+DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8
+DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1
+DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9
+DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6
+DD 0x68828204,0x51513092,0x1b25e728,0x22f655be
+DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7
+DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff
+DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95
+DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd
+DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4
+DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c
+DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1
+DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9
+DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0
+DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8
+DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82
+DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da
+DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3
+DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b
+DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8
+DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190
+DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989
+DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1
+DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb
+DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3
+DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa
+DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2
+DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df
+DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387
+DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e
+DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6
+DD 0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac
+DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4
+DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed
+DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5
+
+mul_table_152:
+DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118
+DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666
+DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4
+DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a
+DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0
+DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e
+DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c
+DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562
+DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8
+DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96
+DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414
+DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a
+DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710
+DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e
+DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec
+DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92
+DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009
+DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777
+DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5
+DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b
+DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1
+DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f
+DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d
+DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473
+DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9
+DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87
+DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505
+DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b
+DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601
+DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f
+DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd
+DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83
+DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a
+DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444
+DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6
+DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8
+DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2
+DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc
+DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e
+DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740
+DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca
+DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4
+DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636
+DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148
+DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532
+DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c
+DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce
+DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0
+DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b
+DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555
+DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7
+DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9
+DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3
+DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad
+DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f
+DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651
+DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db
+DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5
+DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727
+DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059
+DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423
+DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d
+DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf
+DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1
+
+mul_table_312:	; 256 dwords (64 DD rows of 4), first entry 0; presumably indexed by a byte value - confirm with the lookup code outside this view
+DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c
+DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972
+DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791
+DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f
+DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57
+DD 0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259
+DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba
+DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4
+DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db
+DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5
+DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736
+DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38
+DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0
+DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe
+DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d
+DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413
+DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032
+DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c
+DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df
+DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1
+DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19
+DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317
+DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4
+DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa
+DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095
+DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b
+DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678
+DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76
+DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe
+DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0
+DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53
+DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d
+DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0
+DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee
+DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d
+DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03
+DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb
+DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5
+DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26
+DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628
+DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347
+DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49
+DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa
+DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4
+DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c
+DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062
+DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81
+DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f
+DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae
+DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0
+DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443
+DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d
+DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985
+DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b
+DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68
+DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766
+DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209
+DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07
+DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4
+DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea
+DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922
+DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c
+DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf
+DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1
+
+mul_table_632:	; 256 dwords (64 DD rows of 4), first entry 0; presumably indexed by a byte value - confirm with the lookup code outside this view
+DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6
+DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef
+DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655
+DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c
+DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0
+DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9
+DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53
+DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a
+DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b
+DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412
+DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8
+DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291
+DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d
+DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914
+DD 0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae
+DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97
+DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c
+DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115
+DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf
+DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796
+DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a
+DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13
+DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9
+DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90
+DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1
+DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8
+DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352
+DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b
+DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7
+DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee
+DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54
+DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d
+DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3
+DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea
+DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350
+DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69
+DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5
+DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec
+DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56
+DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f
+DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e
+DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117
+DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad
+DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794
+DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428
+DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11
+DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab
+DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92
+DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29
+DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410
+DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa
+DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293
+DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f
+DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916
+DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac
+DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95
+DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4
+DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed
+DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657
+DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e
+DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2
+DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb
+DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51
+DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368
+
+mul_table_1272:	; 256 dwords (64 DD rows of 4), first entry 0; presumably indexed by a byte value - confirm with the lookup code outside this view
+DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c
+DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3
+DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2
+DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d
+DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31
+DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece
+DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf
+DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530
+DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7
+DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28
+DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529
+DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6
+DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda
+DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25
+DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424
+DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db
+DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b
+DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4
+DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5
+DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a
+DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416
+DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9
+DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8
+DD 0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17
+DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0
+DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f
+DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e
+DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1
+DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd
+DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502
+DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03
+DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc
+DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283
+DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c
+DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d
+DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82
+DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e
+DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671
+DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870
+DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f
+DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668
+DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397
+DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96
+DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869
+DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765
+DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a
+DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b
+DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964
+DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4
+DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b
+DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a
+DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5
+DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9
+DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956
+DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757
+DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8
+DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f
+DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0
+DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1
+DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e
+DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842
+DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd
+DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc
+DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643
+
+%macro slversion 4
+global %1_slver_%2%3%4	; slversion func, core, ver, snum: export two labels for one 4-byte version record
+global %1_slver	; short alias without the numeric suffix
+%1_slver:	; both labels mark the same address
+%1_slver_%2%3%4:
+	dw 0x%4	; 16-bit word: snum, read as hex digits
+	db 0x%3, 0x%2	; followed by two bytes: ver, then core (each read as hex digits)
+%endmacro
+;;;       func            core, ver, snum
+slversion crc32_iscsi_zero_00, 00,   02,  0014
+%ifidn __OUTPUT_FORMAT__, elf64
+; inform linker that this doesn't require executable stack
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/common/crc32c_ppc.c b/src/common/crc32c_ppc.c
new file mode 100644
index 000000000..52fd1c4ee
--- /dev/null
+++ b/src/common/crc32c_ppc.c
@@ -0,0 +1,148 @@
+/* Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#define CRC_TABLE
+#define FAST_ZERO_TABLE
+
+#include "acconfig.h"
+#include "include/int_types.h"
+#include "crc32c_ppc_constants.h"
+#include "reverse.h"
+
+#include <stdlib.h>
+#include <strings.h>
+
+#define VMX_ALIGN	16
+#define VMX_ALIGN_MASK	(VMX_ALIGN-1)
+
+#ifdef HAVE_PPC64LE
+#ifdef REFLECT /* reflected form: index with the low byte, shift right */
+static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
+                                unsigned long len)
+{
+  for (; len != 0; --len, ++p) /* fold one input byte per step via crc_table */
+    crc = (crc >> 8) ^ crc_table[(crc ^ *p) & 0xff];
+  return crc;
+}
+#else /* non-reflected form: index with the high byte, shift left */
+static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
+                                unsigned long len)
+{
+  for (; len != 0; --len, ++p) /* fold one input byte per step via crc_table */
+    crc = (crc << 8) ^ crc_table[((crc >> 24) ^ *p) & 0xff];
+  return crc;
+}
+#endif
+
+static inline unsigned long polynomial_multiply(unsigned int a, unsigned int b) {
+        vector unsigned int va = {a, 0, 0, 0}; /* a in element 0, remaining elements zero */
+        vector unsigned int vb = {b, 0, 0, 0}; /* b in element 0, remaining elements zero */
+        vector unsigned long vt;
+        /* Polynomial-multiply va by vb with vpmsumw; "v" keeps inputs and result in vector registers. */
+        __asm__("vpmsumw %0,%1,%2" : "=v"(vt) : "v"(va), "v"(vb));
+        /* Hand back element 0 of the result; gf_multiply() feeds it to barrett_reduction(). */
+        return vt[0];
+}
+
+unsigned int barrett_reduction(unsigned long val); /* declared here; defined outside this file */
+/* Multiply a and b with polynomial_multiply(), then reduce the product with barrett_reduction(). */
+static inline unsigned int gf_multiply(unsigned int a, unsigned int b) {
+        return barrett_reduction(polynomial_multiply(a, b));
+}
+
+/* Advance crc over `length` zeros (ceph_crc32c_ppc passes its byte count): for
+ * each set bit i of length, crc = gf_multiply(crc, crc_zero[i]). i is unchecked. */
+unsigned int append_zeros(unsigned int crc, unsigned long length) {
+        unsigned long bit;
+
+        for (bit = 0; length != 0; bit++, length >>= 1) {
+                if ((length & 1) == 0)
+                        continue;
+                crc = gf_multiply(crc, crc_zero[bit]);
+        }
+
+        return crc;
+}
+
+
+unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
+                            unsigned long len);
+
+/* Checksum len bytes at data: bytes before the first 16-byte boundary and
+ * after the last one go through crc32_align(), the aligned middle through
+ * __crc32_vpmsum(). With CRC_XOR, crc is inverted on entry and on exit. */
+static uint32_t crc32_vpmsum(uint32_t crc, unsigned char const *data,
+                             unsigned len)
+{
+  unsigned int head;   /* bytes needed to reach 16-byte alignment */
+  unsigned int body;   /* aligned length, a multiple of VMX_ALIGN */
+  unsigned int tail;   /* leftover bytes after the aligned part */
+#ifdef CRC_XOR
+  crc ^= 0xffffffff;
+#endif
+  if (len < VMX_ALIGN + VMX_ALIGN_MASK) {
+    /* Too short to be sure an aligned 16-byte chunk exists: all bytewise. */
+    crc = crc32_align(crc, data, (unsigned long)len);
+  } else {
+    head = (unsigned long)data & VMX_ALIGN_MASK;
+    if (head != 0) {
+      head = VMX_ALIGN - head;
+      crc = crc32_align(crc, data, head);
+      len -= head;
+      data += head;
+    }
+
+    body = len & ~VMX_ALIGN_MASK;
+    crc = __crc32_vpmsum(crc, data, (unsigned long)body);
+    tail = len & VMX_ALIGN_MASK;
+    if (tail != 0)
+      crc = crc32_align(crc, data + body, tail);
+  }
+
+#ifdef CRC_XOR
+  crc ^= 0xffffffff;
+#endif
+
+  return crc;
+}
+
+/* Public entry point. A non-NULL buffer is checksummed by crc32_vpmsum(),
+ * which does not gracefully handle a NULL data pointer; a NULL buffer is
+ * treated as `len` zeros and folded into crc by append_zeros().
+ */
+uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
+{
+  if (data) {
+    /* Handle the valid buffer case. */
+    return crc32_vpmsum(crc, data, len);
+  }
+
+  /* Handle the NULL buffer case: when REFLECT is defined, crc is passed
+   * through reverse_bits() before and after append_zeros(). */
+#ifdef REFLECT
+  crc = reverse_bits(crc);
+#endif
+  crc = append_zeros(crc, len);
+#ifdef REFLECT
+  crc = reverse_bits(crc);
+#endif
+  return crc;
+}
+
+#else /* HAVE_PPC64LE */
+
+/* Stub so the symbol exists on non-ppc architectures (and on legacy ppc
+ * systems using power7 or below); it is not expected to be called there.
+ * Ignores its arguments and always returns 0. */
+uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
+{
+  (void)crc; (void)data; (void)len; /* intentionally unused: avoids -Wunused-parameter */
+  return 0;
+}
+
+#endif /* HAVE_PPC64LE */
diff --git a/src/common/crc32c_ppc.h b/src/common/crc32c_ppc.h
new file mode 100644
index 000000000..18021638f
--- /dev/null
+++ b/src/common/crc32c_ppc.h
@@ -0,0 +1,22 @@
+/* Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef CEPH_COMMON_CRC32C_PPC_H
+#define CEPH_COMMON_CRC32C_PPC_H
+#include <stdint.h> /* uint32_t; keeps this header usable without prior includes */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* crc: running value; buffer may be NULL (handled as len zeros). Builds without HAVE_PPC64LE get a stub returning 0. */
+extern uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_ppc_asm.S b/src/common/crc32c_ppc_asm.S
new file mode 100644
index 000000000..096d98591
--- /dev/null
+++ b/src/common/crc32c_ppc_asm.S
@@ -0,0 +1,787 @@
+/*
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in these 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ *  a) the GNU General Public License as published by the Free Software
+ *     Foundation; either version 2 of the License, or (at your option)
+ *     any later version, or
+ *  b) the Apache License, Version 2.0
+ */
+
+#if defined (__clang__)
+#ifndef __ALTIVEC__
+#define __ALTIVEC__
+#endif
+#include "ppc-asm.h"
+#else
+#include <ppc-asm.h>
+#endif
+#include "ppc-opcode.h"
+
+#undef toc
+
+#ifndef r1
+#define r1 1
+#endif
+
+#ifndef r2
+#define r2 2
+#endif
+
+	.section	.rodata
+.balign 16
+
+.byteswap_constant:
+	/* byte reverse permute constant */
+	.octa 0x0F0E0D0C0B0A09080706050403020100
+
+#ifdef CRC32_CONSTANTS_HEADER
+#include CRC32_CONSTANTS_HEADER
+#else
+#include "crc32c_ppc_constants.h"
+#endif
+
+	.text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16		r25
+#define off32		r26
+#define off48		r27
+#define off64		r28
+#define off80		r29
+#define off96		r30
+#define off112		r31
+
+#define const1		v24
+#define const2		v25
+
+#define byteswap	v26
+#define	mask_32bit	v27
+#define	mask_64bit	v28
+#define zeroes		v29
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm	A, B, C, D
+#else
+#define VPERM(A, B, C, D)
+#endif
+
+#ifndef CRC32_FUNCTION_ASM
+#define CRC32_FUNCTION_ASM __crc32_vpmsum
+#endif
+
+/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(CRC32_FUNCTION_ASM)	/* entry: save the GPRs and vector registers this routine overwrites */
+	std	r31,-8(r1)	/* r25-r31 go to negative offsets from r1; .Lout reloads them */
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+	std	r26,-48(r1)
+	std	r25,-56(r1)
+
+	li	off16,16	/* off16..off112 (r25-r31): byte offsets used as lvx/stvx index registers */
+	li	off32,32
+	li	off48,48
+	li	off64,64
+	li	off80,80
+	li	off96,96
+	li	off112,112
+	li	r0,0		/* r0 = 0 on the first pass; set to 1 before branching back to label 1 */
+
+	/* Enough room for saving 10 non volatile VMX registers */
+	subi	r6,r1,56+10*16	/* r6 = r1 - 216: base for v20-v27 */
+	subi	r7,r1,56+2*16	/* r7 = r1 - 88 (= r6 + 128): base for v28-v29 */
+
+	stvx	v20,0,r6
+	stvx	v21,off16,r6
+	stvx	v22,off32,r6
+	stvx	v23,off48,r6
+	stvx	v24,off64,r6
+	stvx	v25,off80,r6
+	stvx	v26,off96,r6
+	stvx	v27,off112,r6
+	stvx	v28,0,r7
+	stvx	v29,off16,r7
+	mr	r10,r3
+
+	vxor	zeroes,zeroes,zeroes
+	vspltisw v0,-1
+
+	vsldoi	mask_32bit,zeroes,v0,4
+	vsldoi	mask_64bit,zeroes,v0,8
+
+	/* Get the initial value into v8 */
+	vxor	v8,v8,v8
+	MTVRD(v8, r3)
+#ifdef REFLECT
+	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+#else
+	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+	addis	r3,r2,.byteswap_constant@toc@ha
+	addi	r3,r3,.byteswap_constant@toc@l
+
+	lvx	byteswap,0,r3
+	addi	r3,r3,16
+#endif
+
+	cmpdi	r5,256
+	blt	.Lshort
+
+	rldicr	r6,r5,0,56	/* r6 = len (r5) with the low 7 bits cleared, i.e. a multiple of 128 */
+
+	/* Checksum in blocks of MAX_SIZE */
+1:	lis	r7,MAX_SIZE@h
+	ori	r7,r7,MAX_SIZE@l	/* r7 = MAX_SIZE */
+	mr	r9,r7		/* r9 = MAX_SIZE, kept for the constants offset below */
+	cmpd	r6,r7
+	bgt	2f
+	mr	r7,r6		/* r7 = min(r6, MAX_SIZE): bytes handled in this pass */
+2:	subf	r6,r7,r6	/* r6 = bytes left for later passes */
+
+	/* our main loop does 128 bytes at a time */
+	srdi	r7,r7,7		/* r7 = number of 128-byte blocks in this pass */
+
+	/*
+	 * Work out the offset into the constants table to start at. Each
+	 * constant is 16 bytes, and it is used against 128 bytes of input
+	 * data - 128 / 16 = 8
+	 */
+	sldi	r8,r7,4		/* r8 = blocks * 16 */
+	srdi	r9,r9,3		/* r9 = MAX_SIZE / 8 */
+	subf	r8,r8,r9	/* r8 = MAX_SIZE / 8 - blocks * 16 */
+
+	/* We reduce our final 128 bytes in a separate step */
+	addi	r7,r7,-1
+	mtctr	r7		/* CTR = blocks - 1 */
+
+	addis	r3,r2,.constants@toc@ha
+	addi	r3,r3,.constants@toc@l
+
+	/* Find the start of our constants */
+	add	r3,r3,r8
+
+	/* zero v0-v7 which will contain our checksums */
+	vxor	v0,v0,v0
+	vxor	v1,v1,v1
+	vxor	v2,v2,v2
+	vxor	v3,v3,v3
+	vxor	v4,v4,v4
+	vxor	v5,v5,v5
+	vxor	v6,v6,v6
+	vxor	v7,v7,v7
+
+	lvx	const1,0,r3
+
+	/*
+	 * If we are looping back to consume more data we use the values
+	 * already in v16-v23.
+	 */
+	cmpdi	r0,1
+	beq	2f
+
+	/* First warm up pass */
+	lvx	v16,0,r4
+	lvx	v17,off16,r4
+	VPERM(v16,v16,v16,byteswap)
+	VPERM(v17,v17,v17,byteswap)
+	lvx	v18,off32,r4
+	lvx	v19,off48,r4
+	VPERM(v18,v18,v18,byteswap)
+	VPERM(v19,v19,v19,byteswap)
+	lvx	v20,off64,r4
+	lvx	v21,off80,r4
+	VPERM(v20,v20,v20,byteswap)
+	VPERM(v21,v21,v21,byteswap)
+	lvx	v22,off96,r4
+	lvx	v23,off112,r4
+	VPERM(v22,v22,v22,byteswap)
+	VPERM(v23,v23,v23,byteswap)
+	addi	r4,r4,8*16
+
+	/* xor in initial value */
+	vxor	v16,v16,v8
+
+2:	bdz	.Lfirst_warm_up_done
+
+	addi	r3,r3,16
+	lvx	const2,0,r3
+
+	/* Second warm up pass */
+	VPMSUMD(v8,v16,const1)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v9,v17,const1)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v10,v18,const1)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v11,v19,const1)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdz	.Lfirst_cool_down
+
+	/*
+	 * main loop. We modulo schedule it such that it takes three iterations
+	 * to complete - first iteration load, second iteration vpmsum, third
+	 * iteration xor.
+	 */
+	.balign	16
+4:	lvx	const1,0,r3
+	addi	r3,r3,16
+	ori	r2,r2,0
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const2)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const2)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const2)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const2)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	lvx	const2,0,r3
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdnz	4b
+
+.Lfirst_cool_down:
+	/* First cool down pass */
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const1)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const1)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const1)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const1)
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	ori	r2,r2,0
+
+.Lsecond_cool_down:
+	/* Second cool down pass */
+	vxor	v0,v0,v8
+	vxor	v1,v1,v9
+	vxor	v2,v2,v10
+	vxor	v3,v3,v11
+	vxor	v4,v4,v12
+	vxor	v5,v5,v13
+	vxor	v6,v6,v14
+	vxor	v7,v7,v15
+
+#ifdef REFLECT
+	/*
+	 * vpmsumd produces a 96 bit result in the least significant bits
+	 * of the register. Since we are bit reflected we have to shift it
+	 * left 32 bits so it occupies the least significant bits in the
+	 * bit reflected domain.
+	 */
+	vsldoi	v0,v0,zeroes,4
+	vsldoi	v1,v1,zeroes,4
+	vsldoi	v2,v2,zeroes,4
+	vsldoi	v3,v3,zeroes,4
+	vsldoi	v4,v4,zeroes,4
+	vsldoi	v5,v5,zeroes,4
+	vsldoi	v6,v6,zeroes,4
+	vsldoi	v7,v7,zeroes,4
+#endif
+
+	/* xor with last 1024 bits */
+	lvx	v8,0,r4
+	lvx	v9,off16,r4
+	VPERM(v8,v8,v8,byteswap)
+	VPERM(v9,v9,v9,byteswap)
+	lvx	v10,off32,r4
+	lvx	v11,off48,r4
+	VPERM(v10,v10,v10,byteswap)
+	VPERM(v11,v11,v11,byteswap)
+	lvx	v12,off64,r4
+	lvx	v13,off80,r4
+	VPERM(v12,v12,v12,byteswap)
+	VPERM(v13,v13,v13,byteswap)
+	lvx	v14,off96,r4
+	lvx	v15,off112,r4
+	VPERM(v14,v14,v14,byteswap)
+	VPERM(v15,v15,v15,byteswap)
+
+	addi	r4,r4,8*16
+
+	vxor	v16,v0,v8
+	vxor	v17,v1,v9
+	vxor	v18,v2,v10
+	vxor	v19,v3,v11
+	vxor	v20,v4,v12
+	vxor	v21,v5,v13
+	vxor	v22,v6,v14
+	vxor	v23,v7,v15
+
+	li	r0,1
+	cmpdi	r6,0
+	addi	r6,r6,128
+	bne	1b
+
+	/* Work out how many bytes we have left */
+	andi.	r5,r5,127
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,128
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks are in the tail */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	/*
+	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
+	 * 32 bits to include the trailing 32 bits of zeros
+	 */
+	lvx	v0,0,r3
+	lvx	v1,off16,r3
+	lvx	v2,off32,r3
+	lvx	v3,off48,r3
+	lvx	v4,off64,r3
+	lvx	v5,off80,r3
+	lvx	v6,off96,r3
+	lvx	v7,off112,r3
+	addi	r3,r3,8*16
+
+	VPMSUMW(v0,v16,v0)
+	VPMSUMW(v1,v17,v1)
+	VPMSUMW(v2,v18,v2)
+	VPMSUMW(v3,v19,v3)
+	VPMSUMW(v4,v20,v4)
+	VPMSUMW(v5,v21,v5)
+	VPMSUMW(v6,v22,v6)
+	VPMSUMW(v7,v23,v7)
+
+	/* Now reduce the tail (0 - 112 bytes) */
+	cmpdi	r7,0
+	beq	1f
+
+	lvx	v16,0,r4
+	lvx	v17,0,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off32,r4
+	lvx	v17,off32,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off64,r4
+	lvx	v17,off64,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off96,r4
+	lvx	v17,off96,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+
+	/* Now xor all the parallel chunks together */
+1:	vxor	v0,v0,v1
+	vxor	v2,v2,v3
+	vxor	v4,v4,v5
+	vxor	v6,v6,v7
+
+	vxor	v0,v0,v2
+	vxor	v4,v4,v6
+
+	vxor	v0,v0,v4
+
+.Lbarrett_reduction:
+	/* Barrett constants */
+	addis	r3,r2,.barrett_constants@toc@ha
+	addi	r3,r3,.barrett_constants@toc@l
+
+	lvx	const1,0,r3
+	lvx	const2,off16,r3
+
+	vsldoi	v1,v0,v0,8
+	vxor	v0,v0,v1		/* xor two 64 bit results together */
+
+#ifdef REFLECT
+	/* shift left one bit */
+	vspltisb v1,1
+	vsl	v0,v0,v1
+#endif
+
+	vand	v0,v0,mask_64bit
+
+#ifndef REFLECT
+	/*
+	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v1,v0,const1)	/* ma */
+	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v1,v1,const2)	/* qn */
+	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
+#else
+	/*
+	 * The reflected version of Barrett reduction. Instead of bit
+	 * reflecting our data (which is expensive to do), we bit reflect our
+	 * constants and our algorithm, which means the intermediate data in
+	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
+	 * the algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
+	VPMSUMD(v1,v1,const1)		/* ma */
+	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
+	VPMSUMD(v1,v1,const2)		/* qn */
+	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in
+	 * the high 32 bits. We just need to shift it left 4 bytes
+	 * V0 [ 0 1 X 3 ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
+#endif
+
+	/* Get it into r3 */
+	MFVRD(r3, v0)
+
+.Lout:	/* common exit: the return value is already in r3 */
+	subi	r6,r1,56+10*16	/* same save-area addresses as computed on entry */
+	subi	r7,r1,56+2*16
+
+	lvx	v20,0,r6	/* vector restores come first: they index with off16..off112 (r25-r31), reloaded below */
+	lvx	v21,off16,r6
+	lvx	v22,off32,r6
+	lvx	v23,off48,r6
+	lvx	v24,off64,r6
+	lvx	v25,off80,r6
+	lvx	v26,off96,r6
+	lvx	v27,off112,r6
+	lvx	v28,0,r7
+	lvx	v29,off16,r7
+
+	ld	r31,-8(r1)	/* restore r25-r31 saved on entry */
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	ld	r26,-48(r1)
+	ld	r25,-56(r1)
+
+	blr
+
+.Lfirst_warm_up_done:	/* taken from the bdz at local label 2 when CTR reaches zero right after the first warm up pass */
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	VPMSUMD(v8,v16,const1)	/* multiply the eight loaded vectors v16-v23 by const1 into v8-v15 */
+	VPMSUMD(v9,v17,const1)
+	VPMSUMD(v10,v18,const1)
+	VPMSUMD(v11,v19,const1)
+	VPMSUMD(v12,v20,const1)
+	VPMSUMD(v13,v21,const1)
+	VPMSUMD(v14,v22,const1)
+	VPMSUMD(v15,v23,const1)
+
+	b	.Lsecond_cool_down	/* skip the main loop and first cool down; v8-v15 are xored into v0-v7 there */
+
+.Lshort:	/* reached when len (r5) is below 256 */
+	cmpdi	r5,0
+	beq	.Lzero		/* len == 0: nothing to checksum */
+
+	addis	r3,r2,.short_constants@toc@ha
+	addi	r3,r3,.short_constants@toc@l
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,256	/* r6 = 256 - len */
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks? */
+	srdi	r7,r5,4
+	mtctr	r7		/* CTR = len / 16; each bdz below consumes one chunk */
+
+	vxor	v19,v19,v19	/* clear the two accumulators used by the .Lv15-.Lv0 chain */
+	vxor	v20,v20,v20
+
+	lvx	v0,0,r4		/* first 16 bytes of data */
+	lvx	v16,0,r3	/* matching constant */
+	VPERM(v0,v0,v16,byteswap)
+	vxor	v0,v0,v8	/* xor in initial value */
+	VPMSUMW(v0,v0,v16)
+	bdz	.Lv0		/* only one chunk: fold v0 and finish */
+
+	lvx	v1,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v1,v1,v17,byteswap)
+	VPMSUMW(v1,v1,v17)
+	bdz	.Lv1
+
+	lvx	v2,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v2,v2,v16,byteswap)
+	VPMSUMW(v2,v2,v16)
+	bdz	.Lv2
+
+	lvx	v3,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v3,v3,v17,byteswap)
+	VPMSUMW(v3,v3,v17)
+	bdz	.Lv3
+
+	lvx	v4,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v4,v4,v16,byteswap)
+	VPMSUMW(v4,v4,v16)
+	bdz	.Lv4
+
+	lvx	v5,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v5,v5,v17,byteswap)
+	VPMSUMW(v5,v5,v17)
+	bdz	.Lv5
+
+	lvx	v6,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v6,v6,v16,byteswap)
+	VPMSUMW(v6,v6,v16)
+	bdz	.Lv6
+
+	lvx	v7,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v7,v7,v17,byteswap)
+	VPMSUMW(v7,v7,v17)
+	bdz	.Lv7
+
+	addi	r3,r3,128
+	addi	r4,r4,128
+
+	lvx	v8,0,r4
+	lvx	v16,0,r3
+	VPERM(v8,v8,v16,byteswap)
+	VPMSUMW(v8,v8,v16)
+	bdz	.Lv8
+
+	lvx	v9,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v9,v9,v17,byteswap)
+	VPMSUMW(v9,v9,v17)
+	bdz	.Lv9
+
+	lvx	v10,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v10,v10,v16,byteswap)
+	VPMSUMW(v10,v10,v16)
+	bdz	.Lv10
+
+	lvx	v11,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v11,v11,v17,byteswap)
+	VPMSUMW(v11,v11,v17)
+	bdz	.Lv11
+
+	lvx	v12,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v12,v12,v16,byteswap)
+	VPMSUMW(v12,v12,v16)
+	bdz	.Lv12
+
+	lvx	v13,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v13,v13,v17,byteswap)
+	VPMSUMW(v13,v13,v17)
+	bdz	.Lv13
+
+	lvx	v14,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v14,v14,v16,byteswap)
+	VPMSUMW(v14,v14,v16)
+	bdz	.Lv14
+
+	lvx	v15,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v15,v15,v17,byteswap)
+	VPMSUMW(v15,v15,v17)
+
+.Lv15:	vxor	v19,v19,v15	/* fall-through chain: entering at .LvN folds vN down to v0 */
+.Lv14:	vxor	v20,v20,v14	/* odd-numbered products go to v19, even-numbered to v20 */
+.Lv13:	vxor	v19,v19,v13
+.Lv12:	vxor	v20,v20,v12
+.Lv11:	vxor	v19,v19,v11
+.Lv10:	vxor	v20,v20,v10
+.Lv9:	vxor	v19,v19,v9
+.Lv8:	vxor	v20,v20,v8
+.Lv7:	vxor	v19,v19,v7
+.Lv6:	vxor	v20,v20,v6
+.Lv5:	vxor	v19,v19,v5
+.Lv4:	vxor	v20,v20,v4
+.Lv3:	vxor	v19,v19,v3
+.Lv2:	vxor	v20,v20,v2
+.Lv1:	vxor	v19,v19,v1
+.Lv0:	vxor	v20,v20,v0
+
+	vxor	v0,v19,v20	/* combine both accumulators into v0 for the final reduction */
+
+	b	.Lbarrett_reduction
+
+.Lzero:	/* len == 0 */
+	mr	r3,r10		/* return the incoming crc, saved in r10 on entry */
+	b	.Lout
+
+FUNC_END(CRC32_FUNCTION_ASM)
diff --git a/src/common/crc32c_ppc_constants.h b/src/common/crc32c_ppc_constants.h
new file mode 100644
index 000000000..12a1e1d51
--- /dev/null
+++ b/src/common/crc32c_ppc_constants.h
@@ -0,0 +1,979 @@
+/* Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#define CRC 0x1edc6f41
+#define REFLECT
+
+#ifndef __ASSEMBLY__
+#ifdef CRC_TABLE
+static const unsigned int crc_table[] = {	/* 256 entries (64 rows x 4); presumably the byte-indexed table for CRC/REFLECT above -- TODO confirm at the use site */
+	0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+	0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+	0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+	0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+	0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+	0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+	0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+	0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+	0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+	0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+	0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+	0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+	0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+	0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+	0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+	0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+	0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+	0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+	0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+	0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+	0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+	0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+	0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+	0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+	0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+	0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+	0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+	0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+	0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+	0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+	0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+	0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+	0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+	0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+	0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+	0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+	0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+	0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+	0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+	0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+	0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+	0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+	0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+	0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+	0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+	0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+	0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+	0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+	0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+	0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+	0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+	0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+	0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+	0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+	0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+	0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+	0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+	0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+	0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+	0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+	0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+	0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+	0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+	0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,};
+
+#endif
+
+#ifdef FAST_ZERO_TABLE
+/* fast zero table: entries 0..2 are x^8, x^16 and x^32 mod p(x) (= CRC); presumably entry i is x^(8 * 2^i) mod p(x) -- TODO confirm */
+unsigned int crc_zero[] = {	/* NOTE(review): non-static and non-const, unlike crc_table; every TU that defines FAST_ZERO_TABLE gets its own external definition -- confirm only one does */
+	0x100,
+	0x10000,
+	0x1edc6f41,
+	0x3aab4576,
+	0x18571d18,
+	0x59a3508a,
+	0xaa97d41d,
+	0xe78dbf1d,
+	0x4ef6a711,
+	0x2506c32e,
+	0x68d4e827,
+	0x546ea6b0,
+	0x465cebac,
+	0x26a86214,
+	0x964aa2fd,
+	0x3b4c5747,
+	0x6702ee7f,
+	0xd086629f,
+	0xf1f2043c,
+	0xc761a1ca,
+	0xa8964e9a,
+	0x90cab2ce,
+	0xc6e3583d,
+	0x3344e0be,
+	0x7d53914b,
+	0x3d953297,
+	0xfcf2eda0,
+	0x42f878a5,
+	0x2,
+	0x4,
+	0x10,
+	0x100,
+	0x10000,
+	0x1edc6f41,
+	0x3aab4576,
+	0x18571d18,
+	0x59a3508a,
+	0xaa97d41d,
+	0xe78dbf1d,
+	0x4ef6a711,
+	0x2506c32e,
+	0x68d4e827,
+	0x546ea6b0,
+	0x465cebac,
+	0x26a86214,
+	0x964aa2fd,
+	0x3b4c5747,
+	0x6702ee7f,
+	0xd086629f,
+	0xf1f2043c,
+	0xc761a1ca,
+	0xa8964e9a,
+	0x90cab2ce,
+	0xc6e3583d,
+	0x3344e0be,
+	0x7d53914b,
+	0x3d953297,
+	0xfcf2eda0,
+	0x42f878a5,
+	0x2,
+	0x4,
+	0x10,
+	0x100,
+	0x10000
+};
+#endif
+
+#else
+#define MAX_SIZE	32768
+.constants:
+
+	/* Reduce 262144 bits to 1024 bits */
+	/* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+	.octa 0x00000000b6ca9e20000000009c37c408
+
+	/* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+	.octa 0x00000000350249a800000001b51df26c
+
+	/* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+	.octa 0x00000001862dac54000000000724b9d0
+
+	/* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+	.octa 0x00000001d87fb48c00000001c00532fe
+
+	/* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+	.octa 0x00000001f39b699e00000000f05a9362
+
+	/* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+	.octa 0x0000000101da11b400000001e1007970
+
+	/* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+	.octa 0x00000001cab571e000000000a57366ee
+
+	/* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+	.octa 0x00000000c7020cfe0000000192011284
+
+	/* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+	.octa 0x00000000cdaed1ae0000000162716d9a
+
+	/* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+	.octa 0x00000001e804effc00000000cd97ecde
+
+	/* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+	.octa 0x0000000077c3ea3a0000000058812bc0
+
+	/* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+	.octa 0x0000000068df31b40000000088b8c12e
+
+	/* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+	.octa 0x00000000b059b6c200000001230b234c
+
+	/* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+	.octa 0x0000000145fb8ed800000001120b416e
+
+	/* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+	.octa 0x00000000cbc0916800000001974aecb0
+
+	/* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+	.octa 0x000000005ceeedc2000000008ee3f226
+
+	/* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+	.octa 0x0000000047d74e8600000001089aba9a
+
+	/* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+	.octa 0x00000001407e9e220000000065113872
+
+	/* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+	.octa 0x00000001da967bda000000005c07ec10
+
+	/* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+	.octa 0x000000006c8983680000000187590924
+
+	/* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+	.octa 0x00000000f2d14c9800000000e35da7c6
+
+	/* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+	.octa 0x00000001993c6ad4000000000415855a
+
+	/* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+	.octa 0x000000014683d1ac0000000073617758
+
+	/* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+	.octa 0x00000001a7c93e6c0000000176021d28
+
+	/* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+	.octa 0x000000010211e90a00000001c358fd0a
+
+	/* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+	.octa 0x000000001119403e00000001ff7a2c18
+
+	/* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+	.octa 0x000000001c3261aa00000000f2d9f7e4
+
+	/* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+	.octa 0x000000014e37a634000000016cf1f9c8
+
+	/* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+	.octa 0x0000000073786c0c000000010af9279a
+
+	/* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+	.octa 0x000000011dc037f80000000004f101e8
+
+	/* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+	.octa 0x0000000031433dfc0000000070bcf184
+
+	/* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+	.octa 0x000000009cde8348000000000a8de642
+
+	/* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+	.octa 0x0000000038d3c2a60000000062ea130c
+
+	/* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+	.octa 0x000000011b25f26000000001eb31cbb2
+
+	/* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+	.octa 0x000000001629e6f00000000170783448
+
+	/* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+	.octa 0x0000000160838b4c00000001a684b4c6
+
+	/* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+	.octa 0x000000007a44011c00000000253ca5b4
+
+	/* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+	.octa 0x00000000226f417a0000000057b4b1e2
+
+	/* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+	.octa 0x0000000045eb2eb400000000b6bd084c
+
+	/* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+	.octa 0x000000014459d70c0000000123c2d592
+
+	/* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+	.octa 0x00000001d406ed8200000000159dafce
+
+	/* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+	.octa 0x0000000160c8e1a80000000127e1a64e
+
+	/* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+	.octa 0x0000000027ba80980000000056860754
+
+	/* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+	.octa 0x000000006d92d01800000001e661aae8
+
+	/* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+	.octa 0x000000012ed7e3f200000000f82c6166
+
+	/* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+	.octa 0x000000002dc8778800000000c4f9c7ae
+
+	/* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+	.octa 0x0000000018240bb80000000074203d20
+
+	/* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+	.octa 0x000000001ad381580000000198173052
+
+	/* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+	.octa 0x00000001396b78f200000001ce8aba54
+
+	/* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+	.octa 0x000000011a68133400000001850d5d94
+
+	/* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+	.octa 0x000000012104732e00000001d609239c
+
+	/* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+	.octa 0x00000000a140d90c000000001595f048
+
+	/* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+	.octa 0x00000001b7215eda0000000042ccee08
+
+	/* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+	.octa 0x00000001aaf1df3c000000010a389d74
+
+	/* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+	.octa 0x0000000029d15b8a000000012a840da6
+
+	/* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+	.octa 0x00000000f1a96922000000001d181c0c
+
+	/* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+	.octa 0x00000001ac80d03c0000000068b7d1f6
+
+	/* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+	.octa 0x000000000f11d56a000000005b0f14fc
+
+	/* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+	.octa 0x00000001f1c022a20000000179e9e730
+
+	/* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+	.octa 0x0000000173d00ae200000001ce1368d6
+
+	/* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+	.octa 0x00000001d4ffe4ac0000000112c3a84c
+
+	/* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+	.octa 0x000000016edc5ae400000000de940fee
+
+	/* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+	.octa 0x00000001f1a0214000000000fe896b7e
+
+	/* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+	.octa 0x00000000ca0b28a000000001f797431c
+
+	/* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+	.octa 0x00000001928e30a20000000053e989ba
+
+	/* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+	.octa 0x0000000097b1b002000000003920cd16
+
+	/* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+	.octa 0x00000000b15bf90600000001e6f579b8
+
+	/* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+	.octa 0x00000000411c5d52000000007493cb0a
+
+	/* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+	.octa 0x00000001c36f330000000001bdd376d8
+
+	/* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+	.octa 0x00000001119227e0000000016badfee6
+
+	/* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+	.octa 0x00000000114d47020000000071de5c58
+
+	/* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+	.octa 0x00000000458b5b9800000000453f317c
+
+	/* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+	.octa 0x000000012e31fb8e0000000121675cce
+
+	/* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+	.octa 0x000000005cf619d800000001f409ee92
+
+	/* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+	.octa 0x0000000063f4d8b200000000f36b9c88
+
+	/* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+	.octa 0x000000004138dc8a0000000036b398f4
+
+	/* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+	.octa 0x00000001d29ee8e000000001748f9adc
+
+	/* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+	.octa 0x000000006a08ace800000001be94ec00
+
+	/* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+	.octa 0x0000000127d4201000000000b74370d6
+
+	/* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+	.octa 0x0000000019d76b6200000001174d0b98
+
+	/* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+	.octa 0x00000001b1471f6e00000000befc06a4
+
+	/* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+	.octa 0x00000001f64c19cc00000001ae125288
+
+	/* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+	.octa 0x00000000003c0ea00000000095c19b34
+
+	/* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+	.octa 0x000000014d73abf600000001a78496f2
+
+	/* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+	.octa 0x00000001620eb84400000001ac5390a0
+
+	/* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+	.octa 0x0000000147655048000000002a80ed6e
+
+	/* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+	.octa 0x0000000067b5077e00000001fa9b0128
+
+	/* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+	.octa 0x0000000010ffe20600000001ea94929e
+
+	/* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+	.octa 0x000000000fee8f1e0000000125f4305c
+
+	/* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+	.octa 0x00000001da26fbae00000001471e2002
+
+	/* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+	.octa 0x00000001b3a8bd880000000132d2253a
+
+	/* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+	.octa 0x00000000e8f3898e00000000f26b3592
+
+	/* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+	.octa 0x00000000b0d0d28c00000000bc8b67b0
+
+	/* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+	.octa 0x0000000030f2a798000000013a826ef2
+
+	/* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+	.octa 0x000000000fba10020000000081482c84
+
+	/* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+	.octa 0x00000000bdb9bd7200000000e77307c2
+
+	/* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+	.octa 0x0000000075d3bf5a00000000d4a07ec8
+
+	/* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+	.octa 0x00000000ef1f98a00000000017102100
+
+	/* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+	.octa 0x00000000689c760200000000db406486
+
+	/* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+	.octa 0x000000016d5fa5fe0000000192db7f88
+
+	/* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+	.octa 0x00000001d0d2b9ca000000018bf67b1e
+
+	/* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+	.octa 0x0000000041e7b470000000007c09163e
+
+	/* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+	.octa 0x00000001cbb6495e000000000adac060
+
+	/* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+	.octa 0x000000010052a0b000000000bd8316ae
+
+	/* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+	.octa 0x00000001d8effb5c000000019f09ab54
+
+	/* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+	.octa 0x00000001d969853c0000000125155542
+
+	/* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+	.octa 0x00000000523ccce2000000018fdb5882
+
+	/* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+	.octa 0x000000001e2436bc00000000e794b3f4
+
+	/* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+	.octa 0x00000000ddd1c3a2000000016f9bb022
+
+	/* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+	.octa 0x0000000019fcfe3800000000290c9978
+
+	/* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+	.octa 0x00000001ce95db640000000083c0f350
+
+	/* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+	.octa 0x00000000af5828060000000173ea6628
+
+	/* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+	.octa 0x00000001006388f600000001c8b4e00a
+
+	/* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+	.octa 0x0000000179eca00a00000000de95d6aa
+
+	/* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+	.octa 0x0000000122410a6a000000010b7f7248
+
+	/* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+	.octa 0x000000004288e87c00000001326e3a06
+
+	/* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+	.octa 0x000000016c5490da00000000bb62c2e6
+
+	/* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+	.octa 0x00000000d1c71f6e0000000156a4b2c2
+
+	/* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+	.octa 0x00000001b4ce08a6000000011dfe763a
+
+	/* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+	.octa 0x00000001466ba60c000000007bcca8e2
+
+	/* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+	.octa 0x00000001f6c488a40000000186118faa
+
+	/* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+	.octa 0x000000013bfb06820000000111a65a88
+
+	/* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+	.octa 0x00000000690e9e54000000003565e1c4
+
+	/* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+	.octa 0x00000000281346b6000000012ed02a82
+
+	/* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+	.octa 0x000000015646402400000000c486ecfc
+
+	/* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+	.octa 0x000000016063a8dc0000000001b951b2
+
+	/* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+	.octa 0x0000000116a663620000000048143916
+
+	/* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+	.octa 0x000000017e8aa4d200000001dc2ae124
+
+	/* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+	.octa 0x00000001728eb10c00000001416c58d6
+
+	/* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+	.octa 0x00000001b08fd7fa00000000a479744a
+
+	/* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+	.octa 0x00000001092a16e80000000096ca3a26
+
+	/* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+	.octa 0x00000000a505637c00000000ff223d4e
+
+	/* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+	.octa 0x00000000d94869b2000000010e84da42
+
+	/* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+	.octa 0x00000001c8b203ae00000001b61ba3d0
+
+	/* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+	.octa 0x000000005704aea000000000680f2de8
+
+	/* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+	.octa 0x000000012e295fa2000000008772a9a8
+
+	/* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+	.octa 0x000000011d0908bc0000000155f295bc
+
+	/* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+	.octa 0x0000000193ed97ea00000000595f9282
+
+	/* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+	.octa 0x000000013a0f1c520000000164b1c25a
+
+	/* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+	.octa 0x000000010c2c40c000000000fbd67c50
+
+	/* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+	.octa 0x00000000ff6fac3e0000000096076268
+
+	/* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+	.octa 0x000000017b3609c000000001d288e4cc
+
+	/* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+	.octa 0x0000000088c8c92200000001eaac1bdc
+
+	/* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+	.octa 0x00000001751baae600000001f1ea39e2
+
+	/* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+	.octa 0x000000010795297200000001eb6506fc
+
+	/* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+	.octa 0x0000000162b00abe000000010f806ffe
+
+	/* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+	.octa 0x000000000d7b404c000000010408481e
+
+	/* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+	.octa 0x00000000763b13d40000000188260534
+
+	/* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+	.octa 0x00000000f6dc22d80000000058fc73e0
+
+	/* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+	.octa 0x000000007daae06000000000391c59b8
+
+	/* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+	.octa 0x000000013359ab7c000000018b638400
+
+	/* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+	.octa 0x000000008add438a000000011738f5c4
+
+	/* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+	.octa 0x00000001edbefdea000000008cf7c6da
+
+	/* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+	.octa 0x000000004104e0f800000001ef97fb16
+
+	/* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+	.octa 0x00000000b48a82220000000102130e20
+
+	/* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+	.octa 0x00000001bcb4684400000000db968898
+
+	/* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+	.octa 0x000000013293ce0a00000000b5047b5e
+
+	/* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+	.octa 0x00000001710d0844000000010b90fdb2
+
+	/* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+	.octa 0x0000000117907f6e000000004834a32e
+
+	/* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+	.octa 0x0000000087ddf93e0000000059c8f2b0
+
+	/* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+	.octa 0x000000005970e9b00000000122cec508
+
+	/* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+	.octa 0x0000000185b2b7d0000000000a330cda
+
+	/* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+	.octa 0x00000001dcee0efc000000014a47148c
+
+	/* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+	.octa 0x0000000030da27220000000042c61cb8
+
+	/* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+	.octa 0x000000012f925a180000000012fe6960
+
+	/* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+	.octa 0x00000000dd2e357c00000000dbda2c20
+
+	/* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+	.octa 0x00000000071c80de000000011122410c
+
+	/* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+	.octa 0x000000011513140a00000000977b2070
+
+	/* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+	.octa 0x00000001df876e8e000000014050438e
+
+	/* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+	.octa 0x000000015f81d6ce0000000147c840e8
+
+	/* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+	.octa 0x000000019dd94dbe00000001cc7c88ce
+
+	/* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+	.octa 0x00000001373d206e00000001476b35a4
+
+	/* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+	.octa 0x00000000668ccade000000013d52d508
+
+	/* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+	.octa 0x00000001b192d268000000008e4be32e
+
+	/* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+	.octa 0x00000000e30f3a7800000000024120fe
+
+	/* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+	.octa 0x000000010ef1f7bc00000000ddecddb4
+
+	/* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+	.octa 0x00000001f5ac738000000000d4d403bc
+
+	/* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+	.octa 0x000000011822ea7000000001734b89aa
+
+	/* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+	.octa 0x00000000c3a33848000000010e7a58d6
+
+	/* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+	.octa 0x00000001bd151c2400000001f9f04e9c
+
+	/* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+	.octa 0x0000000056002d7600000000b692225e
+
+	/* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+	.octa 0x000000014657c4f4000000019b8d3f3e
+
+	/* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+	.octa 0x0000000113742d7c00000001a874f11e
+
+	/* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+	.octa 0x000000019c5920ba000000010d5a4254
+
+	/* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+	.octa 0x000000005216d2d600000000bbb2f5d6
+
+	/* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+	.octa 0x0000000136f5ad8a0000000179cc0e36
+
+	/* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+	.octa 0x000000018b07beb600000001dca1da4a
+
+	/* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+	.octa 0x00000000db1e93b000000000feb1a192
+
+	/* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+	.octa 0x000000000b96fa3a00000000d1eeedd6
+
+	/* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+	.octa 0x00000001d9968af0000000008fad9bb4
+
+	/* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+	.octa 0x000000000e4a77a200000001884938e4
+
+	/* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+	.octa 0x00000000508c2ac800000001bc2e9bc0
+
+	/* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+	.octa 0x0000000021572a8000000001f9658a68
+
+	/* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+	.octa 0x00000001b859daf2000000001b9224fc
+
+	/* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+	.octa 0x000000016f7884740000000055b2fb84
+
+	/* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+	.octa 0x00000001b438810e000000018b090348
+
+	/* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+	.octa 0x0000000095ddc6f2000000011ccbd5ea
+
+	/* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+	.octa 0x00000001d977c20c0000000007ae47f8
+
+	/* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+	.octa 0x00000000ebedb99a0000000172acbec0
+
+	/* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+	.octa 0x00000001df9e9e9200000001c6e3ff20
+
+	/* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+	.octa 0x00000001a4a3f95200000000e1b38744
+
+	/* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+	.octa 0x00000000e2f5122000000000791585b2
+
+	/* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+	.octa 0x000000004aa01f3e00000000ac53b894
+
+	/* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+	.octa 0x00000000b3e90a5800000001ed5f2cf4
+
+	/* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+	.octa 0x000000000c9ca2aa00000001df48b2e0
+
+	/* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+	.octa 0x000000015168231600000000049c1c62
+
+	/* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+	.octa 0x0000000036fce78c000000017c460c12
+
+	/* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+	.octa 0x000000009037dc10000000015be4da7e
+
+	/* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+	.octa 0x00000000d3298582000000010f38f668
+
+	/* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+	.octa 0x00000001b42e8ad60000000039f40a00
+
+	/* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+	.octa 0x00000000142a983800000000bd4c10c4
+
+	/* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+	.octa 0x0000000109c7f1900000000042db1d98
+
+	/* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+	.octa 0x0000000056ff931000000001c905bae6
+
+	/* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+	.octa 0x00000001594513aa00000000069d40ea
+
+	/* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+	.octa 0x00000001e3b5b1e8000000008e4fbad0
+
+	/* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+	.octa 0x000000011dd5fc080000000047bedd46
+
+	/* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+	.octa 0x00000001675f0cc20000000026396bf8
+
+	/* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+	.octa 0x00000000d1c8dd4400000000379beb92
+
+	/* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+	.octa 0x0000000115ebd3d8000000000abae54a
+
+	/* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+	.octa 0x00000001ecbd0dac0000000007e6a128
+
+	/* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+	.octa 0x00000000cdf67af2000000000ade29d2
+
+	/* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+	.octa 0x000000004c01ff4c00000000f974c45c
+
+	/* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+	.octa 0x00000000f2d8657e00000000e77ac60a
+
+	/* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+	.octa 0x000000006bae74c40000000145895816
+
+	/* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+	.octa 0x0000000152af8aa00000000038e362be
+
+	/* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+	.octa 0x0000000004663802000000007f991a64
+
+	/* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+	.octa 0x00000001ab2f5afc00000000fa366d3a
+
+	/* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+	.octa 0x0000000074a4ebd400000001a2bb34f0
+
+	/* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+	.octa 0x00000001d7ab3a4c0000000028a9981e
+
+	/* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+	.octa 0x00000001a8da60c600000001dbc672be
+
+	/* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+	.octa 0x000000013cf6382000000000b04d77f6
+
+	/* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+	.octa 0x00000000bec12e1e0000000124400d96
+
+	/* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+	.octa 0x00000001c6368010000000014ca4b414
+
+	/* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+	.octa 0x00000001e6e78758000000012fe2c938
+
+	/* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+	.octa 0x000000008d7f2b3c00000001faed01e6
+
+	/* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+	.octa 0x000000016b4a156e000000007e80ecfe
+
+	/* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+	.octa 0x00000001c63cfeb60000000098daee94
+
+	/* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+	.octa 0x000000015f902670000000010a04edea
+
+	/* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+	.octa 0x00000001cd5de11e00000001c00b4524
+
+	/* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+	.octa 0x000000001acaec540000000170296550
+
+	/* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+	.octa 0x000000002bd0ca780000000181afaa48
+
+	/* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+	.octa 0x0000000032d63d5c0000000185a31ffa
+
+	/* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+	.octa 0x000000001c6d4e4c000000002469f608
+
+	/* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+	.octa 0x0000000106a60b92000000006980102a
+
+	/* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+	.octa 0x00000000d3855e120000000111ea9ca8
+
+	/* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+	.octa 0x00000000e312563600000001bd1d29ce
+
+	/* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+	.octa 0x000000009e8f7ea400000001b34b9580
+
+	/* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+	.octa 0x00000001c82e562c000000003076054e
+
+	/* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+	.octa 0x00000000ca9f09ce000000012a608ea4
+
+	/* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+	.octa 0x00000000c63764e600000000784d05fe
+
+	/* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+	.octa 0x0000000168d2e49e000000016ef0d82a
+
+	/* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+	.octa 0x00000000e986c1480000000075bda454
+
+	/* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+	.octa 0x00000000cfb65894000000003dc0a1c4
+
+	/* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+	.octa 0x0000000111cadee400000000e9a5d8be
+
+	/* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+	.octa 0x0000000171fb63ce00000001609bc4b4
+
+.short_constants:
+
+	/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+	/* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
+	.octa 0x7fec2963e5bf80485cf015c388e56f72
+
+	/* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
+	.octa 0x38e888d4844752a9963a18920246e2e6
+
+	/* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
+	.octa 0x42316c00730206ad419a441956993a31
+
+	/* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
+	.octa 0x543d5c543e65ddf9924752ba2b830011
+
+	/* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
+	.octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+	/* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
+	.octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+	/* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
+	.octa 0x3f4840246791d588c133722b1fe0b5c3
+
+	/* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
+	.octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+	/* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
+	.octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+	/* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+	.octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+	/* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+	.octa 0x041d37768cd75659817cdc5119b29a35
+
+	/* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+	.octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+	/* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+	.octa 0x0e148e8252377a554f256efcb82be955
+
+	/* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+	.octa 0x9c25531d19e65ddeec1631edb2dea967
+
+	/* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+	.octa 0x790606ff9957c0a65d27e147510ac59a
+
+	/* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+	.octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+
+.barrett_constants:
+	/* 33 bit reflected Barrett constant m = (4^32)/n */
+	.octa 0x000000000000000000000000dea713f1	/* x^64 div p(x)` */
+	/* 33 bit reflected Barrett constant n */
+	.octa 0x00000000000000000000000105ec76f1
+#endif
diff --git a/src/common/crc32c_ppc_fast_zero_asm.S b/src/common/crc32c_ppc_fast_zero_asm.S
new file mode 100644
index 000000000..cff9cce7f
--- /dev/null
+++ b/src/common/crc32c_ppc_fast_zero_asm.S
@@ -0,0 +1,126 @@
+/*
+ * Use the fixed point version of Barrett reduction to compute a mod n
+ * over GF(2) for n = 0x104c11db7 using POWER8 instructions. We use k = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ *  a) the GNU General Public License as published by the Free Software
+ *     Foundation; either version 2 of the License, or (at your option)
+ *     any later version, or
+ *  b) the Apache License, Version 2.0
+ */
+
+#if defined (__clang__)
+#ifndef __ALTIVEC__
+#define __ALTIVEC__
+#endif
+#include "ppc-asm.h"
+#else
+#include <ppc-asm.h>
+#endif
+#include "ppc-opcode.h"
+
+	.section	.data
+.balign 16
+.constants:
+	/* Barrett constant m - (4^32)/n */
+	.octa 0x00000000000000000000000104d101df
+
+	/* Barrett constant n */
+	.octa 0x00000000000000000000000104c11db7
+
+.bit_reflected_constants:
+	/* 33 bit reflected Barrett constant m - (4^32)/n */
+	.octa 0x000000000000000000000001f7011641
+
+	/* 33 bit reflected Barrett constant n */
+	.octa 0x000000000000000000000001db710641
+
+	.text
+
+/*
+ * unsigned int barrett_reduction(unsigned long val)
+ *
+ * Reduces the value passed in r3 using the Barrett constants stored at
+ * .constants (m followed by n) and returns the result in r3.
+ * Uses r4, r5 and v0-v4.
+ */
+FUNC_START(barrett_reduction)
+	lis	r4,.constants@ha
+	la	r4,.constants@l(r4)
+
+	li	r5,16		/* byte offset of n, the second constant */
+	vxor	v1,v1,v1	/* zero v1 */
+
+	/* Get a into v0 */
+	MTVRD(v0, r3)
+	vsldoi	v0,v1,v0,8	/* shift into bottom 64 bits, this is a */
+
+	/* Load constants */
+	lvx	v2,0,r4		/* m */
+	lvx	v3,r5,r4	/* n */
+
+	/*
+	 * Now for the actual algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v4,v0,v2)	/* ma */
+	vsldoi	v4,v1,v4,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v4,v4,v3)	/* qn */
+	vxor	v0,v0,v4	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,v1,8	/* shift result into top 64 bits of v0 */
+	MFVRD(r3, v0)
+
+	blr
+FUNC_END(barrett_reduction)
+
+/*
+ * unsigned int barrett_reduction_reflected(unsigned long val)
+ *
+ * Bit reflected variant: reduces the value passed in r3 using the
+ * constants stored at .bit_reflected_constants (m followed by n) and
+ * returns the result in r3.  Uses r4, r5 and v0-v6.
+ */
+FUNC_START(barrett_reduction_reflected)
+	lis	r4,.bit_reflected_constants@ha
+	la	r4,.bit_reflected_constants@l(r4)
+
+	li	r5,16		/* byte offset of n, the second constant */
+	vxor	v1,v1,v1	/* zero v1 */
+
+	/* Get a into v0 */
+	MTVRD(v0, r3)
+	vsldoi	v0,v1,v0,8	/* shift into bottom 64 bits, this is a */
+
+	/* Load constants */
+	lvx	v2,0,r4		/* m */
+	lvx	v3,r5,r4	/* n */
+
+	vspltisw v5,-1		/* all ones */
+	vsldoi	v6,v1,v5,4	/* bitmask with low 32 bits set */
+
+	/*
+	 * Now for the Barrett reduction algorithm. Instead of bit reflecting
+	 * our data (which is expensive to do), we bit reflect our constants
+	 * and our algorithm, which means the intermediate data in our vector
+	 * registers goes from 0-63 instead of 63-0. We can reflect the
+	 * algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v4,v0,v6	/* bottom 32 bits of a */
+	VPMSUMD(v4,v4,v2)	/* ma */
+	vand	v4,v4,v6	/* bottom 32bits of ma */
+	VPMSUMD(v4,v4,v3)	/* qn */
+	vxor	v0,v0,v4	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in the
+	 * high 32 bits. We just need to shift it left 4 bytes
+	 * V0 [ 0 1 X 3 ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,v1,4	/* shift result into top 64 bits of v0 */
+	MFVRD(r3, v0)
+
+	blr
+FUNC_END(barrett_reduction_reflected)
diff --git a/src/common/darwin_errno.cc b/src/common/darwin_errno.cc
new file mode 100644
index 000000000..4409abcd4
--- /dev/null
+++ b/src/common/darwin_errno.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "include/types.h"
+#include "include/compat.h"
+
+
+// Converts a negative errno in Linux numbering to the host OS (Darwin)
+// numbering.
+//
+// @param r negative errno value in Linux numbering
+// @returns the negated host errno.  Linux codes that have no host
+//          counterpart are reported as -EPERM; values that are not listed
+//          below are returned unchanged.
+__s32 ceph_to_hostos_errno(__s32 r)
+{
+  // Linux EAGAIN is 11, a number the host assigns to EDEADLK.  Without this
+  // translation a "try again" result would be read as a deadlock error.
+  if (r == -11) {
+    return -EAGAIN;
+  }
+
+  // Apart from EAGAIN, values up to 34 (ERANGE) need no translation.
+  if (r < -34) {
+    switch (r) {
+      case -35:
+        return -EDEADLK;
+      case -36:
+        return -ENAMETOOLONG;
+      case -37:
+        return -ENOLCK;
+      case -38:
+        return -ENOSYS;
+      case -39:
+        return -ENOTEMPTY;
+      case -40:
+        return -ELOOP;
+      case -42:
+        return -ENOMSG;
+      case -43:
+        return -EIDRM;
+      case -44:
+        return -EPERM; //TODO ECHRNG
+      case -45:
+        return -EPERM; //TODO EL2NSYNC
+      case -46:
+        return -EPERM; //TODO EL3HLT
+      case -47:
+        return -EPERM; //TODO EL3RST
+      case -48:
+        return -EPERM; //TODO ELNRNG
+      case -49:
+        return -EPERM; //TODO EUNATCH
+      case -50:
+        // previously fell through untranslated, where the host reads 50
+        // as ENETDOWN
+        return -EPERM; //TODO ENOCSI
+      case -51:
+        return -EPERM; //TODO EL2HLT;
+      case -52:
+        return -EPERM; //TODO EBADE
+      case -53:
+        return -EPERM; //TODO EBADR
+      case -54:
+        return -EPERM; //TODO EXFULL
+      case -55:
+        return -EPERM; //TODO ENOANO
+      case -56:
+        return -EPERM; //TODO EBADRQC
+      case -57:
+        return -EPERM; //TODO EBADSLT
+      case -59:
+        return -EPERM; //TODO EBFONT
+      case -60:
+        return -ENOSTR;
+      case -61:
+        return -ENODATA;
+      case -62:
+        return -ETIME;
+      case -63:
+        return -ENOSR;
+      case -64:
+        return -EPERM; //TODO ENONET
+      case -65:
+        return -EPERM; //TODO ENOPKG
+      case -66:
+        return -EREMOTE;
+      case -67:
+        return -ENOLINK;
+      case -68:
+        return -EPERM; //TODO EADV
+      case -69:
+        return -EPERM; //TODO ESRMNT
+      case -70:
+        return -EPERM; //TODO ECOMM
+      case -71:
+        return -EPROTO;
+      case -72:
+        return -EMULTIHOP;
+      case -73:
+        return -EPERM; //TODO EDOTDOT
+      case -74:
+        return -EBADMSG;
+      case -75:
+        return -EOVERFLOW;
+      case -76:
+        return -EPERM; //TODO ENOTUNIQ
+      case -77:
+        return -EPERM; //TODO EBADFD
+      case -78:
+        return -EPERM; //TODO EREMCHG
+      case -79:
+        return -EPERM; //TODO ELIBACC
+      case -80:
+        return -EPERM; //TODO ELIBBAD
+      case -81:
+        return -EPERM; //TODO ELIBSCN
+      case -82:
+        return -EPERM; //TODO ELIBMAX
+      case -83:
+        return -EPERM; // TODO ELIBEXEC
+      case -84:
+        return -EILSEQ;
+      case -85:
+        return -EINTR;
+      case -86:
+        return -EPERM; //ESTRPIPE;
+      case -87:
+        return -EUSERS;
+      case -88:
+        return -ENOTSOCK;
+      case -89:
+        return -EDESTADDRREQ;
+      case -90:
+        return -EMSGSIZE;
+      case -91:
+        return -EPROTOTYPE;
+      case -92:
+        return -ENOPROTOOPT;
+      case -93:
+        return -EPROTONOSUPPORT;
+      case -94:
+        return -ESOCKTNOSUPPORT;
+      case -95:
+        return -EOPNOTSUPP;
+      case -96:
+        return -EPFNOSUPPORT;
+      case -97:
+        return -EAFNOSUPPORT;
+      case -98:
+        return -EADDRINUSE;
+      case -99:
+        return -EADDRNOTAVAIL;
+      case -100:
+        return -ENETDOWN;
+      case -101:
+        return -ENETUNREACH;
+      case -102:
+        return -ENETRESET;
+      case -103:
+        return -ECONNABORTED;
+      case -104:
+        return -ECONNRESET;
+      case -105:
+        return -ENOBUFS;
+      case -106:
+        return -EISCONN;
+      case -107:
+        return -ENOTCONN;
+      case -108:
+        return -ESHUTDOWN;
+      case -109:
+        return -ETOOMANYREFS;
+      case -110:
+        return -ETIMEDOUT;
+      case -111:
+        return -ECONNREFUSED;
+      case -112:
+        return -EHOSTDOWN;
+      case -113:
+        return -EHOSTUNREACH;
+      case -114:
+        return -EALREADY;
+      case -115:
+        return -EINPROGRESS;
+      case -116:
+        return -ESTALE;
+      case -117:
+        return -EPERM; //TODO EUCLEAN
+      case -118:
+        return -EPERM; //TODO ENOTNAM
+      case -119:
+        return -EPERM; //TODO ENAVAIL
+      case -120:
+        return -EPERM; //TODO EISNAM
+      case -121:
+        return -EREMOTEIO;
+      case -122:
+        return -EDQUOT;
+      case -123:
+        return -EPERM; //TODO ENOMEDIUM
+      case -124:
+        return -EPERM; //TODO EMEDIUMTYPE - not used
+      case -125:
+        return -ECANCELED;
+      case -126:
+        return -EPERM; //TODO ENOKEY
+      case -127:
+        return -EPERM; //TODO EKEYEXPIRED
+      case -128:
+        return -EPERM; //TODO EKEYREVOKED
+      case -129:
+        return -EPERM; //TODO EKEYREJECTED
+      case -130:
+        return -EOWNERDEAD;
+      case -131:
+        return -ENOTRECOVERABLE;
+      case -132:
+        return -EPERM; //TODO ERFKILL
+      case -133:
+        return -EPERM; //TODO EHWPOISON
+
+      default: {
+        break;
+      }
+    }
+  }
+  return r; // otherwise return original value
+}
+
+// Converts host OS errno values to linux/Ceph values.
+// XXX Currently not worked out: the value is returned as is, so host
+// values are not translated at all.
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  return r;
+}
+
+
diff --git a/src/common/debug.h b/src/common/debug.h
new file mode 100644
index 000000000..1d4c74701
--- /dev/null
+++ b/src/common/debug.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_DEBUG_H
+#define CEPH_DEBUG_H
+
+#include "common/dout.h"
+
+/* Global version of the stuff in common/dout.h
+ *
+ * Each macro below expands to an l*-macro (presumably provided by
+ * common/dout.h) with dout_context passed as the first argument.
+ * dout_context is not defined in this header; the code using these macros
+ * has to provide it.
+ */
+
+// log at level v
+#define dout(v) ldout((dout_context), (v))
+
+// like dout(v), with the additional argument p passed through to lpdout
+#define pdout(v, p) lpdout((dout_context), (v), (p))
+
+// forwards subsystem sub and level v to ldlog_p1
+#define dlog_p(sub, v) ldlog_p1((dout_context), (sub), (v))
+
+#define generic_dout(v) lgeneric_dout((dout_context), (v))
+
+// error-output counterparts; they take no level argument
+#define derr lderr((dout_context))
+
+#define generic_derr lgeneric_derr((dout_context))
+
+#endif
diff --git a/src/common/deleter.h b/src/common/deleter.h
new file mode 100644
index 000000000..d2272cace
--- /dev/null
+++ b/src/common/deleter.h
@@ -0,0 +1,261 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_COMMON_DELETER_H
+#define CEPH_COMMON_DELETER_H
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <new>
+#include <utility>
+
+/// \addtogroup memory-module
+/// @{
+
+/// Provides a mechanism for managing the lifetime of a buffer.
+///
+/// A \c deleter is an object that is used to inform the consumer
+/// of some buffer (not referenced by the deleter itself) how to
+/// delete the buffer.  This can be by calling an arbitrary function
+/// or destroying an object carried by the deleter.  Examples of
+/// a deleter's encapsulated actions are:
+///
+///  - calling \c std::free(p) on some captured pointer, p
+///  - calling \c delete \c p on some captured pointer, p
+///  - decrementing a reference count somewhere
+///
+/// A deleter performs its action from its destructor.  It is movable but
+/// not copyable; use share() to obtain a second handle to the same action.
+class deleter final {
+ public:
+  /// \cond internal
+  struct impl;
+  struct raw_object_tag {};
+  /// \endcond
+ private:
+  // Either null (no action), a pointer to a reference-counted impl, or --
+  // if bit 0 is set -- a tagged pointer to an object that is freed directly
+  // with std::free().
+  impl* _impl = nullptr;
+ public:
+  /// Constructs an empty deleter that does nothing in its destructor.
+  deleter() = default;
+  deleter(const deleter&) = delete;
+  /// Moves a deleter.
+  deleter(deleter&& x) noexcept : _impl(x._impl) { x._impl = nullptr; }
+  /// \cond internal
+  /// Adopts \c i without touching its reference count.
+  explicit deleter(impl* i) : _impl(i) {}
+  /// Wraps \c object as a tagged raw pointer that the destructor passes to
+  /// std::free().
+  deleter(raw_object_tag, void* object)
+          : _impl(from_raw_object(object)) {}
+  /// \endcond
+  /// Destroys the deleter and carries out the encapsulated action.
+  ~deleter();
+  /// Carries out the current action (if any) and takes over the one of \c x.
+  deleter& operator=(deleter&& x);
+  deleter& operator=(const deleter&) = delete;
+  /// Performs a sharing operation.  The encapsulated action will only
+  /// be carried out after both the original deleter and the returned
+  /// deleter are both destroyed.
+  ///
+  /// \return a deleter with the same encapsulated action as this one.
+  deleter share();
+  /// Checks whether the deleter has an associated action.
+  explicit operator bool() const { return bool(_impl); }
+  /// \cond internal
+  /// Destroys this deleter (running its action) and re-creates it around \c i.
+  void reset(impl* i) {
+    this->~deleter();
+    new (this) deleter(i);
+  }
+  /// \endcond
+  /// Appends another deleter to this deleter.  When this deleter is
+  /// destroyed, both encapsulated actions will be carried out.
+  void append(deleter d);
+ private:
+  // true if bit 0 of the pointer is set, i.e. it is a tagged raw pointer
+  static bool is_raw_object(impl* i) {
+    auto x = reinterpret_cast<std::uintptr_t>(i);
+    return x & 1;
+  }
+  bool is_raw_object() const {
+    return is_raw_object(_impl);
+  }
+  // strips the tag bit and returns the original object pointer
+  static void* to_raw_object(impl* i) {
+    auto x = reinterpret_cast<std::uintptr_t>(i);
+    return reinterpret_cast<void*>(x & ~std::uintptr_t(1));
+  }
+  void* to_raw_object() const {
+    return to_raw_object(_impl);
+  }
+  // sets the tag bit on an object pointer so it can be stored in _impl
+  static impl* from_raw_object(void* object) {
+    auto x = reinterpret_cast<std::uintptr_t>(object);
+    return reinterpret_cast<impl*>(x | 1);
+  }
+};
+
+/// \cond internal
+/// Reference-counted node of a deleter chain.  Derived classes perform the
+/// actual action in their destructors.
+struct deleter::impl {
+  std::atomic_uint refs;  // number of deleter objects referring to this node
+  deleter next;           // rest of the chain; destroyed together with this node
+  impl(deleter next) : refs(1), next(std::move(next)) {}
+  virtual ~impl() {}
+};
+/// \endcond
+
+inline deleter::~deleter() {
+  if (!is_raw_object()) {
+    // empty, or a reference-counted node: drop our reference and delete the
+    // node once nobody else refers to it
+    if (_impl && --_impl->refs == 0) {
+      delete _impl;
+    }
+    return;
+  }
+  // tagged raw pointer: the object itself is handed to free()
+  std::free(to_raw_object());
+}
+
+inline deleter& deleter::operator=(deleter&& x) {
+  if (this == &x) {
+    // self-assignment: nothing to do
+    return *this;
+  }
+  // carry out our current action, then rebuild ourselves in place from x
+  this->~deleter();
+  new (this) deleter(std::move(x));
+  return *this;
+}
+
+/// \cond internal
+/// Chain node whose action is to invoke the stored callable \c del, with no
+/// arguments, from its destructor.
+template <typename Deleter>
+struct lambda_deleter_impl final : deleter::impl {
+  Deleter del;
+  lambda_deleter_impl(deleter next, Deleter&& del)
+          : impl(std::move(next)), del(std::move(del)) {}
+  ~lambda_deleter_impl() override { del(); }
+};
+
+/// Chain node that simply owns \c obj; the only action is the destruction
+/// of \c obj when the node is destroyed.
+template <typename Object>
+struct object_deleter_impl final : deleter::impl {
+  Object obj;
+  object_deleter_impl(deleter next, Object&& obj)
+          : impl(std::move(next)), obj(std::move(obj)) {}
+};
+
+/// Allocates an object_deleter_impl that owns \c obj and is followed by
+/// \c next.  The caller receives the node with a reference count of 1.
+template <typename Object>
+inline
+object_deleter_impl<Object>* make_object_deleter_impl(deleter next, Object obj) {
+  return new object_deleter_impl<Object>(std::move(next), std::move(obj));
+}
+/// \endcond
+
+/// Makes a \ref deleter that encapsulates the action of invoking a
+/// callable, as well as running another deleter.  The input callable is
+/// moved into the deleter (stored in a \c lambda_deleter_impl) and is called
+/// with no arguments when the last deleter referring to it is destroyed.
+///
+/// \param next deleter that will become part of the new deleter's encapsulated action
+/// \param o callable whose invocation becomes part of the new deleter's encapsulated action
+/// \related deleter
+template <typename Object>
+deleter make_deleter(deleter next, Object o) {
+  return deleter(new lambda_deleter_impl<Object>(std::move(next), std::move(o)));
+}
+
+/// Makes a \ref deleter that encapsulates the action of invoking a
+/// callable.  The input callable is moved into the deleter; this overload
+/// forwards to make_deleter(deleter, Object) with an empty deleter.
+///
+/// \param o callable whose invocation becomes the new deleter's encapsulated action
+/// \related deleter
+template <typename Object>
+deleter make_deleter(Object o) {
+  return make_deleter(deleter(), std::move(o));
+}
+
+/// \cond internal
+/// Chain node whose action is to call std::free() on \c obj.  It is created
+/// with an empty \c next.
+struct free_deleter_impl final : deleter::impl {
+  void* obj;
+  free_deleter_impl(void* obj) : impl(deleter()), obj(obj) {}
+  ~free_deleter_impl() override { std::free(obj); }
+};
+/// \endcond
+
+// Returns a second deleter referring to the same node as this one; an empty
+// deleter yields another empty deleter.
+inline deleter deleter::share() {
+  if (!_impl) {
+    return deleter();
+  }
+  if (is_raw_object()) {
+    // a tagged raw pointer has no reference count, so replace it with a
+    // counted node that frees the same object
+    _impl = new free_deleter_impl(to_raw_object());
+  }
+  // account for the deleter returned below
+  ++_impl->refs;
+  return deleter(_impl);
+}
+
+// Appends 'd' to the chain of deleters. Avoids allocation if possible. For
+// performance reasons the current chain should be shorter and 'd' should be
+// longer.
+//
+// The chain starting at this deleter is walked to its end via impl::next,
+// and d's node is linked in there.  An empty 'd' leaves the chain untouched.
+inline void deleter::append(deleter d) {
+  if (!d._impl) {
+    return;
+  }
+  impl* next_impl = _impl;
+  deleter* next_d = this;
+  while (next_impl) {
+    // d's node is already part of this chain: nothing to link
+    if (next_impl == d._impl)
+      return ;
+    if (is_raw_object(next_impl)) {
+      // a tagged raw pointer has no 'next' member; turn it into a real node
+      next_d->_impl = next_impl = new free_deleter_impl(to_raw_object(next_impl));
+    }
+    if (next_impl->refs != 1) {
+      // the node is referenced from elsewhere too: put a fresh node in its
+      // place that takes over the node's 'next' and holds our reference to
+      // the node itself
+      next_d->_impl = next_impl = make_object_deleter_impl(std::move(next_impl->next), deleter(next_impl));
+    }
+    next_d = &next_impl->next;
+    next_impl = next_d->_impl;
+  }
+  // end of chain reached: take over d's node and leave d empty
+  next_d->_impl = d._impl;
+  d._impl = nullptr;
+}
+
+/// Makes a deleter that calls \c std::free() when it is destroyed.
+/// A null \c obj yields an empty deleter.
+///
+/// \param obj object to free.
+/// \related deleter
+inline deleter make_free_deleter(void* obj) {
+  return obj ? deleter(deleter::raw_object_tag(), obj) : deleter();
+}
+
+/// Makes a deleter that calls \c std::free() when it is destroyed, as well
+/// as invoking the encapsulated action of another deleter.
+///
+/// \param next deleter to invoke.
+/// \param obj object to free.
+/// \related deleter
+inline deleter make_free_deleter(deleter next, void* obj) {
+  // the free() call is wrapped in a callable and handed to make_deleter()
+  return make_deleter(std::move(next), [obj] () mutable { std::free(obj); });
+}
+
+/// Makes a deleter that owns \c obj and destroys it together with the
+/// deleter.
+///
+/// NOTE(review): \c obj is always passed through std::move(), so an lvalue
+/// argument is moved from as well.
+///
+/// \see make_deleter(Object)
+/// \related deleter
+template <typename T>
+inline deleter make_object_deleter(T&& obj) {
+  return deleter{make_object_deleter_impl(deleter(), std::move(obj))};
+}
+
+/// Makes a deleter that owns \c obj and is followed by the deleter \c d.
+///
+/// NOTE(review): \c obj is always passed through std::move(), so an lvalue
+/// argument is moved from as well.
+///
+/// \see make_deleter(deleter, Object)
+/// \related deleter
+template <typename T>
+inline deleter make_object_deleter(deleter d, T&& obj) {
+  return deleter{make_object_deleter_impl(std::move(d), std::move(obj))};
+}
+
+/// @}
+
+#endif /* CEPH_COMMON_DELETER_H */
diff --git a/src/common/detail/construct_suspended.h b/src/common/detail/construct_suspended.h
new file mode 100644
index 000000000..521bda0f8
--- /dev/null
+++ b/src/common/detail/construct_suspended.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_DETAIL_CONSTRUCT_SUSPENDED_H
+#define CEPH_COMMON_DETAIL_CONSTRUCT_SUSPENDED_H
+
+namespace ceph {
+  // Empty tag type plus a ready-made constant of it.
+  // NOTE(review): presumably passed to constructors to select a
+  // "start suspended" overload -- confirm against the users of this header.
+  struct construct_suspended_t { };
+  inline constexpr construct_suspended_t construct_suspended { };
+}
+
+#endif // CEPH_COMMON_DETAIL_CONSTRUCT_SUSPENDED_H
diff --git a/src/common/dns_resolve.cc b/src/common/dns_resolve.cc
new file mode 100644
index 000000000..a44510d6d
--- /dev/null
+++ b/src/common/dns_resolve.cc
@@ -0,0 +1,372 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <arpa/inet.h>
+
+#include "include/scope_guard.h"
+#include "dns_resolve.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_
+
+using std::map;
+using std::string;
+
+namespace ceph {
+
+// Default ResolvHWrapper implementation: each method forwards its arguments
+// unchanged to the global resolver function of the same name and returns
+// that function's result.
+#ifdef HAVE_RES_NQUERY
+
+int ResolvHWrapper::res_nquery(res_state s, const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return ::res_nquery(s, hostname, cls, type, buf, bufsz);
+}
+
+int ResolvHWrapper::res_nsearch(res_state s, const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return ::res_nsearch(s, hostname, cls, type, buf, bufsz);
+}
+
+#else
+
+// variants without an explicit resolver state
+int ResolvHWrapper::res_query(const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return ::res_query(hostname, cls, type, buf, bufsz);
+}
+
+int ResolvHWrapper::res_search(const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return ::res_search(hostname, cls, type, buf, bufsz);
+}
+
+#endif
+
+// Releases every cached resolver state and the resolv.h wrapper.
+DNSResolver::~DNSResolver()
+{
+#ifdef HAVE_RES_NQUERY
+  for (res_state s : states) {
+    // every cached state went through res_ninit(); let the resolver release
+    // what it attached to the state before the structure itself is freed
+    res_nclose(s);
+    delete s;
+  }
+#endif
+  delete resolv_h;
+}
+
+#ifdef HAVE_RES_NQUERY
+// Hands out a resolver state: a cached one if available, otherwise a newly
+// initialised one.
+//
+// @param[out] ps receives the state on success
+// @returns 0 on success, -EINVAL if res_ninit() fails
+int DNSResolver::get_state(CephContext *cct, res_state *ps)
+{
+  {
+    // reuse a state that was handed back through put_state(), if any
+    std::lock_guard guard(lock);
+    if (!states.empty()) {
+      *ps = states.front();
+      states.pop_front();
+      return 0;
+    }
+  }
+
+  // nothing cached: set up a new state without holding the lock
+  auto *fresh = new struct __res_state;
+  fresh->options = 0;
+  if (res_ninit(fresh) >= 0) {
+    *ps = fresh;
+    return 0;
+  }
+
+  delete fresh;
+  lderr(cct) << "ERROR: failed to call res_ninit()" << dendl;
+  return -EINVAL;
+}
+
+// Returns a resolver state to the cache so that a later get_state() call
+// can reuse it.  The list is modified under the lock.
+void DNSResolver::put_state(res_state s)
+{
+  std::lock_guard l(lock);
+  states.push_back(s);
+}
+#endif
+
+// Looks up the CNAME record of a hostname.
+//
+// @param hostname the name to query
+// @param[out] cname the canonical name; only written when one was found
+// @param[out] found set to true if a CNAME was found, false otherwise
+// @returns 0 on success -- also when the query failed or the answer could
+//          not be expanded, in which case *found stays false -- and
+//          -EINVAL / -EIO if the reply is malformed
+int DNSResolver::resolve_cname(CephContext *cct, const string& hostname,
+    string *cname, bool *found)
+{
+  *found = false;
+
+#ifdef HAVE_RES_NQUERY
+  res_state res;
+  int r = get_state(cct, &res);
+  if (r < 0) {
+    return r;
+  }
+  // hand the state back to the cache on every return path
+  auto put_state = make_scope_guard([res, this] {
+      this->put_state(res);
+    });
+#endif
+
+#define LARGE_ENOUGH_DNS_BUFSIZE 1024
+  unsigned char buf[LARGE_ENOUGH_DNS_BUFSIZE];
+
+#define MAX_FQDN_SIZE 255
+  char host[MAX_FQDN_SIZE + 1];
+  const char *origname = hostname.c_str();
+  unsigned char *pt, *answer;
+  unsigned char *answend;
+  int len;
+
+#ifdef HAVE_RES_NQUERY
+  len = resolv_h->res_nquery(res, origname, ns_c_in, ns_t_cname, buf, sizeof(buf));
+#else
+  {
+# ifndef HAVE_THREAD_SAFE_RES_QUERY
+    std::lock_guard l(lock);
+# endif
+    len = resolv_h->res_query(origname, ns_c_in, ns_t_cname, buf, sizeof(buf));
+  }
+#endif
+  if (len < 0) {
+    lderr(cct) << "res_query() failed" << dendl;
+    return 0;
+  }
+  // A resolver may report the size of the complete reply even when it did
+  // not fit into buf; never parse beyond what buf can hold.
+  if (len > static_cast<int>(sizeof(buf))) {
+    len = sizeof(buf);
+  }
+
+  answer = buf;
+  pt = answer + NS_HFIXEDSZ;  // skip the fixed-size header
+  answend = answer + len;
+
+  /* read query */
+  if ((len = dn_expand(answer, answend, pt, host, sizeof(host))) < 0) {
+    lderr(cct) << "ERROR: dn_expand() failed" << dendl;
+    return -EINVAL;
+  }
+  pt += len;
+
+  // the question name is followed by type and class, 2 bytes each
+  if (pt + 4 > answend) {
+    lderr(cct) << "ERROR: bad reply" << dendl;
+    return -EIO;
+  }
+
+  int type;
+  NS_GET16(type, pt);
+
+  if (type != ns_t_cname) {
+    lderr(cct) << "ERROR: failed response type: type=" << type <<
+      " (was expecting " << ns_t_cname << ")" << dendl;
+    return -EIO;
+  }
+
+  pt += NS_INT16SZ; /* class */
+
+  /* read answer */
+  if ((len = dn_expand(answer, answend, pt, host, sizeof(host))) < 0) {
+    return 0;
+  }
+  pt += len;
+  ldout(cct, 20) << "name=" << host << dendl;
+
+  // type (2) + class (2) + ttl (4) + size (2) precede the record data
+  if (pt + 10 > answend) {
+    lderr(cct) << "ERROR: bad reply" << dendl;
+    return -EIO;
+  }
+
+  NS_GET16(type, pt);
+  pt += NS_INT16SZ; /* class */
+  pt += NS_INT32SZ; /* ttl */
+  pt += NS_INT16SZ; /* size */
+
+  if ((len = dn_expand(answer, answend, pt, host, sizeof(host))) < 0) {
+    return 0;
+  }
+  ldout(cct, 20) << "cname host=" << host << dendl;
+  *cname = host;
+
+  *found = true;
+  return 0;
+}
+
+
+// Public entry point: obtains a resolver state where the build uses one and
+// delegates the lookup to the private overload.
+int DNSResolver::resolve_ip_addr(CephContext *cct, const string& hostname,
+    entity_addr_t *addr) {
+
+#ifndef HAVE_RES_NQUERY
+  // no explicit resolver state in this configuration
+  return this->resolve_ip_addr(cct, nullptr, hostname, addr);
+#else
+  res_state state;
+  if (int err = get_state(cct, &state); err < 0) {
+    return err;
+  }
+  // give the state back to the cache once the lookup is done
+  auto release_state = make_scope_guard([state, this] {
+      this->put_state(state);
+    });
+  return this->resolve_ip_addr(cct, &state, hostname, addr);
+#endif
+
+}
+
+// Resolves a hostname to an address using the given resolver state.
+//
+// An AAAA record is requested when ms_bind_ipv6 is set, an A record
+// otherwise.  The first answer record of the requested type is used.
+//
+// @param res resolver state to query with (only used with HAVE_RES_NQUERY)
+// @param hostname the name to resolve
+// @param[out] addr receives the parsed address
+// @returns 0 on success, a negative value on failure
+int DNSResolver::resolve_ip_addr(CephContext *cct, res_state *res, const string& hostname,
+    entity_addr_t *addr) {
+
+  u_char nsbuf[NS_PACKETSZ];
+  int len;
+  int family = cct->_conf->ms_bind_ipv6 ? AF_INET6 : AF_INET;
+  int type = cct->_conf->ms_bind_ipv6 ? ns_t_aaaa : ns_t_a;
+  // number of rdata bytes inet_ntop() reads for this address family
+  const int addr_len = (family == AF_INET6) ? NS_IN6ADDRSZ : NS_INADDRSZ;
+
+#ifdef HAVE_RES_NQUERY
+  len = resolv_h->res_nquery(*res, hostname.c_str(), ns_c_in, type, nsbuf, sizeof(nsbuf));
+#else
+  {
+# ifndef HAVE_THREAD_SAFE_RES_QUERY
+    std::lock_guard l(lock);
+# endif
+    len = resolv_h->res_query(hostname.c_str(), ns_c_in, type, nsbuf, sizeof(nsbuf));
+  }
+#endif
+  if (len < 0) {
+    lderr(cct) << "res_query() failed" << dendl;
+    return len;
+  }
+  else if (len == 0) {
+    ldout(cct, 20) << "no address found for hostname " << hostname << dendl;
+    return -1;
+  }
+  // A resolver may report the size of the complete reply even when it did
+  // not fit into nsbuf; never parse beyond what nsbuf can hold.
+  if (len > static_cast<int>(sizeof(nsbuf))) {
+    len = sizeof(nsbuf);
+  }
+
+  ns_msg handle;
+  if (ns_initparse(nsbuf, len, &handle) < 0) {
+    lderr(cct) << "error while parsing DNS reply" << dendl;
+    return -EINVAL;
+  }
+
+  // Look for the first answer that really is an address record: the answer
+  // section may start with other records (e.g. a CNAME), whose rdata must
+  // not be interpreted as an address.
+  const int num_answers = ns_msg_count(handle, ns_s_an);
+  ns_rr rr;
+  bool have_addr = false;
+  for (int i = 0; i < num_answers; i++) {
+    int r;
+    if ((r = ns_parserr(&handle, ns_s_an, i, &rr)) < 0) {
+      lderr(cct) << "error while parsing DNS record" << dendl;
+      return r;
+    }
+    if (ns_rr_type(rr) == type && ns_rr_rdlen(rr) >= addr_len) {
+      have_addr = true;
+      break;
+    }
+  }
+  if (!have_addr) {
+    ldout(cct, 20) << "no address found for hostname " << hostname << dendl;
+    return -1;
+  }
+
+  char addr_buf[64];
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(addr_buf, 0, sizeof(addr_buf));
+  if (inet_ntop(family, ns_rr_rdata(rr), addr_buf, sizeof(addr_buf)) == nullptr) {
+    lderr(cct) << "failed to convert address of hostname " << hostname << dendl;
+    return -1;
+  }
+  if (!addr->parse(addr_buf)) {
+    // log the textual form; the rdata itself is binary and not terminated
+    lderr(cct) << "failed to parse address '" << addr_buf << "'" << dendl;
+    return -1;
+  }
+
+  return 0;
+}
+
+// Convenience overload: performs the SRV lookup with an empty domain.
+int DNSResolver::resolve_srv_hosts(CephContext *cct, const string& service_name, 
+    const SRV_Protocol trans_protocol,
+    map<string, DNSResolver::Record> *srv_hosts) {
+  return this->resolve_srv_hosts(cct, service_name, trans_protocol, "", srv_hosts);
+}
+
+// Queries the SRV records of "_<service_name>._<protocol>[.<domain>]" and
+// resolves the address of every target found.
+//
+// @param service_name the service name
+// @param trans_protocol the protocol label to query (tcp or udp)
+// @param domain optional domain appended to the query; may be empty
+// @param[out] srv_hosts filled with one entry per target whose address
+//             could be resolved, keyed by the target name with the service
+//             domain stripped.  Targets that fail to resolve are skipped.
+// @returns 0 on success (also when no host was found), negative value on
+//          failure
+int DNSResolver::resolve_srv_hosts(CephContext *cct, const string& service_name,
+    const SRV_Protocol trans_protocol, const string& domain,
+    map<string, DNSResolver::Record> *srv_hosts) {
+
+#ifdef HAVE_RES_NQUERY
+  res_state res;
+  int r = get_state(cct, &res);
+  if (r < 0) {
+    return r;
+  }
+  // hand the state back to the cache on every return path
+  auto put_state = make_scope_guard([res, this] {
+      this->put_state(res);
+    });
+#endif
+
+  u_char nsbuf[NS_PACKETSZ];
+  int num_hosts;
+
+  string proto_str = srv_protocol_to_str(trans_protocol);
+  string query_str = "_"+service_name+"._"+proto_str+(domain.empty() ? ""
+      : "."+domain);
+  int len;
+
+#ifdef HAVE_RES_NQUERY
+  len = resolv_h->res_nsearch(res, query_str.c_str(), ns_c_in, ns_t_srv, nsbuf,
+      sizeof(nsbuf));
+#else
+  {
+# ifndef HAVE_THREAD_SAFE_RES_QUERY
+    std::lock_guard l(lock);
+# endif
+    len = resolv_h->res_search(query_str.c_str(), ns_c_in, ns_t_srv, nsbuf,
+        sizeof(nsbuf));
+  }
+#endif
+  if (len < 0) {
+    lderr(cct) << "failed for service " << query_str << dendl;
+    return len;
+  }
+  else if (len == 0) {
+    ldout(cct, 20) << "No hosts found for service " << query_str << dendl;
+    return 0;
+  }
+  // A resolver may report the size of the complete reply even when it did
+  // not fit into nsbuf; never parse beyond what nsbuf can hold.
+  if (len > static_cast<int>(sizeof(nsbuf))) {
+    len = sizeof(nsbuf);
+  }
+
+  ns_msg handle;
+
+  if (ns_initparse(nsbuf, len, &handle) < 0) {
+    lderr(cct) << "Error while parsing DNS reply for service " << query_str
+	       << dendl;
+    return -EINVAL;
+  }
+
+  num_hosts = ns_msg_count (handle, ns_s_an);
+  if (num_hosts == 0) {
+    ldout(cct, 20) << "No hosts found for service " << query_str << dendl;
+    return 0;
+  }
+
+  ns_rr rr;
+  char full_target[NS_MAXDNAME];
+  const string protocol = "_" + proto_str;
+
+  for (int i = 0; i < num_hosts; i++) {
+    int r;
+    if ((r = ns_parserr(&handle, ns_s_an, i, &rr)) < 0) {
+      lderr(cct) << "Error while parsing DNS record" << dendl;
+      return r;
+    }
+
+    // SRV rdata is priority, weight and port (16 bit each) followed by the
+    // target name; anything else must not be decoded as such.
+    if (ns_rr_type(rr) != ns_t_srv || ns_rr_rdlen(rr) <= 3 * NS_INT16SZ) {
+      ldout(cct, 20) << "skipping non-SRV or truncated record for service "
+		     << query_str << dendl;
+      continue;
+    }
+
+    // everything after the protocol label of the record name is the domain
+    string full_srv_name = ns_rr_name(rr);
+    auto proto_pos = full_srv_name.find(protocol);
+    if (proto_pos == full_srv_name.npos) {
+      lderr(cct) << "SRV record name does not contain protocol label: "
+		 << full_srv_name << " / " << protocol << dendl;
+      return -EINVAL;
+    }
+    string srv_domain = full_srv_name.substr(proto_pos + protocol.length());
+
+    auto rdata = ns_rr_rdata(rr);
+    uint16_t priority = ns_get16(rdata); rdata += NS_INT16SZ;
+    uint16_t weight = ns_get16(rdata); rdata += NS_INT16SZ;
+    uint16_t port = ns_get16(rdata); rdata += NS_INT16SZ;
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(full_target, 0, sizeof(full_target));
+    if (ns_name_uncompress(ns_msg_base(handle), ns_msg_end(handle),
+                           rdata, full_target, sizeof(full_target)) < 0) {
+      lderr(cct) << "failed to expand SRV target for service " << query_str
+		 << dendl;
+      continue;
+    }
+
+    entity_addr_t addr;
+#ifdef HAVE_RES_NQUERY
+    r = this->resolve_ip_addr(cct, &res, full_target, &addr);
+#else
+    r = this->resolve_ip_addr(cct, NULL, full_target, &addr);
+#endif
+
+    // targets whose address cannot be resolved are left out of the result
+    if (r == 0) {
+      addr.set_port(port);
+      string target = full_target;
+      auto end = target.find(srv_domain);
+      if (end == target.npos) {
+	lderr(cct) << "resolved target not in search domain: "
+		   << target << " / " << srv_domain << dendl;
+	return -EINVAL;
+      }
+      target = target.substr(0, end);
+      (*srv_hosts)[target] = {priority, weight, addr};
+    }
+  }
+  return 0;
+}
+
+}
diff --git a/src/common/dns_resolve.h b/src/common/dns_resolve.h
new file mode 100644
index 000000000..59504e156
--- /dev/null
+++ b/src/common/dns_resolve.h
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_DNS_RESOLVE_H
+#define CEPH_DNS_RESOLVE_H
+
+#include <netinet/in.h>
+#ifndef _WIN32
+#include <resolv.h>
+#endif
+
+#include "common/ceph_mutex.h"
+#include "msg/msg_types.h"		// for entity_addr_t
+
+namespace ceph {
+
+/**
+ * this class is used to facilitate the testing of
+ * resolv.h functions.
+ *
+ * Every method is virtual, so a subclass can replace the actual resolver
+ * call.  Which pair of methods exists depends on HAVE_RES_NQUERY.
+ */
+class ResolvHWrapper {
+  public:
+    virtual ~ResolvHWrapper() {}
+
+#ifdef HAVE_RES_NQUERY
+    // variants taking an explicit resolver state
+    virtual int res_nquery(res_state s, const char *hostname, int cls, int type, 
+        u_char *buf, int bufsz);
+
+    virtual int res_nsearch(res_state s, const char *hostname, int cls, int type, 
+        u_char *buf, int bufsz);
+#else
+    // variants without an explicit resolver state
+    virtual int res_query(const char *hostname, int cls, int type,
+        u_char *buf, int bufsz);
+
+    virtual int res_search(const char *hostname, int cls, int type,
+        u_char *buf, int bufsz);
+#endif
+
+};
+
+
+/**
+ * @class DNSResolver
+ *
+ * This is a singleton class that exposes the functionality of DNS querying.
+ *
+ * All queries go through the ResolvHWrapper owned by the instance.  When
+ * HAVE_RES_NQUERY is defined, resolver states are kept in a list and reused
+ * between calls.
+ */
+class DNSResolver {
+
+  public:
+    // singleton declaration
+    // returns the single instance, which is created on the first call
+    static DNSResolver *get_instance()
+    {
+      static DNSResolver instance;
+      return &instance;
+    }
+    DNSResolver(DNSResolver const&) = delete;
+    void operator=(DNSResolver const&) = delete;
+
+    // this function is used by the unit test
+    // It replaces the wrapper of the singleton: the previous wrapper is
+    // deleted and the instance takes ownership of resolv_wrapper.
+    static DNSResolver *get_instance(ResolvHWrapper *resolv_wrapper) {
+      DNSResolver *resolv = DNSResolver::get_instance();
+      delete resolv->resolv_h;
+      resolv->resolv_h = resolv_wrapper;
+      return resolv;
+    }
+
+    // protocol label used when building SRV queries
+    enum class SRV_Protocol {
+      TCP, UDP
+    };
+
+
+    // one resolved SRV target
+    struct Record {
+      uint16_t priority;   // priority field of the SRV record
+      uint16_t weight;     // weight field of the SRV record
+      entity_addr_t addr;  // resolved address with the SRV port set
+    };
+
+    /**
+     * Looks up the CNAME record of a hostname.
+     *
+     * @param hostname the hostname to query
+     * @param[out] cname the canonical name; only written when one was found
+     * @param[out] found true if a CNAME was found, false otherwise
+     * @returns 0 on success (also when no CNAME was found), negative error
+     *          code on failure
+     */
+    int resolve_cname(CephContext *cct, const std::string& hostname,
+        std::string *cname, bool *found);
+
+    /**
+     * Resolves the address given a hostname.
+     *
+     * @param hostname the hostname to resolved
+     * @param[out] addr the hostname's address
+     * @returns 0 on success, negative error code on failure
+     */
+    int resolve_ip_addr(CephContext *cct, const std::string& hostname,
+        entity_addr_t *addr);
+
+    /**
+     * Returns the list of hostnames and addresses that provide a given
+     * service configured as DNS SRV records.
+     *
+     * @param service_name the service name
+     * @param trans_protocol the IP protocol used by the service (TCP or UDP)
+     * @param[out] srv_hosts the hostname to address map of available hosts
+     *             providing the service. If no host exists the map is not
+     *             changed.
+     * @returns 0 on success, negative error code on failure
+     */
+    int resolve_srv_hosts(CephContext *cct, const std::string& service_name,
+        const SRV_Protocol trans_protocol, std::map<std::string, Record> *srv_hosts);
+
+    /**
+     * Returns the list of hostnames and addresses that provide a given
+     * service configured as DNS SRV records.
+     *
+     * @param service_name the service name
+     * @param trans_protocol the IP protocol used by the service (TCP or UDP)
+     * @param domain the domain of the service
+     * @param[out] srv_hosts the hostname to address map of available hosts
+     *             providing the service. If no host exists the map is not
+     *             changed.
+     * @returns 0 on success, negative error code on failure
+     */
+    int resolve_srv_hosts(CephContext *cct, const std::string& service_name,
+        const SRV_Protocol trans_protocol, const std::string& domain,
+        std::map<std::string, Record> *srv_hosts);
+
+  private:
+    // only reachable through get_instance(); starts with the default wrapper
+    DNSResolver() { resolv_h = new ResolvHWrapper(); }
+    ~DNSResolver();
+
+    // guards the state list; without HAVE_RES_NQUERY it also serializes the
+    // queries unless HAVE_THREAD_SAFE_RES_QUERY is defined
+    ceph::mutex lock = ceph::make_mutex("DNSResolver::lock");
+    // owned; deleted in the destructor
+    ResolvHWrapper *resolv_h;
+#ifdef HAVE_RES_NQUERY
+    // resolver states that are currently not in use
+    std::list<res_state> states;
+
+    // takes a state from the list, or initialises a new one if it is empty
+    int get_state(CephContext *cct, res_state *ps);
+    // puts a state back into the list for reuse
+    void put_state(res_state s);
+#endif
+
+#ifndef _WIN32
+    /* this private function allows to reuse the res_state structure used
+     * by other function of this class
+     */
+    int resolve_ip_addr(CephContext *cct, res_state *res,
+        const std::string& hostname, entity_addr_t *addr);
+#endif
+
+    // returns the lower-case protocol label used in SRV query names
+    std::string srv_protocol_to_str(SRV_Protocol proto) {
+      switch (proto) {
+        case SRV_Protocol::TCP:
+          return "tcp";
+        case SRV_Protocol::UDP:
+          return "udp";
+      }
+      return "";
+    }
+
+};
+
+}
+
+#endif
+
diff --git a/src/common/dout.cc b/src/common/dout.cc
new file mode 100644
index 000000000..4bbbfc8fc
--- /dev/null
+++ b/src/common/dout.cc
@@ -0,0 +1,14 @@
+
+#include <iostream>
+
+void dout_emergency(const char * const str)
+{
+  // Emit the text on std::cerr and flush it in the same statement.
+  std::cerr << str << std::flush;
+}
+
+void dout_emergency(const std::string &str)
+{
+  // Emit the text on std::cerr and flush it in the same statement.
+  std::cerr << str << std::flush;
+}
diff --git a/src/common/dout.h b/src/common/dout.h
new file mode 100644
index 000000000..4cd60efff
--- /dev/null
+++ b/src/common/dout.h
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_DOUT_H
+#define CEPH_DOUT_H
+
+#include <type_traits>
+
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+#include <seastar/util/log.hh>
+#include "crimson/common/log.h"
+#include "crimson/common/config_proxy.h"
+#else
+#include "global/global_context.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "common/likely.h"
+#include "common/Clock.h"
+#include "log/Log.h"
+#endif
+
+extern void dout_emergency(const char * const str);
+extern void dout_emergency(const std::string &str);
+
+// intentionally conflict with endl
+class _bad_endl_use_dendl_t { public: _bad_endl_use_dendl_t(int) {} };
+static const _bad_endl_use_dendl_t endl = 0;
+inline std::ostream& operator<<(std::ostream& out, _bad_endl_use_dendl_t) {
+  ceph_abort_msg("you are using the wrong endl.. use std::endl or dendl");
+  return out;
+}
+
+class DoutPrefixProvider {
+public:
+  virtual std::ostream& gen_prefix(std::ostream& out) const = 0;
+  virtual CephContext *get_cct() const = 0;
+  virtual unsigned get_subsys() const = 0;
+  virtual ~DoutPrefixProvider() {}
+};
+
+inline std::ostream &operator<<(
+  std::ostream &lhs, const DoutPrefixProvider &dpp) {
+  return dpp.gen_prefix(lhs);
+}
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<DoutPrefixProvider> : fmt::ostream_formatter {};
+#endif
+
+// A prefix provider that adds no prefix; it only carries the cct and subsys.
+class NoDoutPrefix : public DoutPrefixProvider {
+  CephContext *const cct;
+  const unsigned subsys;
+ public:
+  NoDoutPrefix(CephContext *cct, unsigned subsys) : cct(cct), subsys(subsys) {}
+  // gen_prefix() returns the stream untouched, i.e. it writes nothing.
+  std::ostream& gen_prefix(std::ostream& out) const override { return out; }
+  CephContext *get_cct() const override { return cct; }
+  unsigned get_subsys() const override { return subsys; }
+};
+
+// A prefix provider with a static (const char*) prefix; only the pointer is kept.
+class DoutPrefix : public NoDoutPrefix {
+  const char *const prefix;
+ public:
+  DoutPrefix(CephContext *cct, unsigned subsys, const char *prefix)
+    : NoDoutPrefix(cct, subsys), prefix(prefix) {}
+  // Streams the stored prefix string to out and returns the stream.
+  std::ostream& gen_prefix(std::ostream& out) const override {
+    return out << prefix;
+  }
+};
+
+// A prefix provider that composes itself on top of another (held by reference).
+class DoutPrefixPipe : public DoutPrefixProvider {
+  const DoutPrefixProvider& dpp;
+ public:
+  DoutPrefixPipe(const DoutPrefixProvider& dpp) : dpp(dpp) {}
+  // Writes the wrapped provider's prefix first, then this class's addition.
+  std::ostream& gen_prefix(std::ostream& out) const override final {
+    dpp.gen_prefix(out);
+    add_prefix(out);
+    return out;
+  }
+  CephContext *get_cct() const override { return dpp.get_cct(); }
+  unsigned get_subsys() const override { return dpp.get_subsys(); }
+  // Subclasses implement this to append their own text after dpp's prefix.
+  virtual void add_prefix(std::ostream& out) const = 0;
+};
+
+// helpers
+namespace ceph::dout {
+
+template<typename T>
+struct dynamic_marker_t {
+  T value;
+  // constexpr ctor isn't needed as it's an aggregate type
+  constexpr operator T() const { return value; }
+};
+
+template<typename T>
+constexpr dynamic_marker_t<T> need_dynamic(T&& t) {
+  return dynamic_marker_t<T>{ std::forward<T>(t) };
+}
+
+template<typename T>
+struct is_dynamic : public std::false_type {};
+
+template<typename T>
+struct is_dynamic<dynamic_marker_t<T>> : public std::true_type {};
+
+} // ceph::dout
+
+// generic macros
+#define dout_prefix *_dout
+
+#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN)
+#define dout_impl(cct, sub, v)                                          \
+  do {                                                                  \
+    if (crimson::common::local_conf()->subsys.should_gather(sub, v)) {  \
+      seastar::logger& _logger = crimson::get_logger(sub);              \
+      const auto _lv = v;                                               \
+      std::ostringstream _out;                                          \
+      std::ostream* _dout = &_out;
+#define dendl_impl                              \
+     "";                                        \
+      _logger.log(crimson::to_log_level(_lv),   \
+                  "{}", _out.str().c_str());    \
+    }                                           \
+  } while (0)
+#else
+#define dout_impl(cct, sub, v)						\
+  do {									\
+  const bool should_gather = [&](const auto cctX) {			\
+    if constexpr (ceph::dout::is_dynamic<decltype(sub)>::value ||	\
+		  ceph::dout::is_dynamic<decltype(v)>::value) {		\
+      return cctX->_conf->subsys.should_gather(sub, v);			\
+    } else {								\
+      /* The parentheses are **essential** because commas in angle	\
+       * brackets are NOT ignored on macro expansion! A language's	\
+       * limitation, sorry. */						\
+      return (cctX->_conf->subsys.template should_gather<sub, v>());	\
+    }									\
+  }(cct);								\
+									\
+  if (should_gather) {							\
+    ceph::logging::MutableEntry _dout_e(v, sub);                        \
+    static_assert(std::is_convertible<decltype(&*cct), 			\
+				      CephContext* >::value,		\
+		  "provided cct must be compatible with CephContext*"); \
+    auto _dout_cct = cct;						\
+    std::ostream* _dout = &_dout_e.get_ostream();
+
+#define dendl_impl std::flush;                                          \
+    _dout_cct->_log->submit_entry(std::move(_dout_e));                  \
+  }                                                                     \
+  } while (0)
+#endif	// WITH_SEASTAR
+
+#define lsubdout(cct, sub, v)  dout_impl(cct, ceph_subsys_##sub, v) dout_prefix
+#define ldout(cct, v)  dout_impl(cct, dout_subsys, v) dout_prefix
+#define lderr(cct) dout_impl(cct, ceph_subsys_, -1) dout_prefix
+
+#define ldpp_subdout(dpp, sub, v) 						\
+  if (decltype(auto) pdpp = (dpp); pdpp) /* workaround -Wnonnull-compare for 'this' */ \
+    dout_impl(pdpp->get_cct(), ceph_subsys_##sub, v) \
+      pdpp->gen_prefix(*_dout)
+
+#define ldpp_dout(dpp, v) 						\
+  if (decltype(auto) pdpp = (dpp); pdpp) /* workaround -Wnonnull-compare for 'this' */ \
+    dout_impl(pdpp->get_cct(), ceph::dout::need_dynamic(pdpp->get_subsys()), v) \
+      pdpp->gen_prefix(*_dout)
+
+#define lgeneric_subdout(cct, sub, v) dout_impl(cct, ceph_subsys_##sub, v) *_dout
+#define lgeneric_dout(cct, v) dout_impl(cct, ceph_subsys_, v) *_dout
+#define lgeneric_derr(cct) dout_impl(cct, ceph_subsys_, -1) *_dout
+
+#define ldlog_p1(cct, sub, lvl)                 \
+  (cct->_conf->subsys.should_gather((sub), (lvl)))
+
+#define dendl dendl_impl
+
+#endif
diff --git a/src/common/dummy.cc b/src/common/dummy.cc
new file mode 100644
index 000000000..262671426
--- /dev/null
+++ b/src/common/dummy.cc
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/*
+ * A dummy file with a .cc extension to make autotools link
+ * ceph_test_librbd_fsx with a C++ linker.  An approach w/o a physical
+ * dummy.cc recommended in 8.3.5 Libtool Convenience Libraries works,
+ * but breaks 'make tags' and friends.
+ */
diff --git a/src/common/entity_name.cc b/src/common/entity_name.cc
new file mode 100644
index 000000000..5357b34ea
--- /dev/null
+++ b/src/common/entity_name.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/entity_name.h"
+#include "common/ceph_strings.h"
+
+#include <sstream>
+
+using std::string;
+
+
+const std::array<EntityName::str_to_entity_type_t, 6> EntityName::STR_TO_ENTITY_TYPE = {{
+  { CEPH_ENTITY_TYPE_AUTH, "auth" },
+  { CEPH_ENTITY_TYPE_MON, "mon" },
+  { CEPH_ENTITY_TYPE_OSD, "osd" },
+  { CEPH_ENTITY_TYPE_MDS, "mds" },
+  { CEPH_ENTITY_TYPE_MGR, "mgr" },
+  { CEPH_ENTITY_TYPE_CLIENT, "client" },
+}};
+
+// Cached "<type name>.<id>" string; empty when the type is 0 (see set()).
+const std::string& EntityName::to_str() const
+{
+  return type_id;
+}
+
+// C-string pointer into the same cached name that to_str() returns.
+const char* EntityName::to_cstr() const
+{
+  return type_id.c_str();
+}
+
+// Parse "<type>.<id>", splitting at the first '.'. Returns false when there
+// is no '.' or when set() rejects the type; true otherwise.
+bool EntityName::from_str(std::string_view s)
+{
+  const auto dot = s.find('.');
+  if (dot == std::string_view::npos) {
+    return false;
+  }
+
+  const std::string_view type_part = s.substr(0, dot);
+  const std::string_view id_part = s.substr(dot + 1);
+  // set() yields 0 on success and a non-zero error code otherwise.
+  return set(type_part, id_part) == 0;
+}
+
+// Store the type and id, then rebuild the cached "<type name>.<id>" string.
+// A type of 0 leaves the cached string empty.
+void EntityName::set(uint32_t type_, std::string_view id_)
+{
+  type = type_;
+  id = id_;
+
+  if (type == 0) {
+    type_id.clear();
+    return;
+  }
+  std::ostringstream composed;
+  composed << ceph_entity_type_name(type_) << "." << id_;
+  type_id = composed.str();
+}
+
+// Look the type up by name; -EINVAL if it is not a known entity type.
+int EntityName::set(std::string_view type_, std::string_view id_)
+{
+  const uint32_t resolved = str_to_ceph_entity_type(type_);
+  if (resolved == CEPH_ENTITY_TYPE_ANY)
+    return -EINVAL;
+  set(resolved, id_);
+  return 0;
+}
+
+// Change only the type; the current id is passed back through set().
+void EntityName::set_type(uint32_t type_)
+{
+  set(type_, id);
+}
+
+// Change only the type, given by name; returns whatever set() returns.
+int EntityName::set_type(std::string_view type_)
+{
+  return set(type_, id);
+}
+
+// Change only the id; the current type is passed back through set().
+void EntityName::set_id(std::string_view id_)
+{
+  set(type, id_);
+}
+
+void EntityName::set_name(entity_name_t n)
+{
+  // Format the numeric part with std::to_string instead of sprintf into a
+  // fixed-size stack buffer; the text is the same as "%lld" would produce.
+  set(n.type(), std::to_string(static_cast<long long>(n.num())));
+}
+
+// Type name as reported by ceph_entity_type_name() for the stored type.
+const char* EntityName::get_type_str() const
+{
+  return ceph_entity_type_name(type);
+}
+
+// Same lookup as get_type_str(), returned as a std::string_view.
+std::string_view EntityName::get_type_name() const
+{
+  return ceph_entity_type_name(type);
+}
+
+// The id part of the name, as last stored by set().
+const std::string& EntityName::get_id() const
+{
+  return id;
+}
+
+// True when the id is exactly "admin".
+bool EntityName::has_default_id() const
+{
+  return id == "admin";
+}
+
+// Join every name in STR_TO_ENTITY_TYPE, in table order, into a single
+// ", "-separated string.
+std::string EntityName::get_valid_types_as_str()
+{
+  std::ostringstream joined;
+  const char *separator = "";
+  for (const auto& entry : STR_TO_ENTITY_TYPE) {
+    // Empty before the first name, ", " before every later one.
+    joined << separator << entry.str;
+    separator = ", ";
+  }
+  return joined.str();
+}
+
+// Linear search of STR_TO_ENTITY_TYPE; CEPH_ENTITY_TYPE_ANY means no match.
+uint32_t EntityName::str_to_ceph_entity_type(std::string_view s)
+{
+  for (const auto& entry : STR_TO_ENTITY_TYPE) {
+    if (s == entry.str)
+      return entry.type;
+  }
+  return CEPH_ENTITY_TYPE_ANY;
+}
+
+bool operator<(const EntityName& a, const EntityName& b) {
+  // Order by type first; ids only break ties between equal types.
+  return (a.type != b.type) ? (a.type < b.type) : (a.id < b.id);
+}
+
+std::ostream& operator<<(std::ostream& out, const EntityName& n) {
+  out << n.to_str();  // the cached "<type name>.<id>" text
+  return out;
+}
diff --git a/src/common/entity_name.h b/src/common/entity_name.h
new file mode 100644
index 000000000..c88ebcbba
--- /dev/null
+++ b/src/common/entity_name.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ENTITY_NAME_H
+#define CEPH_COMMON_ENTITY_NAME_H
+
+#include <string_view>
+
+#include <ifaddrs.h>
+
+#include "msg/msg_types.h"
+
+/* Represents a Ceph entity name.
+ *
+ * For example, mds.0 is the name of the first metadata server, and
+ * client.admin is a client whose id is the default id "admin".
+ */
+struct EntityName
+{
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(type, bl);
+    encode(id, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    using ceph::decode;
+    uint32_t type_;
+    std::string id_;
+    decode(type_, bl);
+    decode(id_, bl);
+    set(type_, id_);
+  }
+  // Only type and id are encoded; decode() hands both to set().
+  const std::string& to_str() const;
+  const char *to_cstr() const;
+  bool from_str(std::string_view s);
+  void set(uint32_t type_, std::string_view id_);
+  int set(std::string_view type_, std::string_view id_);
+  void set_type(uint32_t type_);
+  int set_type(std::string_view type);
+  void set_id(std::string_view id_);
+  void set_name(entity_name_t n);
+
+  const char* get_type_str() const;
+
+  uint32_t get_type() const { return type; }
+  bool is_osd() const { return get_type() == CEPH_ENTITY_TYPE_OSD; }
+  bool is_mgr() const { return get_type() == CEPH_ENTITY_TYPE_MGR; }
+  bool is_mds() const { return get_type() == CEPH_ENTITY_TYPE_MDS; }
+  bool is_client() const { return get_type() == CEPH_ENTITY_TYPE_CLIENT; }
+  bool is_mon() const { return get_type() == CEPH_ENTITY_TYPE_MON; }
+
+  std::string_view get_type_name() const;
+  const std::string &get_id() const;
+  bool has_default_id() const;
+
+  static std::string get_valid_types_as_str();
+  static uint32_t str_to_ceph_entity_type(std::string_view);
+  // operator< (entity_name.cc) and operator== below compare type and id only.
+  friend bool operator<(const EntityName& a, const EntityName& b);
+  friend std::ostream& operator<<(std::ostream& out, const EntityName& n);
+
+  bool operator==(const EntityName& rhs) const noexcept {
+    return type == rhs.type && id == rhs.id;
+  }
+
+private:
+  struct str_to_entity_type_t {
+    uint32_t type;
+    const char *str;
+  };
+  static const std::array<str_to_entity_type_t, 6> STR_TO_ENTITY_TYPE;
+  // type_id is the printable name returned by to_str() and to_cstr().
+  uint32_t type = 0;
+  std::string id;
+  std::string type_id;
+};
+
+WRITE_CLASS_ENCODER(EntityName)
+
+#endif
diff --git a/src/common/environment.cc b/src/common/environment.cc
new file mode 100644
index 000000000..a71bb3466
--- /dev/null
+++ b/src/common/environment.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/environment.h"
+
+#include <stdlib.h>
+#include <strings.h>
+
+// True when the variable is set to anything other than "off", "no",
+// "false" or "0" (compared case-insensitively); false when it is unset.
+bool get_env_bool(const char *key)
+{
+  const char *val = getenv(key);
+  if (val == nullptr) {
+    return false;
+  }
+  static const char *const falsy[] = { "off", "no", "false", "0" };
+  for (const char *word : falsy) {
+    if (strcasecmp(val, word) == 0)
+      return false;
+  }
+  return true;
+}
+
+// Value of the variable parsed with atoi(); 0 when it is unset.
+int get_env_int(const char *key)
+{
+  if (const char *val = getenv(key)) {
+    return atoi(val);
+  }
+  return 0;
+}
diff --git a/src/common/environment.h b/src/common/environment.h
new file mode 100644
index 000000000..9967a0ba4
--- /dev/null
+++ b/src/common/environment.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ENVIRONMENT_H
+#define CEPH_COMMON_ENVIRONMENT_H
+
+extern bool get_env_bool(const char *key);
+extern int get_env_int(const char *key);
+
+#endif
diff --git a/src/common/errno.cc b/src/common/errno.cc
new file mode 100644
index 000000000..83992361c
--- /dev/null
+++ b/src/common/errno.cc
@@ -0,0 +1,22 @@
+#include "common/errno.h"
+#include "acconfig.h"
+#include "include/compat.h"
+
+#include <sstream>
+#include <string.h>
+
+// Format an errno value as "(<number>) <message>". A negative argument is
+// negated first, so both err and -err produce the same text.
+std::string cpp_strerror(int err)
+{
+  if (err < 0) {
+    err = -err;
+  }
+  char buf[128];
+  // The message is read through the returned pointer, not from buf directly.
+  const char *errmsg = ceph_strerror_r(err, buf, sizeof(buf));
+
+  std::ostringstream oss;
+  oss << "(" << err << ") " << errmsg;
+  return oss.str();
+}
diff --git a/src/common/errno.h b/src/common/errno.h
new file mode 100644
index 000000000..8f967d088
--- /dev/null
+++ b/src/common/errno.h
@@ -0,0 +1,16 @@
+#ifndef CEPH_ERRNO_H
+#define CEPH_ERRNO_H
+
+#include <string>
+
+/* Return a given error code as a string */
+std::string cpp_strerror(int err);
+
+#ifdef _WIN32
+// While cpp_strerror handles errors defined in errno.h, this one
+// accepts standard Windows error codes.
+std::string win32_strerror(int err);
+std::string win32_lasterror_str();
+#endif /* _WIN32 */
+
+#endif
diff --git a/src/common/error_code.cc b/src/common/error_code.cc
new file mode 100644
index 000000000..60086c550
--- /dev/null
+++ b/src/common/error_code.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc. <contact@redhat.com>
+ *
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version
+ * 2.1, as published by the Free Software Foundation.  See file
+ * COPYING.
+ */
+
+#include <exception>
+
+#include "common/error_code.h"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+
+using boost::system::error_category;
+using boost::system::error_condition;
+using boost::system::generic_category;
+using boost::system::system_category;
+
+namespace ceph {
+
+// A category for error conditions particular to Ceph
+
+class ceph_error_category : public converting_category {
+public:
+  ceph_error_category(){}
+  const char* name() const noexcept override;
+  using converting_category::message;
+  std::string message(int ev) const override;
+  const char* message(int ev, char*, std::size_t) const noexcept override;
+  using converting_category::equivalent;
+  bool equivalent(const boost::system::error_code& c,
+		  int ev) const noexcept override;
+  int from_code(int ev) const noexcept override;
+};
+
+const char* ceph_error_category::name() const noexcept {
+  return "ceph";
+}
+
+// Fixed description for a ceph::errc value. The buffer parameters are
+// unnamed and unused: every message is a string literal.
+const char* ceph_error_category::message(int ev, char*,
+					 std::size_t) const noexcept {
+  if (ev == 0) {
+    return "No error";
+  }
+  switch (static_cast<errc>(ev)) {
+  case errc::not_in_map:
+    return "Map does not contain requested entry.";
+  case errc::does_not_exist:
+    return "Item does not exist";
+  case errc::failure:
+    return "An internal fault or inconsistency occurred";
+  case errc::exists:
+    return "Already exists";
+  case errc::limit_exceeded:
+    return "Attempt to use too much";
+  case errc::auth:
+    return "Authentication error";
+  case errc::conflict:
+    return "Conflict detected or precondition failed";
+  }
+  return "Unknown error.";
+}
+
+std::string ceph_error_category::message(int ev) const {
+  return message(ev, nullptr, 0);
+}
+
+// Decide whether error code `c` matches the ceph::errc condition value `ev`.
+// Only system-category codes are considered; each errno below is paired
+// with the condition(s) it stands for. Everything else is not equivalent.
+bool ceph_error_category::equivalent(const boost::system::error_code& c,
+				     int ev) const noexcept {
+  if (!(c.category() == system_category())) {
+    return false;
+  }
+
+  // True when `ev` is the integer value of the given condition.
+  const auto is = [ev](errc cond) { return ev == static_cast<int>(cond); };
+
+  switch (c.value()) {
+  case boost::system::errc::no_such_file_or_directory:
+    // Blargh. A bunch of stuff returns ENOENT now, so just to be safe.
+    return is(errc::not_in_map) || is(errc::does_not_exist);
+
+  case boost::system::errc::io_error:
+    return is(errc::failure);
+
+  case boost::system::errc::file_exists:
+    return is(errc::exists);
+
+  case boost::system::errc::no_space_on_device:
+  case boost::system::errc::invalid_argument:
+    return is(errc::limit_exceeded);
+
+  case boost::system::errc::operation_not_permitted:
+    return is(errc::conflict);
+
+  default:
+    return false;
+  }
+}
+
+// Translate a ceph::errc value into a negative errno; 0 stays 0 and any
+// value not listed below gives -EDOM.
+int ceph_error_category::from_code(int ev) const noexcept {
+  if (ev == 0)
+    return 0;
+  switch (static_cast<errc>(ev)) {
+  case errc::not_in_map:
+  case errc::does_not_exist:
+    // What we use now.
+    return -ENOENT;
+  case errc::failure:
+  case errc::limit_exceeded:
+    return -EIO;
+  case errc::exists:
+    return -EEXIST;
+  case errc::auth:
+    return -EACCES;
+  case errc::conflict:
+    return -EINVAL;
+  }
+  return -EDOM;
+}
+
+const error_category& ceph_category() noexcept {
+  static const ceph_error_category instance;  // function-local static object
+  return instance;
+}
+
+
+// This is part of the glue for hooking new code to old. Since
+// Context* and other things give us integer codes from errno, wrap
+// them in an error_code.
+[[nodiscard]] boost::system::error_code to_error_code(int ret) noexcept
+{
+  if (ret != 0) {
+    return { std::abs(ret), boost::system::system_category() };
+  }
+  return {};  // zero maps to a default-constructed error_code
+}
+
+// This is more complicated. For the case of categories defined
+// elsewhere, we have to convert everything here.
+[[nodiscard]] int from_error_code(boost::system::error_code e) noexcept
+{
+  if (!e)
+    return 0;
+  const boost::system::error_category& cat = e.category();
+
+  // For categories we define
+  if (auto conv = dynamic_cast<const converting_category*>(&cat)) {
+    return conv->from_code(e.value());
+  }
+  // For categories matching values of errno
+  const bool errno_valued =
+    cat == boost::system::system_category() ||
+    cat == boost::system::generic_category() ||
+    // ASIO uses the system category for these and matches system
+    // error values.
+    cat == boost::asio::error::get_netdb_category() ||
+    cat == boost::asio::error::get_addrinfo_category();
+  if (errno_valued)
+    return -e.value();
+
+  if (cat == boost::asio::error::get_misc_category()) {
+    // These values are specific to asio
+    switch (e.value()) {
+    case boost::asio::error::already_open:
+    case boost::asio::error::eof:
+      return -EIO;
+    case boost::asio::error::not_found:
+      return -ENOENT;
+    case boost::asio::error::fd_set_failure:
+      return -EINVAL;
+    }
+  }
+  // Add any other categories we use here.
+  // Marcus likes this as a sentinel for 'Error code? What error code?'
+  return -EDOM;
+}
+}
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
diff --git a/src/common/error_code.h b/src/common/error_code.h
new file mode 100644
index 000000000..6bcd8cb17
--- /dev/null
+++ b/src/common/error_code.h
@@ -0,0 +1,151 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc. <contact@redhat.com>
+ *
+ * Author: Adam C. Emerson <aemerson@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License version
+ * 2.1, as published by the Free Software Foundation.  See file
+ * COPYING.
+ */
+
+#ifndef COMMON_CEPH_ERROR_CODE
+#define COMMON_CEPH_ERROR_CODE
+
+#include <netdb.h>
+
+#include <boost/system/error_code.hpp>
+#include <boost/asio.hpp>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wnon-virtual-dtor"
+
+namespace ceph {
+
+// This is for error categories we define, so we can specify the
+// equivalent integral value at the point of definition.
+class converting_category : public boost::system::error_category {
+public:
+  virtual int from_code(int code) const noexcept = 0;  // integer equivalent of `code`
+};
+
+const boost::system::error_category& ceph_category() noexcept;
+
+enum class errc {
+  not_in_map = 1, // The requested item was not found in the map
+  does_not_exist, // Item does not exist
+  failure, // An internal fault or inconsistency
+  exists, // Already exists
+  limit_exceeded, // Attempting to use too much of something
+  auth, // May not be an auth failure. It could be that the
+	// preconditions to attempt auth failed.
+  conflict, // Conflict or precondition failure
+};
+}
+
+namespace boost::system {
+template<>
+struct is_error_condition_enum<::ceph::errc> {
+  static const bool value = true;
+};
+template<>
+struct is_error_code_enum<::ceph::errc> {
+  static const bool value = false;
+};
+}
+
+namespace ceph {
+//  explicit conversion:
+inline boost::system::error_code make_error_code(errc e) noexcept {
+  return { static_cast<int>(e), ceph_category() };
+}
+
+// implicit conversion:
+inline boost::system::error_condition make_error_condition(errc e) noexcept {
+  return { static_cast<int>(e), ceph_category() };
+}
+
+[[nodiscard]] boost::system::error_code to_error_code(int ret) noexcept;
+[[nodiscard]] int from_error_code(boost::system::error_code e) noexcept;
+}
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
+
+// Moved here from buffer.h so librados doesn't gain a dependency on
+// Boost.System
+
+namespace ceph::buffer {
+inline namespace v15_2_0 {
+const boost::system::error_category& buffer_category() noexcept;
+enum class errc { bad_alloc = 1,
+		  end_of_buffer,
+		  malformed_input };
+}
+}
+
+namespace boost::system {
+template<>
+struct is_error_code_enum<::ceph::buffer::errc> {
+  static const bool value = true;
+};
+
+template<>
+struct is_error_condition_enum<::ceph::buffer::errc> {
+  static const bool value = false;
+};
+}
+
+namespace ceph::buffer {
+inline namespace v15_2_0 {
+
+// implicit conversion:
+inline boost::system::error_code make_error_code(errc e) noexcept {
+  return { static_cast<int>(e), buffer_category() };
+}
+
+// explicit conversion:
+inline boost::system::error_condition
+make_error_condition(errc e) noexcept {
+  return { static_cast<int>(e), buffer_category() };
+}
+
+struct error : boost::system::system_error {
+  using system_error::system_error;
+};
+
+struct bad_alloc : public error {
+  bad_alloc() : error(errc::bad_alloc) {}
+  bad_alloc(const char* what_arg) : error(errc::bad_alloc, what_arg) {}
+  bad_alloc(const std::string& what_arg) : error(errc::bad_alloc, what_arg) {}
+};
+struct end_of_buffer : public error {
+  end_of_buffer() : error(errc::end_of_buffer) {}
+  end_of_buffer(const char* what_arg) : error(errc::end_of_buffer, what_arg) {}
+  end_of_buffer(const std::string& what_arg)
+    : error(errc::end_of_buffer, what_arg) {}
+};
+
+struct malformed_input : public error {
+  malformed_input() : error(errc::malformed_input) {}
+  malformed_input(const char* what_arg)
+    : error(errc::malformed_input, what_arg) {}
+  malformed_input(const std::string& what_arg)
+    : error(errc::malformed_input, what_arg) {}
+};
+struct error_code : public error {  // stores -r as a system_category code
+  error_code(int r) : error(-r, boost::system::system_category()) {}
+  error_code(int r, const char* what_arg)
+    : error(-r, boost::system::system_category(), what_arg) {}
+  error_code(int r, const std::string& what_arg)
+    : error(-r, boost::system::system_category(), what_arg) {}
+};
+}
+}
+
+#endif // COMMON_CEPH_ERROR_CODE
diff --git a/src/common/escape.cc b/src/common/escape.cc
new file mode 100644
index 000000000..67d68326c
--- /dev/null
+++ b/src/common/escape.cc
@@ -0,0 +1,286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/escape.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <iomanip>
+#include <boost/optional.hpp>
+
+/*
+ * Some functions for escaping RGW responses
+ */
+
+/* Static string length */
+#define SSTRL(x) ((sizeof(x)/sizeof(x[0])) - 1)
+
+#define LESS_THAN_XESCAPE		"&lt;"
+#define AMPERSAND_XESCAPE		"&amp;"
+#define GREATER_THAN_XESCAPE		"&gt;"
+#define SGL_QUOTE_XESCAPE		"&apos;"
+#define DBL_QUOTE_XESCAPE		"&quot;"
+
+/* Size of the buffer escape_xml_attr() needs for 'buf', counting the
+ * terminating NUL. */
+size_t escape_xml_attr_len(const char *buf)
+{
+	size_t needed = 1;	// leave room for null terminator
+	for (const char *src = buf; *src; ++src) {
+		const unsigned char c = *src;
+		switch (c) {
+		case '<':
+			needed += SSTRL(LESS_THAN_XESCAPE);
+			break;
+		case '&':
+			needed += SSTRL(AMPERSAND_XESCAPE);
+			break;
+		case '>':
+			needed += SSTRL(GREATER_THAN_XESCAPE);
+			break;
+		case '\'':
+			needed += SSTRL(SGL_QUOTE_XESCAPE);
+			break;
+		case '"':
+			needed += SSTRL(DBL_QUOTE_XESCAPE);
+			break;
+		default:
+			// Escape control characters.
+			if (((c < 0x20) && (c != 0x09) && (c != 0x0a)) ||
+				    (c == 0x7f)) {
+				// "&#xNN;" is six bytes
+				needed += 6;
+			} else {
+				needed++;
+			}
+			break;
+		}
+	}
+	return needed;
+}
+
+/* Write the XML-attribute-escaped form of 'buf' to 'out', followed by a
+ * NUL. 'out' must be large enough; size it with escape_xml_attr_len(). */
+void escape_xml_attr(const char *buf, char *out)
+{
+	char *dst = out;
+	for (const char *src = buf; *src; ++src) {
+		const unsigned char c = *src;
+		switch (c) {
+		case '<':
+			memcpy(dst, LESS_THAN_XESCAPE, SSTRL(LESS_THAN_XESCAPE));
+			dst += SSTRL(LESS_THAN_XESCAPE);
+			break;
+		case '&':
+			memcpy(dst, AMPERSAND_XESCAPE, SSTRL(AMPERSAND_XESCAPE));
+			dst += SSTRL(AMPERSAND_XESCAPE);
+			break;
+		case '>':
+			memcpy(dst, GREATER_THAN_XESCAPE, SSTRL(GREATER_THAN_XESCAPE));
+			dst += SSTRL(GREATER_THAN_XESCAPE);
+			break;
+		case '\'':
+			memcpy(dst, SGL_QUOTE_XESCAPE, SSTRL(SGL_QUOTE_XESCAPE));
+			dst += SSTRL(SGL_QUOTE_XESCAPE);
+			break;
+		case '"':
+			memcpy(dst, DBL_QUOTE_XESCAPE, SSTRL(DBL_QUOTE_XESCAPE));
+			dst += SSTRL(DBL_QUOTE_XESCAPE);
+			break;
+		default:
+			// Escape control characters.
+			if (((c < 0x20) && (c != 0x09) && (c != 0x0a)) ||
+				    (c == 0x7f)) {
+				snprintf(dst, 7, "&#x%02x;", c);
+				dst += 6;
+			} else {
+				*dst++ = c;
+			}
+			break;
+		}
+	}
+	// null terminator
+	*dst = '\0';
+}
+
+// Applies hex formatting ('0' fill, hex base) on construction; restores both on destruction.
+struct hex_formatter {
+  std::ostream& out;
+  const char old_fill;
+  const std::ostream::fmtflags old_flags;
+  // fill() and setf() each return the previous setting, kept for the destructor.
+  explicit hex_formatter(std::ostream& out)
+    : out(out),
+      old_fill(out.fill('0')),
+      old_flags(out.setf(out.hex, out.basefield))
+  {}
+  ~hex_formatter() {
+    out.fill(old_fill);
+    out.flags(old_flags);
+  }
+};
+
+// Write e.str to out with XML attribute escaping: the five markup
+// characters become named entities and control characters (other than
+// tab and newline) become "&#xNN;" references.
+std::ostream& operator<<(std::ostream& out, const xml_stream_escaper& e)
+{
+  // Engaged on the first control character; from then on the stream stays
+  // in '0'-filled hex mode until this function returns.
+  boost::optional<hex_formatter> hex_mode;
+
+  for (const unsigned char ch : e.str) {
+    // Markup characters first.
+    if (ch == '<') {
+      out << LESS_THAN_XESCAPE;
+    } else if (ch == '&') {
+      out << AMPERSAND_XESCAPE;
+    } else if (ch == '>') {
+      out << GREATER_THAN_XESCAPE;
+    } else if (ch == '\'') {
+      out << SGL_QUOTE_XESCAPE;
+    } else if (ch == '"') {
+      out << DBL_QUOTE_XESCAPE;
+    } else if (((ch < 0x20) && (ch != 0x09) && (ch != 0x0a)) || (ch == 0x7f)) {
+      // Escape control characters.
+      if (!hex_mode) {
+        hex_mode.emplace(out); // enable hex formatting
+      }
+      // setw() only affects the next insertion, so it is repeated each time.
+      out << "&#x" << std::setw(2) << static_cast<unsigned int>(ch) << ';';
+    } else {
+      // Everything else is copied through unchanged.
+      out << ch;
+    }
+  }
+
+  return out;
+}
+
+#define DBL_QUOTE_JESCAPE "\\\""
+#define BACKSLASH_JESCAPE "\\\\"
+#define TAB_JESCAPE "\\t"
+#define NEWLINE_JESCAPE "\\n"
+
+/* Size of the buffer escape_json_attr() needs for the first 'src_len'
+ * bytes of 'buf', counting the terminating NUL. */
+size_t escape_json_attr_len(const char *buf, size_t src_len)
+{
+	size_t needed = 1;	// leave room for null terminator
+	for (size_t pos = 0; pos < src_len; ++pos) {
+		const unsigned char c = buf[pos];
+		switch (c) {
+		case '"':
+			needed += SSTRL(DBL_QUOTE_JESCAPE);
+			break;
+		case '\\':
+			needed += SSTRL(BACKSLASH_JESCAPE);
+			break;
+		case '\t':
+			needed += SSTRL(TAB_JESCAPE);
+			break;
+		case '\n':
+			needed += SSTRL(NEWLINE_JESCAPE);
+			break;
+		default:
+			// Escape control characters.
+			if ((c < 0x20) || (c == 0x7f)) {
+				// "\uNNNN" is six bytes
+				needed += 6;
+			} else {
+				needed++;
+			}
+			break;
+		}
+	}
+	return needed;
+}
+
+/* Write the JSON-escaped form of the first 'src_len' bytes of 'buf' to
+ * 'out', followed by a NUL. Size 'out' with escape_json_attr_len(). */
+void escape_json_attr(const char *buf, size_t src_len, char *out)
+{
+	char *dst = out;
+	for (size_t pos = 0; pos < src_len; ++pos) {
+		const unsigned char c = buf[pos];
+		switch (c) {
+		case '"':
+			// cppcheck-suppress invalidFunctionArg
+			memcpy(dst, DBL_QUOTE_JESCAPE, SSTRL(DBL_QUOTE_JESCAPE));
+			dst += SSTRL(DBL_QUOTE_JESCAPE);
+			break;
+		case '\\':
+			// cppcheck-suppress invalidFunctionArg
+			memcpy(dst, BACKSLASH_JESCAPE, SSTRL(BACKSLASH_JESCAPE));
+			dst += SSTRL(BACKSLASH_JESCAPE);
+			break;
+		case '\t':
+			// cppcheck-suppress invalidFunctionArg
+			memcpy(dst, TAB_JESCAPE, SSTRL(TAB_JESCAPE));
+			dst += SSTRL(TAB_JESCAPE);
+			break;
+		case '\n':
+			// cppcheck-suppress invalidFunctionArg
+			memcpy(dst, NEWLINE_JESCAPE, SSTRL(NEWLINE_JESCAPE));
+			dst += SSTRL(NEWLINE_JESCAPE);
+			break;
+		default:
+			// Escape control characters.
+			if ((c < 0x20) || (c == 0x7f)) {
+				// six bytes of text; the NUL snprintf adds is overwritten later
+				snprintf(dst, 7, "\\u%04x", c);
+				dst += 6;
+			} else {
+				*dst++ = c;
+			}
+			break;
+		}
+	}
+	// null terminator
+	*dst = '\0';
+}
+
+// Write e.str to out with JSON string escaping: quote, backslash, tab and
+// newline get short escapes; other control characters become "\uNNNN".
+std::ostream& operator<<(std::ostream& out, const json_stream_escaper& e)
+{
+  // Engaged on the first control character; from then on the stream stays
+  // in '0'-filled hex mode until this function returns.
+  boost::optional<hex_formatter> hex_mode;
+
+  for (const unsigned char ch : e.str) {
+    // Characters with a dedicated short escape first.
+    if (ch == '"') {
+      out << DBL_QUOTE_JESCAPE;
+    } else if (ch == '\\') {
+      out << BACKSLASH_JESCAPE;
+    } else if (ch == '\t') {
+      out << TAB_JESCAPE;
+    } else if (ch == '\n') {
+      out << NEWLINE_JESCAPE;
+    } else if ((ch < 0x20) || (ch == 0x7f)) {
+      // Escape control characters.
+      if (!hex_mode) {
+        hex_mode.emplace(out); // enable hex formatting
+      }
+      // setw() only affects the next insertion, so it is repeated each time.
+      out << "\\u" << std::setw(4) << static_cast<unsigned int>(ch);
+    } else {
+      // Everything else is copied through unchanged.
+      out << ch;
+    }
+  }
+
+  return out;
+}
diff --git a/src/common/escape.h b/src/common/escape.h
new file mode 100644
index 000000000..d2151e8a7
--- /dev/null
+++ b/src/common/escape.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RGW_ESCAPE_H
+#define CEPH_RGW_ESCAPE_H
+
+#include <ostream>
+#include <string_view>
+
+/* Returns the length of a buffer that would be needed to escape 'buf'
+ * as an XML attribute
+ */
+size_t escape_xml_attr_len(const char *buf);
+
+/* Escapes 'buf' as an XML attribute. Assumes that 'out' is at least long
+ * enough to fit the output. You can find out the required length by calling
+ * escape_xml_attr_len first.
+ */
+void escape_xml_attr(const char *buf, char *out);
+
+/* Returns the length of a buffer that would be needed to escape 'buf'
+ * as a JSON attribute
+ */
+size_t escape_json_attr_len(const char *buf, size_t src_len);
+
+/* Escapes the first 'src_len' bytes of 'buf' as a JSON attribute and
+ * NUL-terminates 'out'. Assumes that 'out' is long enough to fit the output;
+ * you can find out the required length by calling escape_json_attr_len first.
+ */
+void escape_json_attr(const char *buf, size_t src_len, char *out);
+
+/* Note: we escape control characters. Although the XML spec doesn't actually
+ * require this, Amazon does it in their XML responses.
+ */
+
+// stream output operators that write escaped text without making a copy
+// usage:
+//   std::string xml_input = ...;
+//   std::cout << xml_stream_escaper(xml_input) << std::endl;
+
+struct xml_stream_escaper {
+  std::string_view str;  // non-owning view of the text to escape
+  xml_stream_escaper(std::string_view str) : str{str} {}
+};
+std::ostream& operator<<(std::ostream& out, const xml_stream_escaper& e);
+
+struct json_stream_escaper {
+  std::string_view str;  // non-owning view of the text to escape
+  json_stream_escaper(std::string_view str) : str{str} {}
+};
+std::ostream& operator<<(std::ostream& out, const json_stream_escaper& e);
+
+#endif
diff --git a/src/common/event_socket.h b/src/common/event_socket.h
new file mode 100644
index 000000000..9224f7683
--- /dev/null
+++ b/src/common/event_socket.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_EVENT_SOCKET_H
+#define CEPH_COMMON_EVENT_SOCKET_H
+
+#include <unistd.h>
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#include <errno.h>
+#endif
+#include "include/event_type.h"
+
+// Thin wrapper around a descriptor used purely for wake-ups: init() records
+// the descriptor and how to poke it, notify() performs the poke. The class
+// never opens or closes the descriptor itself.
+class EventSocket {
+  // Descriptor that notify() writes to; -1 until init() stores one.
+  int socket = -1;
+  // One of the EVENT_SOCKET_TYPE_* values; selects the payload notify() sends.
+  int type = EVENT_SOCKET_TYPE_NONE;
+
+  // Collapse a write(2) return value into 0 on success or -errno on failure.
+  static int write_status(ssize_t written) {
+    return (written < 0) ? -errno : 0;
+  }
+
+ public:
+  EventSocket() {}
+
+  // True once a descriptor other than -1 has been stored by init().
+  bool is_valid() const { return socket != -1; }
+
+  // Store fd together with its kind t. Returns 0 when t is a kind compiled
+  // into this build (pipe always, eventfd only with HAVE_EVENTFD); any other
+  // value leaves the object untouched and returns -EINVAL.
+  int init(int fd, int t) {
+    bool supported = (t == EVENT_SOCKET_TYPE_PIPE);
+#ifdef HAVE_EVENTFD
+    supported = supported || (t == EVENT_SOCKET_TYPE_EVENTFD);
+#endif
+    if (!supported) {
+      return -EINVAL;
+    }
+    socket = fd;
+    type = t;
+    return 0;
+  }
+
+  // Write one wake-up token to the stored descriptor, in the format that
+  // matches the stored kind. Returns 0 on success, -errno when write(2)
+  // fails, and -1 when the kind is not one this build can notify (which
+  // includes an object that was never initialised).
+  int notify() {
+    if (type == EVENT_SOCKET_TYPE_PIPE) {
+      // The pipe kind is poked with a single byte.
+      const char token[1] = {'i'};
+      return write_status(write(socket, token, 1));
+    }
+#ifdef HAVE_EVENTFD
+    if (type == EVENT_SOCKET_TYPE_EVENTFD) {
+      // The eventfd kind is written as a 64-bit value of 1.
+      const uint64_t value = 1;
+      return write_status(write(socket, &value, sizeof (value)));
+    }
+#endif
+    return -1;
+  }
+};
+
+#endif
diff --git a/src/common/fair_mutex.h b/src/common/fair_mutex.h
new file mode 100644
index 000000000..9baa04400
--- /dev/null
+++ b/src/common/fair_mutex.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+
+#pragma once
+
+#include "common/ceph_mutex.h"
+
+#include <thread>
+#include <string>
+
+namespace ceph {
+/// a FIFO mutex: lock() hands out tickets and callers are admitted in ticket order
+class fair_mutex {
+public:
+  fair_mutex(const std::string& name)
+    : mutex{ceph::make_mutex(name)}    // name is forwarded to the internal ceph::mutex
+  {}
+  ~fair_mutex() = default;
+  fair_mutex(const fair_mutex&) = delete;
+  fair_mutex& operator=(const fair_mutex&) = delete;
+
+  void lock()
+  {
+    std::unique_lock lock(mutex);
+    const unsigned my_id = next_id++;  // take the next ticket
+    cond.wait(lock, [&] {
+      return my_id == unblock_id;      // admitted once every earlier ticket was released
+    });
+    _set_locked_by();
+  }
+
+  bool try_lock()
+  {
+    std::lock_guard lock(mutex);
+    if (is_locked()) {                 // some ticket is still outstanding
+      return false;
+    }
+    ++next_id;                         // consume the ticket equal to unblock_id
+    _set_locked_by();
+    return true;
+  }
+
+  void unlock()
+  {
+    std::lock_guard lock(mutex);
+    ++unblock_id;                      // admit the holder of the next ticket
+    _reset_locked_by();
+    cond.notify_all();                 // all waiters re-check; only the matching ticket proceeds
+  }
+
+  bool is_locked() const               // NOTE: reads the counters without taking `mutex`
+  {
+    return next_id != unblock_id;
+  }
+
+#ifdef CEPH_DEBUG_MUTEX
+  bool is_locked_by_me() const {
+    return is_locked() && locked_by == std::this_thread::get_id();
+  }
+private:
+  void _set_locked_by() {
+    locked_by = std::this_thread::get_id();
+  }
+  void _reset_locked_by() {
+    locked_by = {};
+  }
+#else
+  void _set_locked_by() {}             // NOTE(review): no `private:` precedes these in this
+  void _reset_locked_by() {}           // branch, so they are public in non-debug builds
+#endif
+
+private:
+  unsigned next_id = 0;                // next ticket to hand out
+  unsigned unblock_id = 0;             // ticket currently allowed to hold the lock
+  ceph::condition_variable cond;
+  ceph::mutex mutex;
+#ifdef CEPH_DEBUG_MUTEX
+  std::thread::id locked_by = {};      // owner thread, tracked only in debug builds
+#endif
+};
+} // namespace ceph
diff --git a/src/common/fault_injector.h b/src/common/fault_injector.h
new file mode 100644
index 000000000..b1ea52a57
--- /dev/null
+++ b/src/common/fault_injector.h
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <thread>
+#include <type_traits>
+#include <boost/type_traits/has_equal_to.hpp>
+#include <boost/type_traits/has_left_shift.hpp>
+#include <variant>
+#include "include/ceph_assert.h"
+#include "common/ceph_time.h"
+#include "common/dout.h"
+
+/// @file
+
+/// A failure type that aborts the process with a failed assertion.
+struct InjectAbort {};
+
+/// A failure type that injects an error code and optionally logs a message.
+struct InjectError {
+  /// error code to inject; FaultInjector::check() returns it as-is on a match
+  int error;
+  /// an optional log channel to print an error message
+  const DoutPrefixProvider* dpp = nullptr;
+};
+
+/// Injects a delay before returning success.
+struct InjectDelay {
+  /// duration of the delay; check() sleeps the calling thread for this long
+  ceph::timespan duration;
+  /// an optional log channel to print a message
+  const DoutPrefixProvider* dpp = nullptr;
+};
+
+/** @class FaultInjector
+ * @brief Used to instrument a code path with deterministic fault injection
+ * by making one or more calls to check().
+ *
+ * A default-constructed FaultInjector contains no failure. It can also be
+ * constructed with a failure type and a location to inject that failure.
+ *
+ * The contained failure can be overwritten with a call to inject() or clear().
+ * This is not thread-safe with respect to other member functions on the same
+ * instance.
+ *
+ * @tparam Key  The location can be represented by any Key type that is
+ * movable, default-constructible, equality-comparable and stream-outputable.
+ * A string or string_view Key may be preferable when the location comes from
+ * user input, or to describe the steps like "before-foo" and "after-foo".
+ * An integer Key may be preferable for a code path with many steps, where you
+ * just want to check 1, 2, 3, etc. without inventing names for each.
+ */
+template <typename Key>
+class FaultInjector {
+ public:
+  /// Default-construct with no injected failure.
+  constexpr FaultInjector() noexcept : location() {}
+
+  /// Construct with an injected assertion failure at the given location.
+  constexpr FaultInjector(Key location, InjectAbort a)
+    : location(std::move(location)), failure(a) {}
+
+  /// Construct with an injected error code at the given location.
+  constexpr FaultInjector(Key location, InjectError e)
+    : location(std::move(location)), failure(e) {}
+
+  /// Construct with an injected delay at the given location.
+  constexpr FaultInjector(Key location, InjectDelay d)
+    : location(std::move(location)), failure(d) {}
+
+  /// Inject an assertion failure at the given location.
+  void inject(Key location, InjectAbort a) {
+    this->location = std::move(location);
+    this->failure = a;
+  }
+
+  /// Inject an error at the given location.
+  void inject(Key location, InjectError e) {
+    this->location = std::move(location);
+    this->failure = e;
+  }
+
+  /// Inject a delay at the given location.
+  void inject(Key location, InjectDelay d) {
+    this->location = std::move(location);
+    this->failure = d;
+  }
+
+  /// Clear any injected failure. The stored location is left as it was.
+  void clear() {
+    this->failure = Empty{};
+  }
+
+  /// Check for an injected failure at the given location. On a match,
+  /// InjectAbort aborts the process here with an assertion failure and
+  /// InjectDelay sleeps the calling thread for its duration.
+  /// @returns InjectError::error if the location matches an InjectError
+  /// failure, otherwise 0
+  [[nodiscard]] constexpr int check(const Key& location) const {
+    struct visitor {
+      const Key& check_location;  // location passed to check()
+      const Key& this_location;   // location stored in the injector
+      constexpr int operator()(const std::monostate&) const {
+        return 0;
+      }
+      int operator()(const InjectAbort&) const {
+        if (check_location == this_location) {
+          ceph_assert_always(!"FaultInjector");
+        }
+        return 0;
+      }
+      int operator()(const InjectError& e) const {
+        if (check_location == this_location) {
+          ldpp_dout(e.dpp, -1) << "Injecting error=" << e.error
+              << " at location=" << this_location << dendl;
+          return e.error;
+        }
+        return 0;
+      }
+      int operator()(const InjectDelay& e) const {
+        if (check_location == this_location) {
+          ldpp_dout(e.dpp, -1) << "Injecting delay=" << e.duration
+              << " at location=" << this_location << dendl;
+          std::this_thread::sleep_for(e.duration);
+        }
+        return 0;
+      }
+    };
+    return std::visit(visitor{location, this->location}, failure);
+  }
+
+ private:
+  // Key requirements:
+  static_assert(std::is_default_constructible_v<Key>,
+                "Key must be default-constructible");
+  static_assert(std::is_move_constructible_v<Key>,
+                "Key must be move-constructible");
+  static_assert(std::is_move_assignable_v<Key>,
+                "Key must be move-assignable");
+  static_assert(boost::has_equal_to<Key, Key, bool>::value,
+                "Key must be equality-comparable");
+  static_assert(boost::has_left_shift<std::ostream, Key, std::ostream&>::value,
+                "Key must have an ostream operator<<");
+
+  Key location; // location of the check that should fail
+
+  using Empty = std::monostate; // empty state for std::variant
+
+  std::variant<Empty, InjectAbort, InjectError, InjectDelay> failure;
+};
diff --git a/src/common/fd.cc b/src/common/fd.cc
new file mode 100644
index 000000000..89d18940b
--- /dev/null
+++ b/src/common/fd.cc
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "debug.h"
+#include "errno.h"
+
+#ifndef _WIN32
+void dump_open_fds(CephContext *cct)
+{
+  // Directory holding one entry per descriptor open in this process.
+#ifdef __APPLE__
+  const char *fn = "/dev/fd";
+#else
+  const char *fn = PROCPREFIX "/proc/self/fd";
+#endif
+  DIR *dir = opendir(fn);
+  if (dir == nullptr) {
+    lderr(cct) << "dump_open_fds unable to open " << fn << dendl;
+    return;
+  }
+
+  int dumped = 0;
+  for (struct dirent *ent = ::readdir(dir); ent; ent = ::readdir(dir)) {
+    if (ent->d_name[0] == '.')  // ".", ".." and other dot-prefixed names
+      continue;
+    char entry_path[PATH_MAX];
+    snprintf(entry_path, sizeof(entry_path), "%s/%s", fn, ent->d_name);
+    char target[PATH_MAX];
+    // readlink() does not NUL-terminate, so one byte is held in reserve.
+    const ssize_t len = readlink(entry_path, target, sizeof(target) - 1);
+    if (len < 0) {
+      const int err = -errno;
+      lderr(cct) << "dump_open_fds unable to readlink " << entry_path << ": " << cpp_strerror(err) << dendl;
+      continue;
+    }
+    target[len] = 0;
+    lderr(cct) << "dump_open_fds " << ent->d_name << " -> " << target << dendl;
+    ++dumped;
+  }
+  lderr(cct) << "dump_open_fds dumped " << dumped << " open files" << dendl;
+  closedir(dir);
+}
+#else
+void dump_open_fds(CephContext *cct)
+{ // _WIN32 build: no-op, nothing is enumerated or logged
+}
+#endif
diff --git a/src/common/fd.h b/src/common/fd.h
new file mode 100644
index 000000000..718d59296
--- /dev/null
+++ b/src/common/fd.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_FD_H
+#define CEPH_COMMON_FD_H
+
+#include "include/common_fwd.h"
+
+void dump_open_fds(CephContext *cct); // logs this process's open fds via lderr(cct); no-op on _WIN32
+
+#endif
diff --git a/src/common/fork_function.h b/src/common/fork_function.h
new file mode 100644
index 000000000..3a4f2f29c
--- /dev/null
+++ b/src/common/fork_function.h
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+// Run a function in a forked child, with a timeout.
+
+#pragma once
+
+#include <functional>
+#include <iostream>
+#include <ostream>
+
+#include <signal.h>
+#ifndef _WIN32
+#include <sys/wait.h>
+#endif
+#include <sys/types.h>
+
+#include "include/ceph_assert.h"
+#include "common/errno.h"
+
+#ifndef _WIN32
+static void _fork_function_dummy_sighandler(int sig) {} // empty on purpose; installed for SIGCHLD in fork_function()
+
+// Run a function post-fork, with a timeout.  Function can return
+// int8_t only due to unix exit code limitations.
+// @param timeout  seconds given to alarm(); on expiry the child's process
+//                 group is killed with SIGKILL
+// @return the forker's exit status as int8_t (f's result, or -ETIMEDOUT on
+//         timeout), 128+signo if the forker was killed by a signal, or -1
+//         if fork() fails or waitpid() reports an unknown status
+static inline int fork_function(
+  int timeout,
+  std::ostream& errstr,
+  std::function<int8_t(void)> f)
+{
+  // first fork the forker.
+  pid_t forker_pid = fork();
+  if (forker_pid == -1) {
+    errstr << ": fork failed: " << cpp_strerror(errno) << "\n";
+    return -1;
+  }
+  if (forker_pid) {
+    // caller's side: just wait for the forker, retrying on EINTR
+    int status;
+    while (waitpid(forker_pid, &status, 0) == -1) {
+      ceph_assert(errno == EINTR);
+    }
+    if (WIFSIGNALED(status)) {
+      errstr << ": got signal: " << WTERMSIG(status) << "\n";
+      return 128 + WTERMSIG(status);
+    }
+    if (WIFEXITED(status)) {
+      int8_t r = WEXITSTATUS(status);
+      errstr << ": exit status: " << (int)r << "\n";
+      return r;
+    }
+    errstr << ": waitpid: unknown status returned\n";
+    return -1;
+  }
+
+  // we are forker (first child)
+  // close all fds except the three standard streams
+  int maxfd = sysconf(_SC_OPEN_MAX);
+  if (maxfd == -1)
+    maxfd = 16384;
+  for (int fd = 0; fd <= maxfd; fd++) {
+    if (fd == STDIN_FILENO || fd == STDOUT_FILENO || fd == STDERR_FILENO)
+      continue;
+    ::close(fd);
+  }
+
+  sigset_t mask, oldmask;
+  int pid;
+
+  // Restore default action for SIGTERM in case the parent process decided
+  // to ignore it.
+  if (signal(SIGTERM, SIG_DFL) == SIG_ERR) {
+    std::cerr << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Because SIGCHLD is ignored by default, setup dummy handler for it,
+  // so we can mask it.
+  if (signal(SIGCHLD, _fork_function_dummy_sighandler) == SIG_ERR) {
+    std::cerr << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Setup timeout handler.  SIGALRM is blocked and consumed by sigwait()
+  // below, so the handler is never run; the dummy handler is sufficient.
+  if (signal(SIGALRM, _fork_function_dummy_sighandler) == SIG_ERR) {
+    std::cerr << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Block interesting signals.
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGINT);
+  sigaddset(&mask, SIGTERM);
+  sigaddset(&mask, SIGCHLD);
+  sigaddset(&mask, SIGALRM);
+  if (sigprocmask(SIG_SETMASK, &mask, &oldmask) == -1) {
+    std::cerr << ": sigprocmask failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  pid = fork();
+  if (pid == -1) {
+    std::cerr << ": fork failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  if (pid == 0) { // we are second child
+    // Restore old sigmask.
+    if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) {
+      std::cerr << ": sigprocmask failed: " << cpp_strerror(errno) << "\n";
+      goto fail_exit;
+    }
+    (void)setpgid(0, 0); // Become process group leader.
+    int8_t r = f();
+    _exit((uint8_t)r);
+  }
+
+  // Parent
+  (void)alarm(timeout);
+  for (;;) {
+    int signo;
+    // sigwait() reports failure through its return value, not errno.
+    if (int err = sigwait(&mask, &signo)) {
+      std::cerr << ": sigwait failed: " << cpp_strerror(err) << "\n";
+      goto fail_exit;
+    }
+    switch (signo) {
+    case SIGCHLD:
+      int status;
+      if (waitpid(pid, &status, WNOHANG) == -1) {
+	std::cerr << ": waitpid failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      if (WIFEXITED(status))
+	_exit(WEXITSTATUS(status));
+      if (WIFSIGNALED(status))
+	_exit(128 + WTERMSIG(status));
+      std::cerr << ": unknown status returned\n";
+      goto fail_exit;
+    case SIGINT:
+    case SIGTERM:
+      // Pass SIGINT and SIGTERM, which are usually used to terminate
+      // a process, to the child.
+      if (::kill(pid, signo) == -1) {
+	std::cerr << ": kill failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      continue;
+    case SIGALRM:
+      std::cerr << ": timed out (" << timeout << " sec)\n";
+      if (::killpg(pid, SIGKILL) == -1) {
+	std::cerr << ": kill failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      _exit(-ETIMEDOUT);
+    default:
+      std::cerr << ": sigwait: invalid signal: " << signo << "\n";
+      goto fail_exit;
+    }
+  }
+  return 0;
+fail_exit:
+  _exit(EXIT_FAILURE);
+}
+#else
+static inline int fork_function(
+  int timeout,                      // not used in this build
+  std::ostream& errstr,             // receives the explanation below
+  std::function<int8_t(void)> f)    // never invoked in this build
+{
+  errstr << "Forking is not available on Windows.\n";
+  return -1;
+}
+#endif
diff --git a/src/common/freebsd_errno.cc b/src/common/freebsd_errno.cc
new file mode 100644
index 000000000..259ce7be7
--- /dev/null
+++ b/src/common/freebsd_errno.cc
@@ -0,0 +1,219 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "include/types.h"
+#include "include/compat.h"
+
+#define H2C_ERRNO(a,b) [a] = b
+#define C2H_ERRNO(a,b) [a] = b
+
+// Build a table with the Linux (Ceph) error as index
+// and the FreeBSD error as value
+// Use the fact that the array is zero-initialised by default:
+// a 0 entry means "no translation", and the original value is returned.
+static const __s32 ceph_to_hostos_conv[256] = {
+//       Linux errno  FreeBSD errno
+       C2H_ERRNO(11,  EAGAIN),	
+       C2H_ERRNO(35,  EDEADLK),	
+       C2H_ERRNO(36,  ENAMETOOLONG),	
+       C2H_ERRNO(37,  ENOLCK),	
+       C2H_ERRNO(38,  ENOSYS),	
+       C2H_ERRNO(39,  ENOTEMPTY),	
+       C2H_ERRNO(40,  ELOOP),	
+       C2H_ERRNO(42,  ENOMSG),	
+       C2H_ERRNO(43,  EIDRM),	
+       C2H_ERRNO(44,  EPERM),	 //TODO ECHRNG   /* Channel number out of range */
+       C2H_ERRNO(45,  EPERM),	 //TODO EL2NSYNC /* Level 2 not synchronized */
+       C2H_ERRNO(46,  EPERM),	 //TODO EL3HLT   /* Level 3 halted */
+       C2H_ERRNO(47,  EPERM),	 //TODO EL3RST   /* Level 3 reset */
+       C2H_ERRNO(48,  EPERM),	 //TODO ELNRNG   /* Link number out of range */
+       C2H_ERRNO(49,  EPERM),	 //TODO EUNATCH  /* Protocol driver not attached */
+       C2H_ERRNO(50,  EPERM),	 //TODO ENOCSI   /* No CSI structure available */
+       C2H_ERRNO(51,  EPERM),	 //TODO EL2HLT   /* Level 2 halted */
+       C2H_ERRNO(52,  EPERM),	 //TODO EBADE    /* Invalid exchange */
+       C2H_ERRNO(53,  EPERM),	 //TODO EBADR    /* Invalid request descriptor */
+       C2H_ERRNO(54,  EPERM),	 //TODO EXFULL   /* Exchange full */
+       C2H_ERRNO(55,  EPERM),	 //TODO ENOANO   /* No anode */
+       C2H_ERRNO(56,  EPERM),	 //TODO EBADRQC  /* Invalid request code */
+       C2H_ERRNO(57,  EPERM),	 //TODO EBADSLT  /* Invalid slot */
+       C2H_ERRNO(59,  EPERM),	 //TODO EBFONT   /* Bad font file format */
+       C2H_ERRNO(60,  ENOSTR),	
+       C2H_ERRNO(61,  ENODATA),	
+       C2H_ERRNO(62,  ETIME),	
+       C2H_ERRNO(63,  ENOSR),	
+       C2H_ERRNO(64,  EPERM),	 //TODO ENONET
+       C2H_ERRNO(65,  EPERM),	 //TODO ENOPKG
+       C2H_ERRNO(66,  EREMOTE),	
+       C2H_ERRNO(67,  ENOLINK),	
+       C2H_ERRNO(68,  EPERM),	 //TODO EADV
+       C2H_ERRNO(69,  EPERM),	 //TODO ESRMNT
+       C2H_ERRNO(70,  EPERM),	 //TODO ECOMM
+       C2H_ERRNO(71,  EPROTO),	
+       C2H_ERRNO(72,  EMULTIHOP),	
+       C2H_ERRNO(73,  EPERM),	 //TODO EDOTDOT
+       C2H_ERRNO(74,  EBADMSG),	
+       C2H_ERRNO(75,  EOVERFLOW),	
+       C2H_ERRNO(76,  EPERM),	 //TODO ENOTUNIQ
+       C2H_ERRNO(77,  EPERM),	 //TODO EBADFD
+       C2H_ERRNO(78,  EPERM),	 //TODO EREMCHG
+       C2H_ERRNO(79,  EPERM),	 //TODO ELIBACC
+       C2H_ERRNO(80,  EPERM),	 //TODO ELIBBAD
+       C2H_ERRNO(81,  EPERM),	 //TODO ELIBSCN
+       C2H_ERRNO(82,  EPERM),	 //TODO ELIBMAX
+       C2H_ERRNO(83,  EPERM),	 //TODO ELIBEXEC
+       C2H_ERRNO(84,  EILSEQ),	
+       C2H_ERRNO(85,  EINTR),	 /* not quite, since this is a syscall restart */
+       C2H_ERRNO(86,  EPERM),	 //ESTRPIPE;
+       C2H_ERRNO(87,  EUSERS),	
+       C2H_ERRNO(88,  ENOTSOCK),	
+       C2H_ERRNO(89,  EDESTADDRREQ),	
+       C2H_ERRNO(90,  EMSGSIZE),	
+       C2H_ERRNO(91,  EPROTOTYPE),	
+       C2H_ERRNO(92,  ENOPROTOOPT),	
+       C2H_ERRNO(93,  EPROTONOSUPPORT),	
+       C2H_ERRNO(94,  ESOCKTNOSUPPORT),	
+       C2H_ERRNO(95,  EOPNOTSUPP),	
+       C2H_ERRNO(96,  EPFNOSUPPORT),	
+       C2H_ERRNO(97,  EAFNOSUPPORT),	
+       C2H_ERRNO(98,  EADDRINUSE),	
+       C2H_ERRNO(99,  EADDRNOTAVAIL),	
+       C2H_ERRNO(100, ENETDOWN),	
+       C2H_ERRNO(101, ENETUNREACH),	
+       C2H_ERRNO(102, ENETRESET),	
+       C2H_ERRNO(103, ECONNABORTED),	
+       C2H_ERRNO(104, ECONNRESET),	
+       C2H_ERRNO(105, ENOBUFS),	
+       C2H_ERRNO(106, EISCONN),	
+       C2H_ERRNO(107, ENOTCONN),	
+       C2H_ERRNO(108, ESHUTDOWN),	
+       C2H_ERRNO(109, ETOOMANYREFS),	
+       C2H_ERRNO(110, ETIMEDOUT),	
+       C2H_ERRNO(111, ECONNREFUSED),	
+       C2H_ERRNO(112, EHOSTDOWN),	
+       C2H_ERRNO(113, EHOSTUNREACH),	
+       C2H_ERRNO(114, EALREADY),	
+       C2H_ERRNO(115, EINPROGRESS),	
+       C2H_ERRNO(116, ESTALE),	
+       C2H_ERRNO(117, EPERM),	 //TODO EUCLEAN
+       C2H_ERRNO(118, EPERM),	 //TODO ENOTNAM
+       C2H_ERRNO(119, EPERM),	 //TODO ENAVAIL
+       C2H_ERRNO(120, EPERM),	 //TODO EISNAM
+       C2H_ERRNO(121, EREMOTEIO),	
+       C2H_ERRNO(122, EDQUOT),	
+       C2H_ERRNO(123, EPERM),	 //TODO ENOMEDIUM
+       C2H_ERRNO(124, EPERM),	 //TODO EMEDIUMTYPE - not used
+       C2H_ERRNO(125, ECANCELED),	
+       C2H_ERRNO(126, EPERM),	 //TODO ENOKEY
+       C2H_ERRNO(127, EPERM),	 //TODO EKEYEXPIRED
+       C2H_ERRNO(128, EPERM),	 //TODO EKEYREVOKED
+       C2H_ERRNO(129, EPERM),	 //TODO EKEYREJECTED
+       C2H_ERRNO(130, EOWNERDEAD),	
+       C2H_ERRNO(131, ENOTRECOVERABLE),	
+       C2H_ERRNO(132, EPERM),	 //TODO ERFKILL
+       C2H_ERRNO(133, EPERM),	 //TODO EHWPOISON
+    };
+
+// Build a table with the FreeBSD error as index
+// and the Linux error as value
+// Use the fact that the array is zero-initialised by default:
+// a 0 entry means "no translation", and the original value is returned.
+static const __s32 hostos_to_ceph_conv[256] = {
+	//        FreeBSD errno Linux errno
+	H2C_ERRNO(EDEADLK,	35),   	/* Resource deadlock avoided */
+        H2C_ERRNO(EAGAIN,	11),   	/* Resource temporarily unavailable */
+        H2C_ERRNO(EINPROGRESS,	115),	/* Operation now in progress */
+        H2C_ERRNO(EALREADY,	114),	/* Operation already in progress */
+        H2C_ERRNO(ENOTSOCK,	88),	/* Socket operation on non-socket */
+        H2C_ERRNO(EDESTADDRREQ,	89),	/* Destination address required */
+        H2C_ERRNO(EMSGSIZE,	90),	/* Message too long */
+        H2C_ERRNO(EPROTOTYPE,	91),	/* Protocol wrong type for socket */
+        H2C_ERRNO(ENOPROTOOPT,	92),	/* Protocol not available */
+        H2C_ERRNO(EPROTONOSUPPORT, 93),	/* Protocol not supported */
+        H2C_ERRNO(ESOCKTNOSUPPORT, 94),	/* Socket type not supported */
+        H2C_ERRNO(EOPNOTSUPP,	95),	/* Operation not supported */
+        H2C_ERRNO(EPFNOSUPPORT,	96),	/* Protocol family not supported */
+        H2C_ERRNO(EAFNOSUPPORT,	97),	/* Address family not supported by protocol family */
+        H2C_ERRNO(EADDRINUSE,	98),	/* Address already in use */
+        H2C_ERRNO(EADDRNOTAVAIL, 99),	/* Can't assign requested address */
+        H2C_ERRNO(ENETDOWN,	100),	/* Network is down */
+        H2C_ERRNO(ENETUNREACH,	101),	/* Network is unreachable */
+        H2C_ERRNO(ENETRESET,	102),	/* Network dropped connection on reset */
+        H2C_ERRNO(ECONNABORTED,	103),	/* Software caused connection abort */
+        H2C_ERRNO(ECONNRESET,	104),	/* Connection reset by peer */
+        H2C_ERRNO(ENOBUFS,	105),	/* No buffer space available */
+        H2C_ERRNO(EISCONN,	106),	/* Socket is already connected */
+        H2C_ERRNO(ENOTCONN,	107),	/* Socket is not connected */
+        H2C_ERRNO(ESHUTDOWN,	108),	/* Can't send after socket shutdown */
+        H2C_ERRNO(ETOOMANYREFS,	109),	/* Too many references: can't splice */
+        H2C_ERRNO(ETIMEDOUT,	110),	/* Operation timed out */
+        H2C_ERRNO(ECONNREFUSED,	111),	/* Connection refused */
+        H2C_ERRNO(ELOOP,	40),	/* Too many levels of symbolic links */
+        H2C_ERRNO(ENAMETOOLONG,	36),	/* File name too long */
+        H2C_ERRNO(EHOSTDOWN,	112),	/* Host is down */
+        H2C_ERRNO(EHOSTUNREACH,	113),	/* No route to host */
+        H2C_ERRNO(ENOTEMPTY,	39),	/* Directory not empty */
+        H2C_ERRNO(EPROCLIM,	EPERM),	/* Too many processes */
+        H2C_ERRNO(EUSERS,	87),	/* Too many users */
+        H2C_ERRNO(EDQUOT,	122),	/* Disc quota exceeded */
+        H2C_ERRNO(ESTALE,	116),	/* Stale NFS file handle */
+        H2C_ERRNO(EREMOTE,	66),	/* Too many levels of remote in path */
+        H2C_ERRNO(EBADRPC,	EPERM),	/* RPC struct is bad */
+        H2C_ERRNO(ERPCMISMATCH,	EPERM),	/* RPC version wrong */
+        H2C_ERRNO(EPROGUNAVAIL,	EPERM),	/* RPC prog. not avail */
+        H2C_ERRNO(EPROGMISMATCH, EPERM),/* Program version wrong */
+        H2C_ERRNO(EPROCUNAVAIL,	EPERM),	/* Bad procedure for program */
+        H2C_ERRNO(ENOLCK,	37),	/* No locks available (inverse of 37 -> ENOLCK) */
+        H2C_ERRNO(ENOSYS,	38),	/* Function not implemented (inverse of 38 -> ENOSYS) */
+        H2C_ERRNO(EFTYPE,	EPERM),	/* Inappropriate file type or format */
+        H2C_ERRNO(EAUTH,	EPERM),	/* Authentication error */
+        H2C_ERRNO(ENEEDAUTH,	EPERM),	/* Need authenticator */
+        H2C_ERRNO(EIDRM,	43),	/* Identifier removed */
+        H2C_ERRNO(ENOMSG,	42),	/* No message of desired type */
+        H2C_ERRNO(EOVERFLOW,	75),	/* Value too large to be stored in data type */
+        H2C_ERRNO(ECANCELED,	125),	/* Operation canceled */
+        H2C_ERRNO(EILSEQ,	84),	/* Illegal byte sequence */
+        H2C_ERRNO(ENOATTR,	61),	/* Attribute not found */
+        H2C_ERRNO(EDOOFUS,	EPERM),	/* Programming error */
+        H2C_ERRNO(EBADMSG,	74),	/* Bad message */
+        H2C_ERRNO(EMULTIHOP,	72),	/* Multihop attempted */
+        H2C_ERRNO(ENOLINK,	67),	/* Link has been severed */
+        H2C_ERRNO(EPROTO,	71),	/* Protocol error */
+        H2C_ERRNO(ENOTCAPABLE,	EPERM),	/* Capabilities insufficient */
+        H2C_ERRNO(ECAPMODE,	EPERM),	/* Not permitted in capability mode */
+        H2C_ERRNO(ENOTRECOVERABLE, 131),/* State not recoverable */
+        H2C_ERRNO(EOWNERDEAD,	130),	/* Previous owner died */
+	};
+
+// converts from linux errno values to host values
+__s32 ceph_to_hostos_errno(__s32 r)
+{
+  // Only the magnitude is looked up; the caller's sign is re-applied below.
+  const int magnitude = std::abs(r);
+  const int mapped = (magnitude < 256) ? ceph_to_hostos_conv[magnitude] : 0;
+  // A zero table entry means "no translation": the value passes through.
+  const int host_err = (mapped != 0) ? mapped : magnitude;
+  return (r < 0) ? -host_err : host_err;
+}
+
+// converts Host OS errno values to linux/Ceph values
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  // Only the magnitude is looked up; the caller's sign is re-applied below.
+  const int magnitude = std::abs(r);
+  const int mapped = (magnitude < 256) ? hostos_to_ceph_conv[magnitude] : 0;
+  // A zero table entry means "no translation": the value passes through.
+  const int ceph_err = (mapped != 0) ? mapped : magnitude;
+  return (r < 0) ? -ceph_err : ceph_err;
+}
diff --git a/src/common/fs_types.cc b/src/common/fs_types.cc
new file mode 100644
index 000000000..47021e360
--- /dev/null
+++ b/src/common/fs_types.cc
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/fs_types.h"
+#include "common/Formatter.h"
+#include "include/ceph_features.h"
+#include "common/ceph_json.h"
+
+// Dump a legacy layout: striping fields always, the other fields only if non-zero.
+void dump(const ceph_file_layout& l, ceph::Formatter *f) {
+  f->dump_unsigned("stripe_unit", l.fl_stripe_unit);
+  f->dump_unsigned("stripe_count", l.fl_stripe_count);
+  f->dump_unsigned("object_size", l.fl_object_size);
+  if (l.fl_cas_hash)
+    f->dump_unsigned("cas_hash", l.fl_cas_hash);
+  if (l.fl_object_stripe_unit)
+    f->dump_unsigned("object_stripe_unit", l.fl_object_stripe_unit);
+  if (l.fl_pg_pool)
+    f->dump_unsigned("pg_pool", l.fl_pg_pool);
+}
+
+// Dump every field of a legacy directory layout, including the unused ones.
+void dump(const ceph_dir_layout& l, ceph::Formatter *f) {
+  f->dump_unsigned("dir_hash", l.dl_dir_hash);
+  f->dump_unsigned("unused1", l.dl_unused1);
+  f->dump_unsigned("unused2", l.dl_unused2);
+  f->dump_unsigned("unused3", l.dl_unused3);
+}
+
+
+// file_layout_t
+
+// Check the striping parameters. stripe_unit and object_size must each be
+// non-zero with no bits below CEPH_MIN_STRIPE_UNIT set (64k increments),
+// object_size must be a whole multiple of stripe_unit, and stripe_count must
+// be non-zero.
+bool file_layout_t::is_valid() const {
+  const auto low_bits = CEPH_MIN_STRIPE_UNIT - 1;
+  const bool unit_ok = stripe_unit && !(stripe_unit & low_bits);
+  const bool size_ok = object_size && !(object_size & low_bits);
+  if (!unit_ok || !size_ok)
+    return false;
+  // stripe_unit is known to be non-zero here, so the modulo is well defined
+  if (object_size < stripe_unit || object_size % stripe_unit)
+    return false;
+  return stripe_count != 0;
+}
+
+// Populate this layout from the legacy wire structure; pool_ns is always cleared.
+void file_layout_t::from_legacy(const ceph_file_layout& fl) {
+  stripe_unit = fl.fl_stripe_unit;
+  stripe_count = fl.fl_stripe_count;
+  object_size = fl.fl_object_size;
+  pool_id = static_cast<int32_t>(fl.fl_pg_pool);
+  // An all-zero legacy structure was the default layout and carried pool 0
+  // rather than -1; map that case back to -1.
+  if (!pool_id && !stripe_unit && !stripe_count && !object_size)
+    pool_id = -1;
+  pool_ns.clear();
+}
+
+// Fill the legacy wire structure from this layout. Fields with no modern
+// counterpart (cas_hash, object_stripe_unit, unused) are zeroed, and pool_ns
+// is not represented.
+void file_layout_t::to_legacy(ceph_file_layout *fl) const {
+  fl->fl_stripe_unit = stripe_unit;
+  fl->fl_stripe_count = stripe_count;
+  fl->fl_object_size = object_size;
+  fl->fl_cas_hash = 0;
+  fl->fl_object_stripe_unit = 0;
+  fl->fl_unused = 0;
+  // In the legacy encoding pool 0 was undefined, so a negative pool_id is
+  // written as 0.
+  fl->fl_pg_pool = pool_id >= 0 ? pool_id : 0;
+}
+
+// Encode for a peer with the given feature bits: the legacy struct if the peer
+// lacks FS_FILE_LAYOUT_V2, otherwise the versioned (v2) form with pool_ns.
+void file_layout_t::encode(ceph::buffer::list& bl, uint64_t features) const {
+  using ceph::encode;
+  if (!(features & CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
+    ceph_assert((stripe_unit & 0xff) == 0);  // first byte must be 0; decode() keys off it
+    ceph_file_layout legacy;
+    to_legacy(&legacy);
+    encode(legacy, bl);
+    return;
+  }
+  ENCODE_START(2, 2, bl);
+  encode(stripe_unit, bl);
+  encode(stripe_count, bl);
+  encode(object_size, bl);
+  encode(pool_id, bl);
+  encode(pool_ns, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode either form; a leading zero byte marks the legacy struct.
+void file_layout_t::decode(ceph::buffer::list::const_iterator& p) {
+  using ceph::decode;
+  if (*p == 0) {
+    ceph_file_layout legacy;
+    decode(legacy, p);
+    from_legacy(legacy);
+    return;
+  }
+  DECODE_START(2, p);
+  decode(stripe_unit, p);
+  decode(stripe_count, p);
+  decode(object_size, p);
+  decode(pool_id, p);
+  decode(pool_ns, p);
+  DECODE_FINISH(p);
+}
+
+// Emit all five layout fields; pool_id is dumped signed, the sizes unsigned.
+void file_layout_t::dump(ceph::Formatter *f) const {
+  f->dump_unsigned("stripe_unit", stripe_unit);
+  f->dump_unsigned("stripe_count", stripe_count);
+  f->dump_unsigned("object_size", object_size);
+  f->dump_int("pool_id", pool_id);
+  f->dump_string("pool_ns", pool_ns);
+}
+
+// Load the five fields by key; the final `true` is presumably "mandatory" (confirm).
+void file_layout_t::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("stripe_unit", stripe_unit, obj, true);
+  JSONDecoder::decode_json("stripe_count", stripe_count, obj, true);
+  JSONDecoder::decode_json("object_size", object_size, obj, true);
+  JSONDecoder::decode_json("pool_id", pool_id, obj, true);
+  JSONDecoder::decode_json("pool_ns", pool_ns, obj, true);
+}
+
+// Test fixtures: a default-constructed layout and one with every field set.
+void file_layout_t::generate_test_instances(std::list<file_layout_t*>& o) {
+  o.push_back(new file_layout_t);
+  o.push_back(new file_layout_t);
+  o.back()->stripe_unit = 4096;
+  o.back()->stripe_count = 16;
+  o.back()->object_size = 1048576;
+  o.back()->pool_id = 3;
+  o.back()->pool_ns = "myns";
+}
+
+// Stream a layout as the JSON produced by dump().
+std::ostream& operator<<(std::ostream& out, const file_layout_t &layout) {
+  ceph::JSONFormatter f;
+  layout.dump(&f);
+  f.flush(out);
+  return out;
+}
diff --git a/src/common/function_signature.h b/src/common/function_signature.h
new file mode 100644
index 000000000..6d2a34ee6
--- /dev/null
+++ b/src/common/function_signature.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copied from:
+ * https://github.com/exclipy/inline_variant_visitor/blob/master/function_signature.hpp
+ * which apparently copied it from
+ * http://stackoverflow.com/questions/4771417/how-to-get-the-signature-of-a-c-bind-expression
+ */
+
+#ifndef FUNCTION_SIGNATURE_H
+#define FUNCTION_SIGNATURE_H
+
+#include <boost/mpl/pop_front.hpp>
+#include <boost/mpl/push_front.hpp>
+#include <boost/function_types/function_type.hpp>
+#include <boost/function_types/result_type.hpp>
+#include <boost/function_types/parameter_types.hpp>
+
+// `type` = R(Args...) built from F's result and its parameters minus the first one.
+template <typename F>
+struct signature_of_member {
+    using result_type = typename boost::function_types::result_type<F>::type;
+    using parameter_types = typename boost::function_types::parameter_types<F>::type;
+    using base = typename boost::mpl::pop_front<parameter_types>::type;
+    using L = typename boost::mpl::push_front<base, result_type>::type;
+    using type = typename boost::function_types::function_type<L>::type;
+};
+
+// Primary template: F is not a class, so it is handed to function_type directly.
+template <typename F, bool is_class>
+struct signature_of_impl {
+    using type = typename boost::function_types::function_type<F>::type;
+};
+
+// Class types (functors, lambdas): derive the signature from F::operator().
+template <typename F>
+struct signature_of_impl<F, true> {
+    using type = typename signature_of_member<decltype(&F::operator())>::type;
+};
+
+// signature_of<F>::type: call signature of a function type or functor F.
+template <typename F>
+struct signature_of {
+    using type = typename signature_of_impl<F, boost::is_class<F>::value>::type;
+};
+
+#endif
diff --git a/src/common/hex.cc b/src/common/hex.cc
new file mode 100644
index 000000000..a02e0fd68
--- /dev/null
+++ b/src/common/hex.cc
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/hex.h"
+
+// Hex-dump len bytes of s into buf as "xx " groups (extra space every 8 bytes,
+// newline every 16), truncating at dest_len and NUL-terminating if dest_len > 0.
+void hex2str(const char *s, int len, char *buf, int dest_len) {
+  if (dest_len <= 0) return;  // no room even for the terminator
+  buf[0] = '\0';  // defined result when len <= 0
+  for (int i = 0, pos = 0; i < len && pos < dest_len - 1; i++) {
+    const char *sep = (!i || i % 8) ? "" : (i % 16 ? " " : " \n");
+    int n = snprintf(&buf[pos], dest_len - pos, "%s%.2x ", sep, (int)(unsigned char)s[i]);
+    pos = n < 0 ? dest_len : pos + n;  // one call per byte keeps the size argument positive
+  }
+}
+
+// Return the hex2str() rendering of the first len bytes of s; msg is unused.
+std::string hexdump(const std::string &msg, const char *s, int len) {
+  if (len <= 0) return std::string();  // nothing to dump; avoids a zero-sized buffer
+  std::string buf(static_cast<std::string::size_type>(len) * 4, '\0');  // heap-backed; replaces the VLA
+  hex2str(s, len, &buf[0], static_cast<int>(buf.size()));
+  return std::string(buf.c_str());  // keep only the text before the terminator
+}
diff --git a/src/common/hex.h b/src/common/hex.h
new file mode 100644
index 000000000..f3c15097b
--- /dev/null
+++ b/src/common/hex.h
@@ -0,0 +1,25 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_HEX_H
+#define CEPH_COMMON_HEX_H
+
+#include <string>
+
+extern void hex2str(const char *s, int len, char *buf, int dest_len);
+
+extern std::string hexdump(const std::string &msg, const char *s, int len);  // signature as defined in hex.cc
+
+#endif
diff --git a/src/common/histogram.cc b/src/common/histogram.cc
new file mode 100644
index 000000000..62a7f4492
--- /dev/null
+++ b/src/common/histogram.cc
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/histogram.h"
+#include "common/Formatter.h"
+
+// -- pow2_hist_t --
+// Emit a "histogram" array with one "count" per bin, then "upper_bound".
+void pow2_hist_t::dump(ceph::Formatter *f) const {
+  f->open_array_section("histogram");
+  for (const int32_t count : h)
+    f->dump_int("count", count);
+  f->close_section();
+  f->dump_int("upper_bound", upper_bound());
+}
+
+// Versioned encoding (v1, compat 1) containing only the bin vector.
+void pow2_hist_t::encode(ceph::buffer::list& bl) const {
+  ENCODE_START(1, 1, bl);
+  encode(h, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Inverse of encode(): reads the bin vector from a v1 envelope.
+void pow2_hist_t::decode(ceph::buffer::list::const_iterator& p) {
+  DECODE_START(1, p);
+  decode(h, p);
+  DECODE_FINISH(p);
+}
+
+// Test fixtures: an empty histogram and one whose bins are {1, 3, 0, 2}.
+// Instances are allocated with new and handed to the caller through ls.
+void pow2_hist_t::generate_test_instances(std::list<pow2_hist_t*>& ls) {
+  ls.push_back(new pow2_hist_t);
+  ls.push_back(new pow2_hist_t);
+  std::vector<int32_t> &bins = ls.back()->h;
+  for (const int32_t count : {1, 3, 0, 2})
+    bins.push_back(count);
+}
+
+// Right-shift every bin by `bits` (halving per bit), then drop trailing zeros.
+void pow2_hist_t::decay(int bits) {
+  for (int32_t &count : h) {
+    count >>= bits;
+  }
+  _contract();
+}
diff --git a/src/common/histogram.h b/src/common/histogram.h
new file mode 100644
index 000000000..cdaca61c2
--- /dev/null
+++ b/src/common/histogram.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ * Copyright 2013 Inktank
+ */
+
+#ifndef CEPH_HISTOGRAM_H
+#define CEPH_HISTOGRAM_H
+
+#include <list>
+#include "include/encoding.h"
+#include "include/intarith.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+/**
+ * power of 2 histogram
+ */
+struct pow2_hist_t { // per-bin counts, bucketed by power of two
+  /**
+   * histogram
+   *
+   * bin size is 2^index
+   * value is count of elements that are <= the current bin but > the previous bin.
+   */
+  std::vector<int32_t> h;
+
+private:
+  /// grow h (zero-filled) to at least s bins; never shrinks it
+  void _expand_to(unsigned s) {
+    if (s > h.size())
+      h.resize(s, 0);
+  }
+  /// drop useless trailing 0's
+  void _contract() {
+    unsigned p = h.size();
+    while (p > 0 && h[p-1] == 0)
+      --p;
+    h.resize(p);
+  }
+
+public:
+  /// remove every bin
+  void clear() { h.clear(); }
+  /// true when there are no bins
+  bool empty() const { return h.empty(); }
+  /// overwrite the count of one bin (a negative bin is not handled)
+  void set_bin(int bin, int32_t count) {
+    _expand_to(bin + 1);
+    h[bin] = count;
+    _contract();
+  }
+
+  /// count one sample, in bin cbits(v)
+  void add(int32_t v) {
+    int bin = cbits(v);
+    _expand_to(bin + 1);
+    h[bin]++;
+    _contract();
+  }
+
+  bool operator==(const pow2_hist_t &r) const {
+    return h == r.h;
+  }
+
+  /// get a value's position in the histogram.
+  ///
+  /// positions are represented as values in the range [0..1000000]
+  /// (millionths on the unit interval).
+  /// @param v [in] value (non-negative)
+  /// @param lower [out] pointer to lower-bound (0..1000000)
+  /// @param upper [out] pointer to the upper bound (0..1000000)
+  /// @return -1 if v is negative, else 0; outputs are untouched if the bins sum to 0
+  int get_position_micro(int32_t v, uint64_t *lower, uint64_t *upper) {
+    if (v < 0)
+      return -1;
+    unsigned bin = cbits(v);
+    uint64_t lower_sum = 0, upper_sum = 0, total = 0;
+    for (unsigned i=0; i<h.size(); ++i) {
+      if (i <= bin)
+	upper_sum += h[i];
+      if (i < bin)
+	lower_sum += h[i];
+      total += h[i];
+    }
+    if (total > 0) {
+      *lower = lower_sum * 1000000 / total;
+      *upper = upper_sum * 1000000 / total;
+    }
+    return 0;
+  }
+
+  void add(const pow2_hist_t& o) {  // bin-wise sum with o
+    _expand_to(o.h.size());
+    for (unsigned p = 0; p < o.h.size(); ++p)
+      h[p] += o.h[p];
+    _contract();
+  }
+  void sub(const pow2_hist_t& o) {  // bin-wise difference; counts may go negative
+    _expand_to(o.h.size());
+    for (unsigned p = 0; p < o.h.size(); ++p)
+      h[p] -= o.h[p];
+    _contract();
+  }
+
+  int32_t upper_bound() const {  // 2^(number of bins), saturated: 1 << 31 overflows int32_t
+    return h.size() < 31 ? int32_t(1) << h.size() : 0x7fffffff;
+  }
+
+  /// decay histogram by N bits (default 1, for a halflife)
+  void decay(int bits = 1);
+
+  void dump(ceph::Formatter *f) const;
+  void encode(ceph::buffer::list &bl) const;
+  void decode(ceph::buffer::list::const_iterator &bl);
+  static void generate_test_instances(std::list<pow2_hist_t*>& o);
+};
+WRITE_CLASS_ENCODER(pow2_hist_t)
+
+#endif /* CEPH_HISTOGRAM_H */
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
new file mode 100644
index 000000000..1aee4cc42
--- /dev/null
+++ b/src/common/hobject.cc
@@ -0,0 +1,611 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <charconv>
+
+#include "hobject.h"
+#include "common/Formatter.h"
+
+using std::list;
+using std::ostream;
+using std::set;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+// Append `in` to *out using the escaping applied to the string fields of
+// to_str(): '%' becomes "%p", '.' becomes "%e" and '_' becomes "%u", so the
+// appended text never contains a literal '.' or '_'. All other characters
+// are copied unchanged.
+static void append_escaped(const string &in, string *out) {
+  for (const char c : in) {
+    if (c == '%') {
+      out->append("%p");
+    } else if (c == '.') {
+      out->append("%e");
+    } else if (c == '_') {
+      out->append("%u");
+    } else {
+      out->push_back(c);
+    }
+  }
+}
+
+// Return the to_str()-style key prefixes ("<pool as 16 hex digits>." followed
+// by the leading nibbles of the nibble-reversed hash) that together cover all
+// hashes whose low `bits` bits equal `mask`. Aborts if bits > 32.
+set<string> hobject_t::get_prefixes(uint32_t bits, uint32_t mask, int64_t pool)
+{
+  // round the bit count up to a whole number of nibbles
+  uint32_t len = bits;
+  while (len % 4) len++;
+
+  set<uint32_t> from;
+  if (bits < 32)
+    from.insert(mask & ~((uint32_t)(~0) << bits));
+  else if (bits == 32)
+    from.insert(mask);
+  else
+    ceph_abort();
+
+  // The bits between `bits` and the nibble boundary are unconstrained:
+  // enumerate both values of each, doubling the candidate set per bit.
+  for (uint32_t i = bits; i < len; ++i) {
+    set<uint32_t> expanded;
+    for (const uint32_t base : from) {
+      expanded.insert(base | (1U << i));
+      expanded.insert(base);
+    }
+    from.swap(expanded);
+  }
+
+  // "<pool as 16 hex digits>." is shared by every returned prefix
+  char buf[20];
+  char *t = buf;
+  uint64_t poolid(pool);
+  t += snprintf(t, sizeof(buf), "%.*llX", 16, (long long unsigned)poolid);
+  *(t++) = '.';
+  const string poolstr(buf, t - buf);
+
+  set<string> ret;
+  for (const uint32_t h : from) {
+    const uint32_t revhash = hobject_t::_reverse_nibbles(h);
+    // buf is reused: 8 hex digits, of which only the first len/4 are kept
+    snprintf(buf, sizeof(buf), "%.*X", (int)(sizeof(revhash))*2, revhash);
+    ret.insert(poolstr + string(buf, len/4));
+  }
+  return ret;
+}
+
+// Format as "<pool: 16 hex digits>.<nibblewise key: 8 hex digits>.<snap>."
+// followed by the escaped name, key and namespace joined by '.'; <snap> is
+// "head", "snapdir" or the snap id in lowercase hex.
+string hobject_t::to_str() const {
+  char snap_with_hash[1000];
+  char *t = snap_with_hash;
+  const char *end = t + sizeof(snap_with_hash);
+
+  uint64_t poolid(pool);
+  t += snprintf(t, end - t, "%.*llX", 16, (long long unsigned)poolid);
+
+  uint32_t revhash(get_nibblewise_key_u32());
+  t += snprintf(t, end - t, ".%.*X", 8, revhash);
+
+  if (snap == CEPH_NOSNAP)
+    t += snprintf(t, end - t, ".head");
+  else if (snap == CEPH_SNAPDIR)
+    t += snprintf(t, end - t, ".snapdir");
+  else
+    t += snprintf(t, end - t, ".%llx", (long long unsigned)snap);
+
+  // fixed-format header done; the escaped strings follow, each after a '.'
+  string out(snap_with_hash, t);
+  out.push_back('.');
+  append_escaped(oid.name, &out);
+  out.push_back('.');
+  append_escaped(get_key(), &out);
+  out.push_back('.');
+  append_escaped(nspace, &out);
+
+  return out;
+}
+
+// Encode as v4 (compat 3); an object flagged max must equal the canonical get_max().
+void hobject_t::encode(bufferlist& bl) const {
+  ENCODE_START(4, 3, bl);
+  encode(key, bl);
+  encode(oid, bl);
+  encode(snap, bl);
+  encode(hash, bl);
+  encode(max, bl);
+  encode(nspace, bl);
+  encode(pool, bl);
+  ceph_assert(!max || (*this == hobject_t(hobject_t::get_max())));
+  ENCODE_FINISH(bl);
+}
+
+// Decode any encoding version up to 4. Fields that older versions did not
+// carry keep their current value (key, nspace, pool) or are reset (max), and
+// the hash cache is rebuilt at the end.
+void hobject_t::decode(bufferlist::const_iterator& bl) {
+  DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+  if (struct_v >= 1)
+    decode(key, bl);
+  decode(oid, bl);
+  decode(snap, bl);
+  decode(hash, bl);
+  if (struct_v >= 2)
+    decode(max, bl);
+  else
+    max = false;
+  if (struct_v >= 4) {
+    decode(nspace, bl);
+    decode(pool, bl);
+    // for compat with hammer, which did not handle the transition
+    // from pool -1 -> pool INT64_MIN for MIN properly.  this object
+    // name looks a bit like a pgmeta object for the meta collection,
+    // but those do not ever exist (and is_pgmeta() pool >= 0).
+    const bool hammer_min =
+      pool == -1 && snap == 0 && hash == 0 && !max && oid.name.empty();
+    if (hammer_min) {
+      pool = INT64_MIN;
+      ceph_assert(is_min());
+    }
+
+    // for compatibility with some earlier versions which might have encoded
+    // a non-canonical max object
+    if (max) {
+      *this = hobject_t::get_max();
+    }
+  }
+  DECODE_FINISH(bl);
+  build_hash_cache();
+}
+
+// Set fields from a JSON object using the key names written by dump();
+// unrecognised keys are ignored and absent fields are left unchanged.
+void hobject_t::decode(json_spirit::Value& v) {
+  using namespace json_spirit;
+  for (Pair& p : v.get_obj()) {
+    const string& name = p.name_;
+    if (name == "oid")
+      oid.name = p.value_.get_str();
+    else if (name == "key")
+      key = p.value_.get_str();
+    else if (name == "snapid")
+      snap = p.value_.get_uint64();
+    else if (name == "hash")
+      hash = p.value_.get_int();
+    else if (name == "max")
+      max = p.value_.get_int();
+    else if (name == "pool")
+      pool = p.value_.get_int();
+    else if (name == "namespace")
+      nspace = p.value_.get_str();
+  }
+  build_hash_cache();
+}
+
+// Emit the identifying fields under the key names decode(json) reads back.
+void hobject_t::dump(Formatter *f) const {
+  f->dump_string("oid", oid.name);
+  f->dump_string("key", key);
+  f->dump_int("snapid", snap);
+  f->dump_int("hash", hash);
+  f->dump_int("max", (int)max);
+  f->dump_int("pool", pool);
+  f->dump_string("namespace", nspace);
+}
+
+// Test fixtures: a default object, one flagged max, and three named objects.
+void hobject_t::generate_test_instances(list<hobject_t*>& o) {
+  o.push_back(new hobject_t);
+  o.push_back(new hobject_t);
+  o.back()->max = true;
+  o.push_back(new hobject_t(object_t("oname"), string(), 1, 234, -1, ""));
+  o.push_back(new hobject_t(object_t("oname2"), string("okey"),
+                            CEPH_NOSNAP, 67, 0, "n1"));
+  o.push_back(new hobject_t(object_t("oname3"), string("oname3"),
+                            CEPH_SNAPDIR, 910, 1, "n2"));
+}
+
+// Append `in` to *out in the form used by operator<<: bytes below 32, bytes
+// >= 127 and the characters '%', ':' and '/' are written as '%' plus two
+// lowercase hex digits; everything else is copied as-is.
+static void append_out_escaped(const string &in, string *out) {
+  for (const char c : in) {
+    const int v = static_cast<unsigned char>(c);
+    if (v >= 32 && v < 127 && v != '%' && v != ':' && v != '/') {
+      out->push_back(c);
+      continue;
+    }
+    // buf starts as "%0"; a one-digit value lands after the '0', a two-digit
+    // value overwrites it, and the zero-initialised tail terminates the string
+    char buf[4] = {'%', '0'};
+    std::to_chars(v <= 0x0f ? buf + 2 : buf + 1, buf + 3, v, 16);
+    out->append(buf);
+  }
+}
+
+// Undo append_out_escaped(): copy from `in` into *out until an unescaped ':' or
+// the end of the string, turning each "%XX" into the byte with that hex value.
+// Returns a pointer to the terminating ':' or NUL. An escape cut short by the
+// end of the string stops decoding at the NUL instead of reading past it.
+static const char *decode_out_escaped(const char *in, string *out)
+{
+  for (; *in && *in != ':'; ++in) {
+    if (*in != '%') {
+      out->push_back(*in);
+      continue;
+    }
+    if (!in[1] || !in[2])  // truncated escape: report end of input
+      return in + (in[1] ? 2 : 1);
+    const char buf[3] = {in[1], in[2], 0};
+    out->push_back(static_cast<char>(strtol(buf, NULL, 16)));
+    in += 2;  // the loop increment then steps over the second digit
+  }
+  return in;
+}
+
+// Print "MIN", "MAX", or "<pool>:<bitwise key, 8 hex>:<nspace>:<key>:<name>:<snap>"
+// with the three strings escaped by append_out_escaped(). Note the stream is
+// left with fill ' ' and decimal base regardless of its state on entry.
+ostream& operator<<(ostream& out, const hobject_t& o) {
+  if (o == hobject_t())
+    return out << "MIN";
+  if (o.is_max())
+    return out << "MAX";
+  out << o.pool << ':' << std::hex;
+  out.width(8);
+  out.fill('0');
+  out << o.get_bitwise_key_u32();
+  out.width(0);
+  out.fill(' ');
+  out << std::dec << ':';
+
+  string fields;
+  append_out_escaped(o.nspace, &fields);
+  fields.push_back(':');
+  append_out_escaped(o.get_key(), &fields);
+  fields.push_back(':');
+  append_out_escaped(o.oid.name, &fields);
+  return out << fields << ':' << o.snap;
+}
+
+// Parse the text form written by operator<< ("MIN", "MAX", or
+// "<pool>:<hash hex>:<nspace>:<key>:<name>:<snap>"). Returns false, leaving
+// *this untouched, if s does not match.
+bool hobject_t::parse(const string &s) {
+  if (s == "MIN") {
+    *this = hobject_t();
+    return true;
+  }
+  if (s == "MAX") {
+    *this = hobject_t::get_max();
+    return true;
+  }
+
+  // "<pool>:<hash>:" -- read both numbers, then step over them by hand
+  const char *start = s.c_str();
+  long long po;
+  unsigned h;
+  int r = sscanf(start, "%lld:%x:", &po, &h);
+  if (r != 2)
+    return false;
+  for (; *start && *start != ':'; ++start) ;
+  for (++start; *start && isxdigit(*start); ++start) ;
+  if (*start != ':')
+    return false;
+
+  // three ':'-terminated escaped strings: namespace, key, object name
+  string ns, k, name;
+  const char *p = start;
+  for (string *field : {&ns, &k, &name}) {
+    p = decode_out_escaped(p + 1, field);
+    if (*p != ':')
+      return false;
+  }
+  start = p + 1;
+
+  // snapshot: the literal "head" or a hex snap id, with nothing after it
+  unsigned long long sn;
+  if (strcmp(start, "head") == 0) {
+    sn = CEPH_NOSNAP;
+  } else {
+    r = sscanf(start, "%llx", &sn);
+    if (r != 1)
+      return false;
+    for (++start; *start && isxdigit(*start); ++start) ;
+    if (*start)
+      return false;
+  }
+
+  // all checks passed: commit (name before key, since set_key compares them)
+  max = false;
+  pool = po;
+  set_hash(_reverse_bits(h));
+  nspace = ns;
+  oid.name = name;
+  set_key(k);
+  snap = sn;
+  return true;
+}
+
+// Three-way order: max, pool, bitwise key, nspace, effective key, oid, snap.
+int cmp(const hobject_t& l, const hobject_t& r) {
+  if (l.max < r.max)
+    return -1;
+  if (l.max > r.max)
+    return 1;
+  if (l.pool < r.pool)
+    return -1;
+  if (l.pool > r.pool)
+    return 1;
+  if (l.get_bitwise_key() < r.get_bitwise_key())
+    return -1;
+  if (l.get_bitwise_key() > r.get_bitwise_key())
+    return 1;
+  if (l.nspace < r.nspace)
+    return -1;
+  if (l.nspace > r.nspace)
+    return 1;
+  if (!(l.get_key().empty() && r.get_key().empty())) {  // skipped when neither has a key
+    if (l.get_effective_key() < r.get_effective_key()) {
+      return -1;
+    }
+    if (l.get_effective_key() > r.get_effective_key()) {
+      return 1;
+    }
+  }
+  if (l.oid < r.oid)
+    return -1;
+  if (l.oid > r.oid)
+    return 1;
+  if (l.snap < r.snap)
+    return -1;
+  if (l.snap > r.snap)
+    return 1;
+  return 0;
+}
+
+
+
+// Encode as v6 (compat 3). The first seven fields are written in the same
+// order as hobject_t::encode() writes them, so the result is compatible with
+// decode for hobject_t prior to version 5.
+void ghobject_t::encode(bufferlist& bl) const {
+  // when changing this, remember to update encoded_size() too.
+  ENCODE_START(6, 3, bl);
+  encode(hobj.key, bl);
+  encode(hobj.oid, bl);
+  encode(hobj.snap, bl);
+  encode(hobj.hash, bl);
+  encode(hobj.max, bl);
+  encode(hobj.nspace, bl);
+  encode(hobj.pool, bl);
+  encode(generation, bl);
+  encode(shard_id, bl);
+  encode(max, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Number of bytes encode() appends, computed without encoding anything.
+size_t ghobject_t::encoded_size() const {
+  // this is not in order of encoding or appearance, but rather
+  // in order of known constants first, so it can be (mostly) computed
+  // at compile time.
+  //  - encoding header + 3 string lengths
+  size_t r = sizeof(ceph_le32) + 2 * sizeof(__u8) + 3 * sizeof(__u32);
+
+  // hobj.snap
+  r += sizeof(uint64_t);
+
+  // hobj.hash
+  r += sizeof(uint32_t);
+
+  // hobj.max
+  r += sizeof(bool);
+
+  // hobj.pool
+  r += sizeof(uint64_t);
+
+  // generation
+  r += sizeof(uint64_t);
+
+  // shard_id
+  r += sizeof(int8_t);
+
+  // max
+  r += sizeof(bool);
+
+  // hobj.key (string body; its length prefix is counted above)
+  r += hobj.key.size();
+
+  // hobj.oid.name
+  r += hobj.oid.name.size();
+
+  // hobj.nspace
+  r += hobj.nspace.size();
+
+  return r;
+}
+
+// Decode any encoding version up to 6. Fields missing from older versions
+// are defaulted: hobj.max and max to false, generation to NO_GEN, shard_id
+// to NO_SHARD. The hobject hash cache is rebuilt at the end.
+void ghobject_t::decode(bufferlist::const_iterator& bl) {
+  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
+  if (struct_v >= 1)
+    decode(hobj.key, bl);
+  decode(hobj.oid, bl);
+  decode(hobj.snap, bl);
+  decode(hobj.hash, bl);
+  if (struct_v >= 2)
+    decode(hobj.max, bl);
+  else
+    hobj.max = false;
+  if (struct_v >= 4) {
+    decode(hobj.nspace, bl);
+    decode(hobj.pool, bl);
+    // for compat with hammer, which did not handle the transition from
+    // pool -1 -> pool INT64_MIN for MIN properly (see hobject_t::decode()).
+    const bool hammer_min = hobj.pool == -1 && hobj.snap == 0 &&
+      hobj.hash == 0 && !hobj.max && hobj.oid.name.empty();
+    if (hammer_min) {
+      hobj.pool = INT64_MIN;
+      ceph_assert(hobj.is_min());
+    }
+  }
+  if (struct_v >= 5) {
+    decode(generation, bl);
+    decode(shard_id, bl);
+  } else {
+    generation = ghobject_t::NO_GEN;
+    shard_id = shard_id_t::NO_SHARD;
+  }
+  if (struct_v >= 6) {
+    decode(max, bl);
+  } else {
+    max = false;
+  }
+  DECODE_FINISH(bl);
+  hobj.build_hash_cache();
+}
+
+// Decode the embedded hobject_t, then generation, shard_id and max. Note the
+// "max" key is read by both hobj.decode() and this loop.
+void ghobject_t::decode(json_spirit::Value& v) {
+  hobj.decode(v);
+  using namespace json_spirit;
+  for (Pair& p : v.get_obj()) {
+    const string& name = p.name_;
+    if (name == "generation")
+      generation = p.value_.get_uint64();
+    else if (name == "shard_id")
+      shard_id.id = p.value_.get_int();
+    else if (name == "max")
+      max = p.value_.get_int();
+  }
+}
+
+// Dump hobj, then generation and shard_id only when set, then max.
+void ghobject_t::dump(Formatter *f) const {
+  hobj.dump(f);
+  if (generation != NO_GEN)
+    f->dump_int("generation", generation);
+  if (shard_id != shard_id_t::NO_SHARD)
+    f->dump_int("shard_id", shard_id);
+  f->dump_int("max", (int)max);
+}
+
+// Test fixtures: default, hobj.max set, and named objects over several generations and shards.
+void ghobject_t::generate_test_instances(list<ghobject_t*>& o) {
+  o.push_back(new ghobject_t);
+  o.push_back(new ghobject_t);
+  o.back()->hobj.max = true;
+  o.push_back(new ghobject_t(hobject_t(object_t("oname"), string(), 1, 234, -1, "")));
+
+  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"),
+        CEPH_NOSNAP, 67, 0, "n1"), 1, shard_id_t(0)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"),
+        CEPH_NOSNAP, 67, 0, "n1"), 1, shard_id_t(1)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"),
+        CEPH_NOSNAP, 67, 0, "n1"), 1, shard_id_t(2)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 1, shard_id_t(0)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 2, shard_id_t(0)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 3, shard_id_t(0)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 3, shard_id_t(1)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 3, shard_id_t(2)));
+}
+
+// Print "GHMIN", "GHMAX", or "[shard hex]#<hobject>#[generation hex]".
+ostream& operator<<(ostream& out, const ghobject_t& o) {
+  if (o == ghobject_t())
+    return out << "GHMIN";
+  if (o.is_max())
+    return out << "GHMAX";
+  if (o.shard_id != shard_id_t::NO_SHARD)
+    out << std::hex << o.shard_id << std::dec;
+  out << '#' << o.hobj << '#';
+  if (o.generation != ghobject_t::NO_GEN)
+    out << std::hex << (unsigned long long)(o.generation) << std::dec;
+  return out;
+}
+
+// Parse the text form written by operator<<: "GHMIN", "GHMAX", or
+// "[shard hex]#<hobject_t text>#[generation hex]". Returns false, leaving
+// *this untouched, if s does not match.
+bool ghobject_t::parse(const string& s) {
+  if (s == "GHMIN") {
+    *this = ghobject_t();
+    return true;
+  }
+  if (s == "GHMAX") {
+    *this = ghobject_t::get_max();
+    return true;
+  }
+
+  // look for shard# prefix: optional hex digits, then a mandatory '#'
+  const char *start = s.c_str();
+  const char *p;
+  int sh = shard_id_t::NO_SHARD;
+  for (p = start; isxdigit((unsigned char)*p); ++p) ;
+  if (*p != '#')
+    return false;
+  if (p > start) {
+    int r = sscanf(s.c_str(), "%x", &sh);
+    if (r < 1)
+      return false;
+  }
+  start = p + 1;
+
+  // look for #generation suffix, scanning back from the end but never past start
+  long long unsigned g = NO_GEN;
+  const char *last = start + strlen(start) - 1;
+  p = last;
+  while (p >= start && isxdigit((unsigned char)*p))
+    p--;
+  if (p < start || *p != '#')
+    return false;
+  if (p < last) {
+    sscanf(p + 1, "%llx", &g);
+  }
+
+  string inner(start, p - start);
+  hobject_t h;
+  if (!h.parse(inner)) {
+    return false;
+  }
+
+  shard_id = shard_id_t(sh);
+  hobj = h;
+  generation = g;
+  max = false;
+  return true;
+}
+
+// Three-way order: max flag, shard, the embedded hobject_t, then generation.
+int cmp(const ghobject_t& l, const ghobject_t& r) {
+  if (l.max < r.max)
+    return -1;
+  if (l.max > r.max)
+    return 1;
+  if (l.shard_id < r.shard_id)
+    return -1;
+  if (l.shard_id > r.shard_id)
+    return 1;
+  const int by_hobj = cmp(l.hobj, r.hobj);
+  if (by_hobj != 0)
+    return by_hobj;
+  if (l.generation < r.generation)
+    return -1;
+  if (l.generation > r.generation)
+    return 1;
+  return 0;
+}
diff --git a/src/common/hobject.h b/src/common/hobject.h
new file mode 100644
index 000000000..34191ccf5
--- /dev/null
+++ b/src/common/hobject.h
@@ -0,0 +1,515 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __CEPH_OS_HOBJECT_H
+#define __CEPH_OS_HOBJECT_H
+
+#if FMT_VERSION >= 90000
+#include <fmt/ostream.h>
+#endif
+
+#include <utility>
+
+#include "include/types.h"
+
+#include "json_spirit/json_spirit_value.h"
+#include "include/ceph_assert.h"   // spirit clobbers it!
+
+#include "reverse.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+#ifndef UINT64_MAX
+#define UINT64_MAX (18446744073709551615ULL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN ((int64_t)0x8000000000000000ll)
+#endif
+
+struct hobject_t {
+public:
+  static const int64_t POOL_META = -1;
+  static const int64_t POOL_TEMP_START = -2; // and then negative
+
+  static bool is_temp_pool(int64_t pool) {
+    return pool <= POOL_TEMP_START;
+  }
+  static int64_t get_temp_pool(int64_t pool) {
+    return POOL_TEMP_START - pool;
+  }
+  static bool is_meta_pool(int64_t pool) {
+    return pool == POOL_META;
+  }
+
+public:
+  object_t oid;
+  snapid_t snap;
+private:
+  uint32_t hash;
+  bool max;
+  uint32_t nibblewise_key_cache;
+  uint32_t hash_reverse_bits;
+public:
+  int64_t pool;
+  std::string nspace;
+
+private:
+  std::string key;
+
+  class hobject_t_max {};
+
+public:
+  const std::string& get_key() const {
+    return key;
+  }
+
+  void set_key(const std::string& key_) {
+    if (key_ == oid.name)
+      key.clear();
+    else
+      key = key_;
+  }
+
+  std::string to_str() const;
+  
+  uint32_t get_hash() const { 
+    return hash;
+  }
+  void set_hash(uint32_t value) { 
+    hash = value;
+    build_hash_cache();
+  }
+
+  /// @return true iff to_check and match agree in their low `bits` bits.
+  /// bits >= 32 compares all 32 bits.
+  static bool match_hash(uint32_t to_check, uint32_t bits, uint32_t match) {
+    // Build the mask in unsigned arithmetic and special-case bits >= 32:
+    // shifting by the full width of the type (or more) is undefined.
+    const uint32_t mask =
+      (bits >= 32) ? ~uint32_t(0) : ~(~uint32_t(0) << bits);
+    return (match & mask) == (to_check & mask);
+  }
+  bool match(uint32_t bits, uint32_t match) const {
+    return match_hash(hash, bits, match);
+  }
+
+  bool is_temp() const {
+    return is_temp_pool(pool) && pool != INT64_MIN;
+  }
+  bool is_meta() const {
+    return is_meta_pool(pool);
+  }
+  /// @return the pool id with the temp-pool mapping undone: for a temp
+  /// pool id this is get_temp_pool(pool), otherwise pool itself.
+  int64_t get_logical_pool() const {
+    // get_temp_pool() is its own inverse, so it also maps temp -> regular
+    return is_temp_pool(pool) ? get_temp_pool(pool) : pool;
+  }
+
+  hobject_t() : snap(0), hash(0), max(false), pool(INT64_MIN) {
+    build_hash_cache();
+  }
+
+  hobject_t(const hobject_t &rhs) = default;
+  hobject_t(hobject_t &&rhs) = default;
+  hobject_t(hobject_t_max &&singleton) : hobject_t() {
+    max = true;
+  }
+  hobject_t &operator=(const hobject_t &rhs) = default;
+  hobject_t &operator=(hobject_t &&rhs) = default;
+  hobject_t &operator=(hobject_t_max &&singleton) {
+    *this = hobject_t();
+    max = true;
+    return *this;
+  }
+
+  // maximum sorted value.
+  static hobject_t_max get_max() {
+    return hobject_t_max();
+  }
+
+  hobject_t(const object_t& oid, const std::string& key, snapid_t snap,
+            uint32_t hash, int64_t pool, const std::string& nspace)
+    : oid(oid), snap(snap), hash(hash), max(false),
+      pool(pool), nspace(nspace),
+      key(oid.name == key ? std::string() : key) {
+    build_hash_cache();
+  }
+
+  hobject_t(const sobject_t &soid, const std::string &key, uint32_t hash,
+	    int64_t pool, const std::string& nspace)
+    : oid(soid.oid), snap(soid.snap), hash(hash), max(false),
+      pool(pool), nspace(nspace),
+      key(soid.oid.name == key ? std::string() : key) {
+    build_hash_cache();
+  }
+
+  // used by Crimson
+  hobject_t(const std::string &key, snapid_t snap, uint32_t reversed_hash,
+            int64_t pool, const std::string& nspace)
+    : oid(key), snap(snap), max(false), pool(pool), nspace(nspace) {
+    set_bitwise_key_u32(reversed_hash);
+  }
+
+  /// @return min hobject_t ret s.t. ret.hash == this->hash
+  hobject_t get_boundary() const {
+    if (is_max())
+      return *this;
+    hobject_t ret;
+    ret.set_hash(hash);
+    ret.pool = pool;
+    return ret;
+  }
+
+  hobject_t get_object_boundary() const {
+    if (is_max())
+      return *this;
+    hobject_t ret = *this;
+    ret.snap = 0;
+    return ret;
+  }
+
+  /// @return head version of this hobject_t
+  hobject_t get_head() const {
+    hobject_t ret(*this);
+    ret.snap = CEPH_NOSNAP;
+    return ret;
+  }
+
+  /// @return snapdir version of this hobject_t
+  hobject_t get_snapdir() const {
+    hobject_t ret(*this);
+    ret.snap = CEPH_SNAPDIR;
+    return ret;
+  }
+
+  /// @return true if object is snapdir
+  bool is_snapdir() const {
+    return snap == CEPH_SNAPDIR;
+  }
+
+  /// @return true if object is head
+  bool is_head() const {
+    return snap == CEPH_NOSNAP;
+  }
+
+  /// @return true if object is neither head nor snapdir nor max
+  bool is_snap() const {
+    return !is_max() && !is_head() && !is_snapdir();
+  }
+
+  /// @return true iff the object should have a snapset in its attrs
+  bool has_snapset() const {
+    return is_head() || is_snapdir();
+  }
+
+  /* Do not use when a particular hash function is needed */
+  explicit hobject_t(const sobject_t &o) :
+    oid(o.oid), snap(o.snap), max(false), pool(POOL_META) {
+    set_hash(std::hash<sobject_t>()(o));
+  }
+
+  bool is_max() const {
+    ceph_assert(!max || (*this == hobject_t(hobject_t::get_max())));
+    return max;
+  }
+  bool is_min() const {
+    // this needs to match how it's constructed
+    return snap == 0 &&
+	   hash == 0 &&
+	   !max &&
+	   pool == INT64_MIN;
+  }
+
+  static uint32_t _reverse_bits(uint32_t v) {
+    return reverse_bits(v);
+  }
+  static uint32_t _reverse_nibbles(uint32_t retval) {
+    return reverse_nibbles(retval);
+  }
+
+  /**
+   * Returns set S of strings such that for any object
+   * h where h.match(bits, mask), there is some string
+   * s \f$\in\f$ S such that s is a prefix of h.to_str().
+   * Furthermore, for any s \f$\in\f$ S, s is a prefix of
+   * h.to_str() implies that h.match(bits, mask).
+   */
+  static std::set<std::string> get_prefixes(
+    uint32_t bits,
+    uint32_t mask,
+    int64_t pool);
+
+  // filestore nibble-based key
+  uint32_t get_nibblewise_key_u32() const {
+    ceph_assert(!max);
+    return nibblewise_key_cache;
+  }
+  uint64_t get_nibblewise_key() const {
+    return max ? 0x100000000ull : nibblewise_key_cache;
+  }
+
+  // newer bit-reversed key
+  uint32_t get_bitwise_key_u32() const {
+    ceph_assert(!max);
+    return hash_reverse_bits;
+  }
+  uint64_t get_bitwise_key() const {
+    return max ? 0x100000000ull : hash_reverse_bits;
+  }
+
+  // please remember to update set_bitwise_key_u32() also
+  // once you change build_hash_cache()
+  void build_hash_cache() {
+    nibblewise_key_cache = _reverse_nibbles(hash);
+    hash_reverse_bits = _reverse_bits(hash);
+  }
+  void set_bitwise_key_u32(uint32_t value) {
+    hash = _reverse_bits(value);
+    // below is identical to build_hash_cache() and shall be
+    // updated correspondingly if you change build_hash_cache() 
+    nibblewise_key_cache = _reverse_nibbles(hash);
+    hash_reverse_bits = value;
+  }
+
+  const std::string& get_effective_key() const {
+    if (key.length())
+      return key;
+    return oid.name;
+  }
+
+  hobject_t make_temp_hobject(const std::string& name) const {
+    return hobject_t(object_t(name), "", CEPH_NOSNAP,
+		     hash,
+		     get_temp_pool(pool),
+		     "");
+  }
+
+  /// Exchange the contents of *this and o.
+  void swap(hobject_t &o) {
+    // Move rather than copy: avoids three deep copies of the string
+    // members (oid, nspace, key).
+    hobject_t temp(std::move(o));
+    o = std::move(*this);
+    (*this) = std::move(temp);
+  }
+
+  const std::string &get_namespace() const {
+    return nspace;
+  }
+
+  bool parse(const std::string& s);
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::bufferlist::const_iterator& bl);
+  void decode(json_spirit::Value& v);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<hobject_t*>& o);
+  friend int cmp(const hobject_t& l, const hobject_t& r);
+  /// Three-way comparison.  Fields are compared in this order, the first
+  /// difference deciding: max, pool, bit-reversed hash key, nspace,
+  /// effective key (only if at least one side has an explicit key),
+  /// oid, snap.
+  auto operator<=>(const hobject_t &rhs) const noexcept {
+    // NB: this local shadows the friend function cmp() inside this body
+    auto cmp = max <=> rhs.max;
+    if (cmp != 0) return cmp;
+    cmp = pool <=> rhs.pool;
+    if (cmp != 0) return cmp;
+    cmp = get_bitwise_key() <=> rhs.get_bitwise_key();
+    if (cmp != 0) return cmp;
+    cmp = nspace <=> rhs.nspace;
+    if (cmp != 0) return cmp;
+    // With no explicit key on either side the effective key is oid.name
+    // for both, so that case is left to the oid comparison below.
+    if (!(get_key().empty() && rhs.get_key().empty())) {
+      cmp = get_effective_key() <=> rhs.get_effective_key();
+      if (cmp != 0) return cmp;
+    }
+    cmp = oid <=> rhs.oid;
+    if (cmp != 0) return cmp;
+    return snap <=> rhs.snap;
+  }
+  bool operator==(const hobject_t& rhs) const noexcept {
+    return operator<=>(rhs) == 0;
+  }
+  friend struct ghobject_t;
+};
+WRITE_CLASS_ENCODER(hobject_t)
+
+namespace std {
+template<> struct hash<hobject_t> {
+  size_t operator()(const hobject_t &r) const {
+    static rjhash<uint64_t> RJ;
+    return RJ(r.get_hash() ^ r.snap);
+  }
+};
+} // namespace std
+
+std::ostream& operator<<(std::ostream& out, const hobject_t& o);
+
+/// Type-dependent alias for std::false_type.  Writing a static_assert
+/// condition as always_false<T>::value::value makes it depend on T, so
+/// the assertion fires only when the enclosing template is instantiated.
+template <typename T>
+struct always_false {
+  using value = std::false_type;
+};
+
+template <typename T>
+inline bool operator==(const hobject_t &lhs, const T&) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return lhs.is_max();
+}
+template <typename T>
+inline bool operator==(const T&, const hobject_t &rhs) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return rhs.is_max();
+}
+template <typename T>
+inline bool operator!=(const hobject_t &lhs, const T&) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return !lhs.is_max();
+}
+template <typename T>
+inline bool operator!=(const T&, const hobject_t &rhs) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return !rhs.is_max();
+}
+
+extern int cmp(const hobject_t& l, const hobject_t& r);
+template <typename T>
+static inline int cmp(const hobject_t &l, const T&) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return l.is_max() ? 0 : -1;
+}
+template <typename T>
+static inline int cmp(const T&, const hobject_t&r) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return r.is_max() ? 0 : 1;
+}
+
+
+
+typedef version_t gen_t;
+
+struct ghobject_t {
+  static const gen_t NO_GEN = UINT64_MAX;
+
+  bool max = false;
+  shard_id_t shard_id = shard_id_t::NO_SHARD;
+  hobject_t hobj;
+  gen_t generation = NO_GEN;
+
+  ghobject_t() = default;
+
+  explicit ghobject_t(const hobject_t &obj)
+    : hobj(obj) {}
+
+  ghobject_t(const hobject_t &obj, gen_t gen, shard_id_t shard)
+    : shard_id(shard),
+      hobj(obj),
+      generation(gen) {}
+
+  // used by Crimson
+  ghobject_t(shard_id_t shard, int64_t pool, uint32_t reversed_hash,
+             const std::string& nspace, const std::string& oid,
+             snapid_t snap, gen_t gen)
+    : shard_id(shard),
+      hobj(oid, snap, reversed_hash, pool, nspace),
+      generation(gen) {}
+
+  static ghobject_t make_pgmeta(int64_t pool, uint32_t hash, shard_id_t shard) {
+    hobject_t h(object_t(), std::string(), CEPH_NOSNAP, hash, pool, std::string());
+    return ghobject_t(h, NO_GEN, shard);
+  }
+  bool is_pgmeta() const {
+    // make sure we are distinct from hobject_t(), which has pool INT64_MIN
+    return hobj.pool >= 0 && hobj.oid.name.empty();
+  }
+
+  /// @return true iff the wrapped hobject's hash and `match` agree in
+  /// their low `bits` bits
+  bool match(uint32_t bits, uint32_t match) const {
+    return hobj.match(bits, match);
+  }
+  /// @return min ghobject_t ret s.t. ret.hash == this->hash
+  ghobject_t get_boundary() const {
+    if (hobj.is_max())
+      return *this;
+    ghobject_t ret;
+    ret.hobj.set_hash(hobj.hash);
+    ret.shard_id = shard_id;
+    ret.hobj.pool = hobj.pool;
+    return ret;
+  }
+  uint32_t get_nibblewise_key_u32() const {
+    return hobj.get_nibblewise_key_u32();
+  }
+  /// Nibblewise sort key of the wrapped hobject.
+  /// Returned as uint64_t: hobject_t::get_nibblewise_key() yields
+  /// 0x100000000 for the max object, which a uint32_t return would
+  /// silently truncate to 0.
+  uint64_t get_nibblewise_key() const {
+    return hobj.get_nibblewise_key();
+  }
+
+  bool is_degenerate() const {
+    return generation == NO_GEN && shard_id == shard_id_t::NO_SHARD;
+  }
+
+  bool is_no_gen() const {
+    return generation == NO_GEN;
+  }
+
+  bool is_no_shard() const {
+    return shard_id == shard_id_t::NO_SHARD;
+  }
+
+  void set_shard(shard_id_t s) {
+    shard_id = s;
+  }
+
+  bool parse(const std::string& s);
+
+  // maximum sorted value.
+  static ghobject_t get_max() {
+    ghobject_t h;
+    h.max = true;
+    h.hobj = hobject_t::get_max();  // so that is_max() => hobj.is_max()
+    return h;
+  }
+  bool is_max() const {
+    return max;
+  }
+  bool is_min() const {
+    return *this == ghobject_t();
+  }
+
+  /// Exchange the contents of *this and o.
+  void swap(ghobject_t &o) {
+    // Move rather than copy: avoids three deep copies of the embedded
+    // hobject_t and its string members.
+    ghobject_t temp(std::move(o));
+    o = std::move(*this);
+    (*this) = std::move(temp);
+  }
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void decode(json_spirit::Value& v);
+  size_t encoded_size() const;
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<ghobject_t*>& o);
+  friend int cmp(const ghobject_t& l, const ghobject_t& r);
+  auto operator<=>(const ghobject_t&) const = default;
+  bool operator==(const ghobject_t&) const = default;
+};
+WRITE_CLASS_ENCODER(ghobject_t)
+
+namespace std {
+  /// std::hash specialization for ghobject_t.  Combines the hash of the
+  /// embedded hobject_t with the generation and the shard id.
+  template<> struct hash<ghobject_t> {
+    size_t operator()(const ghobject_t &r) const {
+      static rjhash<uint64_t> RJ;
+      static hash<hobject_t> HO;
+      // NB: from here on the local `hash` hides the class template name
+      size_t hash = HO(r.hobj);
+      // fold in the generation, then re-mix with rjhash
+      hash = RJ(hash ^ r.generation);
+      // the shard id is xor-ed in last, without further mixing
+      hash = hash ^ r.shard_id.id;
+      return hash;
+    }
+  };
+} // namespace std
+
+std::ostream& operator<<(std::ostream& out, const ghobject_t& o);
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<ghobject_t> : fmt::ostream_formatter {};
+#endif
+
+extern int cmp(const ghobject_t& l, const ghobject_t& r);
+
+
+#endif
diff --git a/src/common/hobject_fmt.h b/src/common/hobject_fmt.h
new file mode 100644
index 000000000..622611121
--- /dev/null
+++ b/src/common/hobject_fmt.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+/**
+ * \file fmtlib formatters for some hobject.h classes
+ */
+#include <fmt/format.h>
+#include <fmt/ranges.h>
+
+#include "common/hobject.h"
+#include "include/object_fmt.h"
+#include "msg/msg_fmt.h"
+
+// \todo reimplement
+/// Append `in` to `*out`.  The characters '%', ':' and '/', and every
+/// character outside the range [32, 127), are written as "%xx" (the byte
+/// value as two lowercase hex digits); all others are copied verbatim.
+static inline void append_out_escaped(const std::string& in, std::string* out)
+{
+  for (const char ch : in) {
+    const bool is_plain =
+      ch != '%' && ch != ':' && ch != '/' && !(ch < 32) && !(ch >= 127);
+    if (is_plain) {
+      out->push_back(ch);
+      continue;
+    }
+    char hex[4];  // '%', two hex digits, terminating NUL
+    snprintf(hex, sizeof(hex), "%%%02x", (int)(unsigned char)ch);
+    out->append(hex);
+  }
+}
+
+/// fmtlib formatter for hobject_t.
+///
+/// Output is "MIN" for a default-constructed object, "MAX" for the max
+/// object, and otherwise
+/// "<pool>:<bitwise key, 8 hex digits>:<nspace>:<key>:<oid name>:<snap>",
+/// where nspace, key and oid name are escaped with append_out_escaped()
+/// and pool is printed as an unsigned 64-bit value.
+template <> struct fmt::formatter<hobject_t> {
+
+  /// No format-spec options are supported; the spec is not consumed.
+  constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
+
+  /// const-qualified so the formatter can be invoked through a const
+  /// object, which newer fmt releases do; it holds no state.
+  template <typename FormatContext>
+  auto format(const hobject_t& ho, FormatContext& ctx) const
+  {
+    if (ho == hobject_t{}) {
+      return fmt::format_to(ctx.out(), "MIN");
+    }
+
+    if (ho.is_max()) {
+      return fmt::format_to(ctx.out(), "MAX");
+    }
+
+    // escaped "<nspace>:<key>:<oid name>"
+    std::string v;
+    append_out_escaped(ho.nspace, &v);
+    v.push_back(':');
+    append_out_escaped(ho.get_key(), &v);
+    v.push_back(':');
+    append_out_escaped(ho.oid.name, &v);
+
+    return fmt::format_to(ctx.out(), "{}:{:08x}:{}:{}",
+                          static_cast<uint64_t>(ho.pool),
+                          ho.get_bitwise_key_u32(), v, ho.snap);
+  }
+};
diff --git a/src/common/hostname.cc b/src/common/hostname.cc
new file mode 100644
index 000000000..b452a5723
--- /dev/null
+++ b/src/common/hostname.cc
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/hostname.h"
+
+#include <unistd.h>
+
+#include "include/compat.h"
+
+/**
+ * Return the name of this host.
+ *
+ * If the NODE_NAME environment variable is set its value is returned
+ * as-is; otherwise the name reported by gethostname() is used.
+ *
+ * @return the host name, or an empty string if gethostname() fails
+ */
+std::string ceph_get_hostname()
+{
+  // are we in a container?  if so we would prefer the *real* hostname.
+  const char *node_name = getenv("NODE_NAME");
+  if (node_name) {
+    return node_name;
+  }
+
+  // Zero-fill the buffer and offer gethostname() one byte less than its
+  // size, so the result is NUL-terminated even if the name is truncated
+  // (POSIX leaves termination unspecified in that case).
+  char buf[1024] = {0};
+  if (gethostname(buf, sizeof(buf) - 1) != 0) {
+    // buffer contents are unspecified on failure; do not read them
+    return std::string();
+  }
+  return std::string(buf);
+}
+
+/// @return ceph_get_hostname() cut at its first '.', or the whole name
+/// if it contains no '.'
+std::string ceph_get_short_hostname()
+{
+  std::string name = ceph_get_hostname();
+  const std::string::size_type dot = name.find('.');
+  if (dot != std::string::npos) {
+    // drop the first '.' and everything after it
+    name.erase(dot);
+  }
+  return name;
+}
diff --git a/src/common/hostname.h b/src/common/hostname.h
new file mode 100644
index 000000000..9d270bf63
--- /dev/null
+++ b/src/common/hostname.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_HOSTNAME_H
+#define CEPH_COMMON_HOSTNAME_H
+
+#include <string>
+
+extern std::string ceph_get_hostname();
+extern std::string ceph_get_short_hostname();
+#endif
diff --git a/src/common/inline_variant.h b/src/common/inline_variant.h
new file mode 100644
index 000000000..28426ba71
--- /dev/null
+++ b/src/common/inline_variant.h
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:t -*-
+// vim: ts=8 sw=4 smarttab
+/*
+ * Copied from:
+ * https://github.com/exclipy/inline_variant_visitor/blob/master/inline_variant.hpp
+ */
+
+#ifndef INLINE_VARIANT_H
+#define INLINE_VARIANT_H
+
+#include <boost/function_types/function_arity.hpp>
+#include <boost/fusion/algorithm/transformation/transform.hpp>
+#include <boost/mpl/contains.hpp>
+#include <boost/mpl/map.hpp>
+#include <boost/mpl/vector.hpp>
+#include <boost/mpl/range_c.hpp>
+#include <boost/noncopyable.hpp>
+
+#include "function_signature.h"
+
+namespace detail {
+
+// A metafunction class for getting the argument type from a unary function or functor type
+struct function_arg_extractor
+{
+    // Function is either a function type like void(int const&), or a functor - eg. a class with void operator(int)
+    // Sets type to the argument type with the constness and referenceness stripped (eg. int)
+    template <typename Function>
+    struct apply
+    {
+    private:
+        typedef typename boost::remove_const< typename boost::remove_reference<Function>::type >::type bare_type;
+        typedef typename signature_of<bare_type>::type normalized_function_type;
+        typedef typename boost::function_types::function_arity<normalized_function_type>::type arity;
+        typedef typename boost::function_types::parameter_types<normalized_function_type>::type parameter_types;
+        typedef typename boost::function_types::result_type<normalized_function_type>::type result_type;
+
+        BOOST_STATIC_ASSERT_MSG((arity::value == 1), "make_visitor called with a non-unary function");
+
+        typedef typename boost::mpl::front<parameter_types>::type parameter_type;
+    public:
+        typedef typename boost::remove_const< typename boost::remove_reference<parameter_type>::type >::type type;
+    };
+};
+
+// A binary metafunction class: apply<AType, Ind>::type is
+// boost::mpl::pair<AType, Ind>.  generic_visitor uses it to pair each
+// handler's argument type with an integral index.
+struct make_pair
+{
+    template <typename AType, typename Ind>
+    struct apply {
+	typedef boost::mpl::pair<AType, Ind> type;
+    };
+};
+
+// A metafunction class that asserts the second argument is in Allowed, and returns void
+template<typename Allowed>
+struct check_in
+{
+    template <typename Type1, typename Type2>
+    struct apply
+    {
+    private:
+        BOOST_STATIC_ASSERT_MSG((boost::mpl::contains<Allowed, typename boost::mpl::first<Type2>::type>::value),
+                "make_visitor called with spurious handler functions");
+    public:
+        typedef void type;
+    };
+};
+
+// A metafunction that builds a boost::mpl::map from Seq by folding over
+// Seq and inserting each element into an initially empty map.
+template <typename Seq>
+struct as_map
+{
+private:
+    // Fold step: the map M with element P inserted.
+    struct insert_helper {
+	template <typename M, typename P>
+	struct apply
+	{
+	    typedef typename boost::mpl::insert<
+		M,
+		P>::type type;
+	};
+    };
+public:
+    typedef typename boost::mpl::fold<Seq, boost::mpl::map0<>, insert_helper>::type type;
+};
+
+// A functor template suitable for passing into apply_visitor.  The constructor accepts the list of handler functions,
+// which are then exposed through a set of operator()s
+template <typename Result, typename Variant, typename... Functions>
+struct generic_visitor : boost::static_visitor<Result>, boost::noncopyable
+{
+private:
+    typedef generic_visitor<Result, Variant, Functions...> type;
+
+    // Compute the function_map type
+    typedef boost::mpl::vector<Functions...> function_types;
+    typedef typename boost::mpl::transform<function_types, function_arg_extractor>::type arg_types;
+    typedef typename boost::mpl::transform<
+        arg_types,
+	boost::mpl::range_c<int, 0, boost::mpl::size<arg_types>::value>,
+	make_pair
+	>::type pair_list;
+    typedef typename as_map<pair_list>::type fmap;
+
+    // Check that the argument types are unique
+    BOOST_STATIC_ASSERT_MSG((boost::mpl::size<fmap>::value == boost::mpl::size<arg_types>::value),
+            "make_visitor called with non-unique argument types for handler functions");
+
+    // Check that there aren't any argument types not in the variant types
+    typedef typename boost::mpl::fold<fmap, void, check_in<typename Variant::types> >::type dummy;
+
+    boost::fusion::vector<Functions...> fvec;
+
+
+    template <typename T>
+    Result apply_helper(const T& object, boost::mpl::true_) const {
+	typedef typename boost::mpl::at<fmap, T>::type Ind;
+        return boost::fusion::at<Ind>(fvec)(object);
+    }
+
+    template <typename T>
+    Result apply_helper(const T& object, boost::mpl::false_) const {
+        return Result();
+    }
+
+    BOOST_MOVABLE_BUT_NOT_COPYABLE(generic_visitor)
+
+public:
+    generic_visitor(BOOST_RV_REF(type) other)
+    :
+        fvec(boost::move(other.fvec))
+    {
+    }
+    generic_visitor(Functions&&... functions)
+    :
+        fvec(std::forward<Functions>(functions)...)
+    {
+    }
+
+    template <typename T>
+    Result operator()(const T& object) const {
+        typedef typename boost::mpl::has_key<fmap, T>::type correct_key;
+        BOOST_STATIC_ASSERT_MSG(correct_key::value,
+            "make_visitor called without specifying handlers for all required types");
+        return apply_helper(object, correct_key());
+    }
+};
+
+// A metafunction class for getting the return type of a function
+struct function_return_extractor
+{
+    template <typename Function>
+    struct apply : boost::function_types::result_type<typename signature_of<Function>::type>
+    {
+    };
+};
+
+// A metafunction class that asserts the two arguments are the same and returns the first one
+struct check_same
+{
+    template <typename Type1, typename Type2>
+    struct apply
+    {
+    private:
+        BOOST_STATIC_ASSERT_MSG((boost::is_same<Type1, Type2>::value),
+                "make_visitor called with functions of differing return types");
+    public:
+        typedef Type1 type;
+    };
+};
+
+// A metafunction for getting the required generic_visitor type for the set of Functions
+template <typename Variant, typename... Functions>
+struct get_generic_visitor
+{
+private:
+    typedef boost::mpl::vector<Functions...> function_types;
+    typedef typename boost::mpl::transform<
+        function_types,
+        boost::remove_const< boost::remove_reference<boost::mpl::_1> >
+    >::type bare_function_types;
+    typedef typename boost::mpl::transform<bare_function_types, function_return_extractor>::type return_types;
+
+public:
+    // Set result_type to the return type of the first function
+    typedef typename boost::mpl::front<return_types>::type result_type;
+    typedef generic_visitor<result_type, Variant, Functions...> type;
+
+private:
+    // Assert that every return type is the same as the first one
+    typedef typename boost::mpl::fold<return_types, result_type, check_same>::type dummy;
+};
+
+// Accepts a set of functions and returns an object suitable for apply_visitor
+template <typename Variant, typename... Functions>
+auto make_visitor(BOOST_RV_REF(Functions)... functions)
+    -> typename detail::get_generic_visitor<Variant, Functions...>::type
+{
+    return typename detail::get_generic_visitor<Variant, Functions...>::type(boost::forward<Functions>(functions)...);
+}
+
+}
+
+// Applies to `variant` a visitor built from `functions` by
+// detail::make_visitor, via boost::apply_visitor, and returns the result.
+// The return type is get_generic_visitor<...>::result_type, i.e. the
+// return type of the first function.
+template <typename Variant, typename... Functions>
+auto match(Variant const& variant, BOOST_RV_REF(Functions)... functions)
+    -> typename detail::get_generic_visitor<Variant, Functions...>::result_type
+{
+    return boost::apply_visitor(detail::make_visitor<Variant>(
+        boost::forward<Functions>(functions)...), variant);
+}
+
+#endif
diff --git a/src/common/interval_map.h b/src/common/interval_map.h
new file mode 100644
index 000000000..65a89e211
--- /dev/null
+++ b/src/common/interval_map.h
@@ -0,0 +1,289 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef INTERVAL_MAP_H
+#define INTERVAL_MAP_H
+
+#include "include/interval_set.h"
+#include <initializer_list>
+
+template <typename K, typename V, typename S>
+/**
+ * interval_map
+ *
+ * Maps intervals to values.  Erasing or inserting over an existing
+ * range will use S::operator() to split any overlapping existing
+ * values.
+ *
+ * Surprisingly, boost/icl/interval_map doesn't seem to be appropriate
+ * for this use case.  The aggregation concept seems to assume
+ * commutativity, which doesn't work if we want more recent insertions
+ * to overwrite previous ones.
+ */
+class interval_map {
+  S s;
+  using map = std::map<K, std::pair<K, V> >;
+  using mapiter = typename std::map<K, std::pair<K, V> >::iterator;
+  using cmapiter = typename std::map<K, std::pair<K, V> >::const_iterator;
+  map m;
+  std::pair<mapiter, mapiter> get_range(K off, K len) {
+    // fst is first iterator with end after off (may be end)
+    auto fst = m.upper_bound(off);
+    if (fst != m.begin())
+      --fst;
+    if (fst != m.end() && off >= (fst->first + fst->second.first))
+      ++fst;
+
+    // lst is first iterator with start after off + len (may be end)
+    auto lst = m.lower_bound(off + len);
+    return std::make_pair(fst, lst);
+  }
+  std::pair<cmapiter, cmapiter> get_range(K off, K len) const {
+    // fst is first iterator with end after off (may be end)
+    auto fst = m.upper_bound(off);
+    if (fst != m.begin())
+      --fst;
+    if (fst != m.end() && off >= (fst->first + fst->second.first))
+      ++fst;
+
+    // lst is first iterator with start after off + len (may be end)
+    auto lst = m.lower_bound(off + len);
+    return std::make_pair(fst, lst);
+  }
+  void try_merge(mapiter niter) {
+    if (niter != m.begin()) {
+      auto prev = niter;
+      prev--;
+      if (prev->first + prev->second.first == niter->first &&
+	  s.can_merge(prev->second.second, niter->second.second)) {
+	V n = s.merge(
+	  std::move(prev->second.second),
+	  std::move(niter->second.second));
+	K off = prev->first;
+	K len = niter->first + niter->second.first - off;
+	niter++;
+	m.erase(prev, niter);
+	auto p = m.insert(
+	  std::make_pair(
+	    off,
+	    std::make_pair(len, std::move(n))));
+	ceph_assert(p.second);
+	niter = p.first;
+      }
+    }
+    auto next = niter;
+    next++;
+    if (next != m.end() &&
+	niter->first + niter->second.first == next->first &&
+	s.can_merge(niter->second.second, next->second.second)) {
+      V n = s.merge(
+	std::move(niter->second.second),
+	std::move(next->second.second));
+      K off = niter->first;
+      K len = next->first + next->second.first - off;
+      next++;
+      m.erase(niter, next);
+      auto p = m.insert(
+	std::make_pair(
+	  off,
+	  std::make_pair(len, std::move(n))));
+      ceph_assert(p.second);
+    }
+  }
+public:
+  interval_map() = default;
+  interval_map(std::initializer_list<typename map::value_type> l) {
+    for (auto& v : l) {
+      insert(v.first, v.second.first, v.second.second);
+    }
+  }
+
+  /**
+   * Return a new map holding the part of this map that overlaps
+   * [off, off + len).
+   *
+   * Entries that extend past either end of the range are trimmed with
+   * S::split.  An empty range (len == 0) yields an empty map.
+   */
+  interval_map intersect(K off, K len) const {
+    interval_map ret;
+    // An empty range overlaps nothing (erase() treats len == 0 the same
+    // way).  Without this guard an entry straddling off would be trimmed
+    // to length 0 and trip the len > 0 assertion in insert().
+    if (len == 0)
+      return ret;
+    auto limits = get_range(off, len);
+    for (auto i = limits.first; i != limits.second; ++i) {
+      K o = i->first;
+      K l = i->second.first;
+      V v = i->second.second;
+      if (o < off) {
+        // entry starts before the range: drop its leading part
+        V p = v;
+        l -= (off - o);
+        v = s.split(off - o, l, p);
+        o = off;
+      }
+      if ((o + l) > (off + len)) {
+        // entry ends after the range: drop its trailing part
+        V p = v;
+        l -= (o + l) - (off + len);
+        v = s.split(0, l, p);
+      }
+      ret.insert(o, l, v);
+    }
+    return ret;
+  }
+  void clear() {
+    m.clear();
+  }
+  /// Remove [off, off + len) from the map.  Entries that only partly
+  /// overlap the range keep their non-overlapping part(s), cut out with
+  /// S::split.  Does nothing when len == 0.
+  void erase(K off, K len) {
+    if (len == 0)
+      return;
+    auto range = get_range(off, len);
+    // surviving pieces, re-inserted once the overlapping entries are gone
+    std::vector<
+      std::pair<
+	K,
+	std::pair<K, V>
+	>> to_insert;
+    for (auto i = range.first; i != range.second; ++i) {
+      if (i->first < off) {
+	// entry starts before the range: keep [i->first, off)
+	to_insert.emplace_back(
+	  std::make_pair(
+	    i->first,
+	    std::make_pair(
+	      off - i->first,
+	      s.split(0, off - i->first, i->second.second))));
+      }
+      if ((off + len) < (i->first + i->second.first)) {
+	// entry ends after the range: keep [off + len, end of entry)
+	K nlen = (i->first + i->second.first) - (off + len);
+	to_insert.emplace_back(
+	  std::make_pair(
+	    off + len,
+	    std::make_pair(
+	      nlen,
+	      s.split(i->second.first - nlen, nlen, i->second.second))));
+      }
+    }
+    m.erase(range.first, range.second);
+    m.insert(to_insert.begin(), to_insert.end());
+  }
+  /// Map [off, off + len) to v, replacing whatever was mapped in that
+  /// range, then merge the new entry with its neighbours where possible.
+  /// Asserts that len > 0 and len == s.length(v).
+  void insert(K off, K len, V &&v) {
+    ceph_assert(len > 0);
+    ceph_assert(len == s.length(v));
+    erase(off, len);
+    // V is the class template parameter, not a deduced forwarding
+    // reference, so v is always an rvalue here: std::move says so directly.
+    auto p = m.insert(
+      std::make_pair(off, std::make_pair(len, std::move(v))));
+    ceph_assert(p.second);
+    try_merge(p.first);
+  }
+  void insert(interval_map &&other) {
+    for (auto i = other.m.begin();
+	 i != other.m.end();
+	 other.m.erase(i++)) {
+      insert(i->first, i->second.first, std::move(i->second.second));
+    }
+  }
+  /// Map [off, off + len) to a copy of v, replacing whatever was mapped
+  /// in that range, then merge the new entry with its neighbours where
+  /// possible.  Asserts that len > 0 and len == s.length(v).
+  void insert(K off, K len, const V &v) {
+    ceph_assert(len > 0);
+    ceph_assert(len == s.length(v));
+    erase(off, len);
+    // qualified like every other make_pair call in this class, rather
+    // than relying on argument-dependent lookup
+    auto p = m.insert(std::make_pair(off, std::make_pair(len, v)));
+    ceph_assert(p.second);
+    try_merge(p.first);
+  }
+  void insert(const interval_map &other) {
+    for (auto &&i: other) {
+      insert(i.get_off(), i.get_len(), i.get_val());
+    }
+  }
+  bool empty() const {
+    return m.empty();
+  }
+  interval_set<K> get_interval_set() const {
+    interval_set<K> ret;
+    for (auto &&i: *this) {
+      ret.insert(i.get_off(), i.get_len());
+    }
+    return ret;
+  }
+  class const_iterator {
+    cmapiter it;
+    const_iterator(cmapiter &&it) : it(std::move(it)) {}
+    const_iterator(const cmapiter &it) : it(it) {}
+
+    friend class interval_map;
+  public:
+    const_iterator(const const_iterator &) = default;
+    const_iterator &operator=(const const_iterator &) = default;
+
+    const_iterator &operator++() {
+      ++it;
+      return *this;
+    }
+    const_iterator operator++(int) {
+      return const_iterator(it++);
+    }
+    const_iterator &operator--() {
+      --it;
+      return *this;
+    }
+    const_iterator operator--(int) {
+      return const_iterator(it--);
+    }
+    bool operator==(const const_iterator &rhs) const {
+      return it == rhs.it;
+    }
+    bool operator!=(const const_iterator &rhs) const {
+      return it != rhs.it;
+    }
+    K get_off() const {
+      return it->first;
+    }
+    K get_len() const {
+      return it->second.first;
+    }
+    const V &get_val() const {
+      return it->second.second;
+    }
+    const_iterator &operator*() {
+      return *this;
+    }
+  };
+  const_iterator begin() const {
+    return const_iterator(m.begin());
+  }
+  const_iterator end() const {
+    return const_iterator(m.end());
+  }
+  std::pair<const_iterator, const_iterator> get_containing_range(
+    K off,
+    K len) const {
+    auto rng = get_range(off, len);
+    return std::make_pair(const_iterator(rng.first), const_iterator(rng.second));
+  }
+  unsigned ext_count() const {
+    return m.size();
+  }
+  bool operator==(const interval_map &rhs) const {
+    return m == rhs.m;
+  }
+
+  /// Write the map as "{off~len(vlen),off~len(vlen),...}", where vlen is
+  /// s.length() of the entry's value.
+  /// @return out
+  std::ostream &print(std::ostream &out) const {
+    out << "{";
+    const char *sep = "";  // becomes "," after the first entry
+    for (auto &&i: *this) {
+      out << sep;
+      out << i.get_off() << "~" << i.get_len() << "("
+          << s.length(i.get_val()) << ")";
+      sep = ",";
+    }
+    out << "}";
+    return out;
+  }
+};
+
+template <typename K, typename V, typename S>
+std::ostream &operator<<(std::ostream &out, const interval_map<K, V, S> &m) {
+  return m.print(out);
+}
+
+#endif
diff --git a/src/common/intrusive_lru.h b/src/common/intrusive_lru.h
new file mode 100644
index 000000000..e8c3cda3e
--- /dev/null
+++ b/src/common/intrusive_lru.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/intrusive/set.hpp>
+#include <boost/intrusive/list.hpp>
+
+namespace ceph::common {
+
+/**
+ * intrusive_lru: lru implementation with embedded map and list hook
+ *
+ * Note, this implementation currently is entirely thread-unsafe.
+ */
+
+template <typename K, typename V, typename VToK>
+struct intrusive_lru_config { // bundles the types intrusive_lru reads through Config::
+  using key_type = K; // key type used for lookups in intrusive_lru
+  using value_type = V; // element type; intrusive_lru static_casts its base class to this
+  using key_of_value = VToK; // functor type, invoked as VToK()(const V&), yielding the key
+};
+
+template <typename Config>
+class intrusive_lru;
+
+template <typename Config>
+class intrusive_lru_base;
+
+template <typename Config>
+void intrusive_ptr_add_ref(intrusive_lru_base<Config> *p);
+
+template <typename Config>
+void intrusive_ptr_release(intrusive_lru_base<Config> *p);
+
+
+template <typename Config>
+class intrusive_lru_base { // base carrying the refcount and container hooks of an lru element
+  unsigned use_count = 0; // adjusted by intrusive_ptr_add_ref() / intrusive_ptr_release()
+
+  // null if unreferenced
+  intrusive_lru<Config> *lru = nullptr;
+
+public:
+  boost::intrusive::set_member_hook<> set_hook; // hook used by intrusive_lru::lru_set
+  boost::intrusive::list_member_hook<> list_hook; // hook used by intrusive_lru::unreferenced_list
+
+  using Ref = boost::intrusive_ptr<typename Config::value_type>;
+  using lru_t = intrusive_lru<Config>;
+
+  friend intrusive_lru<Config>;
+  friend void intrusive_ptr_add_ref<>(intrusive_lru_base<Config> *);
+  friend void intrusive_ptr_release<>(intrusive_lru_base<Config> *);
+
+  virtual ~intrusive_lru_base() {} // virtual: intrusive_lru deletes elements through a base pointer
+};
+
+template <typename Config>
+class intrusive_lru { // keyed set of elements plus a list of the currently unreferenced ones
+  using base_t = intrusive_lru_base<Config>;
+  using K = typename Config::key_type;
+  using T = typename Config::value_type;
+  using TRef = typename base_t::Ref;
+
+  using lru_set_option_t = boost::intrusive::member_hook<
+    base_t,
+    boost::intrusive::set_member_hook<>,
+    &base_t::set_hook>;
+
+  using VToK = typename Config::key_of_value;
+  struct VToKWrapped { // adapts VToK (which takes a T) to the base_t stored in the set
+    using type = typename VToK::type;
+    const type &operator()(const base_t &obc) {
+      return VToK()(static_cast<const T&>(obc));
+    }
+  };
+  using lru_set_t = boost::intrusive::set<
+    base_t,
+    lru_set_option_t,
+    boost::intrusive::key_of_value<VToKWrapped>
+    >;
+  lru_set_t lru_set; // every element, referenced or not, ordered by key
+
+  using lru_list_t = boost::intrusive::list<
+    base_t,
+    boost::intrusive::member_hook<
+      base_t,
+      boost::intrusive::list_member_hook<>,
+      &base_t::list_hook>>;
+  lru_list_t unreferenced_list; // elements whose use_count dropped to zero, oldest at the front
+
+  size_t lru_target_size = 0; // evict() deletes unreferenced elements while lru_set is larger than this
+
+  void evict() { // delete unreferenced elements from the front while over the target size
+    while (!unreferenced_list.empty() &&
+	   lru_set.size() > lru_target_size) {
+      auto &b = unreferenced_list.front(); // unreferenced() appends at the back
+      assert(!b.lru);
+      unreferenced_list.pop_front();
+      lru_set.erase_and_dispose(
+	lru_set.iterator_to(b),
+	[](auto *p) { delete p; }
+      );
+    }
+  }
+
+  void access(base_t &b) { // mark b referenced again, unlinking it from unreferenced_list
+    if (b.lru) // already referenced: it is not on the list
+      return;
+    unreferenced_list.erase(lru_list_t::s_iterator_to(b));
+    b.lru = this;
+  }
+
+  void insert(base_t &b) { // add a new element in the referenced state, then trim
+    assert(!b.lru);
+    lru_set.insert(b);
+    b.lru = this;
+    evict();
+  }
+
+  void unreferenced(base_t &b) { // called by intrusive_ptr_release() when use_count hits zero
+    assert(b.lru);
+    unreferenced_list.push_back(b);
+    b.lru = nullptr;
+    evict(); // may delete b itself if the set is over target
+  }
+
+public:
+  /**
+   * Returns the TRef corresponding to k if it exists or
+   * creates it otherwise.  Return is:
+   * std::pair(reference_to_val, found)
+   */
+  std::pair<TRef, bool> get_or_create(const K &k) {
+    typename lru_set_t::insert_commit_data icd;
+    auto [iter, missing] = lru_set.insert_check(
+      k,
+      icd);
+    if (missing) {
+      auto ret = new T(k); // T must be constructible from the key
+      lru_set.insert_commit(*ret, icd);
+      insert(*ret); // NOTE(review): insert() calls lru_set.insert() again after insert_commit() - looks redundant, confirm
+      return {TRef(ret), false};
+    } else {
+      access(*iter);
+      return {TRef(static_cast<T*>(&*iter)), true};
+    }
+  }
+
+  /*
+   * Clears unreferenced elements from the lru set [from, to]
+   */
+  void clear_range(
+    const K& from,
+    const K& to) {
+      auto from_iter = lru_set.lower_bound(from);
+      auto to_iter = lru_set.upper_bound(to); // upper_bound makes the range inclusive of `to`
+      for (auto i = from_iter; i != to_iter; ) {
+        if (!(*i).lru) { // only unreferenced elements are removed
+          unreferenced_list.erase(lru_list_t::s_iterator_to(*i));
+          i = lru_set.erase_and_dispose(i, [](auto *p)
+            { delete p; } );
+        } else {
+          i++;
+        }
+      }
+  }
+
+  template <class F>
+  void for_each(F&& f) { // calls f with a TRef to every element in key order
+    for (auto& v : lru_set) {
+      access(v);
+      f(TRef{static_cast<T*>(&v)}); // NOTE(review): dropping this TRef can reach unreferenced()/evict() mid-iteration - confirm safe
+    }
+  }
+
+  /**
+   * Returns the TRef corresponding to k if it exists or
+   * nullptr otherwise.
+   */
+  TRef get(const K &k) {
+    if (auto iter = lru_set.find(k); iter != std::end(lru_set)) {
+      access(*iter);
+      return TRef(static_cast<T*>(&*iter));
+    } else {
+      return nullptr;
+    }
+  }
+
+  void set_target_size(size_t target_size) { // new target takes effect immediately via evict()
+    lru_target_size = target_size;
+    evict();
+  }
+
+  ~intrusive_lru() {
+    set_target_size(0); // deletes every unreferenced element; evict() never touches referenced ones
+  }
+
+  friend void intrusive_ptr_add_ref<>(intrusive_lru_base<Config> *);
+  friend void intrusive_ptr_release<>(intrusive_lru_base<Config> *);
+};
+
+template <typename Config>
+void intrusive_ptr_add_ref(intrusive_lru_base<Config> *p) { // boost::intrusive_ptr hook: take a reference
+  assert(p);
+  assert(p->lru); // the element must already be in the referenced state (see access()/insert())
+  p->use_count++;
+}
+
+template <typename Config>
+void intrusive_ptr_release(intrusive_lru_base<Config> *p) { // boost::intrusive_ptr hook: drop a reference
+  assert(p);
+  assert(p->use_count > 0);
+  --p->use_count;
+  if (p->use_count == 0) {
+    p->lru->unreferenced(*p); // last reference gone: hand the element back to its lru, which may delete it
+  }
+}
+
+
+}
diff --git a/src/common/ipaddr.cc b/src/common/ipaddr.cc
new file mode 100644
index 000000000..8c5da54b9
--- /dev/null
+++ b/src/common/ipaddr.cc
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#endif
+
+#include "include/ipaddr.h"
+#include "msg/msg_types.h"
+#include "common/pick_address.h"
+
+using std::string;
+
+void netmask_ipv4(const struct in_addr *addr, // writes addr masked to its first prefix_len bits into *out
+			 unsigned int prefix_len,
+			 struct in_addr *out) {
+  uint32_t mask;
+
+  if (prefix_len >= 32) {
+    // also handle 32 in this branch, because >>32 is not defined by
+    // the C standards
+    mask = ~uint32_t(0);
+  } else {
+    mask = htonl(~(~uint32_t(0) >> prefix_len)); // host-order mask converted with htonl before applying to s_addr
+  }
+  out->s_addr = addr->s_addr & mask;
+}
+
+bool matches_ipv4_in_subnet(const struct ifaddrs& addrs, // true if addrs' address lies in net/prefix_len
+			  const struct sockaddr_in* net,
+			  unsigned int prefix_len)
+{
+  if (addrs.ifa_addr == nullptr) // entry carries no address at all
+    return false;
+
+  if (addrs.ifa_addr->sa_family != net->sin_family) // address family differs from the network's
+      return false;
+  struct in_addr want;
+  netmask_ipv4(&net->sin_addr, prefix_len, &want); // masked network address
+  struct in_addr *cur = &((struct sockaddr_in*)addrs.ifa_addr)->sin_addr;
+  struct in_addr temp;
+  netmask_ipv4(cur, prefix_len, &temp); // masked interface address
+  return temp.s_addr == want.s_addr;
+}
+
+void netmask_ipv6(const struct in6_addr *addr, // writes addr masked to its first prefix_len bits into *out
+		  unsigned int prefix_len,
+		  struct in6_addr *out) {
+  if (prefix_len > 128) // clamp to the 128 bits (16 bytes) handled below
+    prefix_len = 128;
+
+  memcpy(out->s6_addr, addr->s6_addr, prefix_len/8); // whole bytes covered by the prefix
+  if (prefix_len < 128) // byte straddling the prefix boundary: keep only its leading prefix_len%8 bits
+    out->s6_addr[prefix_len/8] = addr->s6_addr[prefix_len/8] & ~( 0xFF >> (prefix_len % 8) );
+  if (prefix_len/8 < 15) // zero every byte after the boundary byte
+    memset(out->s6_addr+prefix_len/8+1, 0, 16-prefix_len/8-1);
+}
+
+bool matches_ipv6_in_subnet(const struct ifaddrs& addrs, // true if addrs' address lies in net/prefix_len
+			    const struct sockaddr_in6* net,
+			    unsigned int prefix_len)
+{
+  if (addrs.ifa_addr == nullptr) // entry carries no address at all
+    return false;
+
+  if (addrs.ifa_addr->sa_family != net->sin6_family) // address family differs from the network's
+    return false;
+  struct in6_addr want;
+  netmask_ipv6(&net->sin6_addr, prefix_len, &want); // masked network address
+  struct in6_addr temp;
+  struct in6_addr *cur = &((struct sockaddr_in6*)addrs.ifa_addr)->sin6_addr;
+  if (IN6_IS_ADDR_LINKLOCAL(cur)) // link-local addresses never match
+    return false;
+  netmask_ipv6(cur, prefix_len, &temp); // masked interface address
+  return IN6_ARE_ADDR_EQUAL(&temp, &want);
+}
+
+bool parse_network(const char *s, struct sockaddr_storage *network, unsigned int *prefix_len) { // parses "ADDR/LEN"; false on any error
+  char *slash = strchr((char*)s, '/'); // separates the address from the prefix length
+  if (!slash) {
+    // no slash
+    return false;
+  }
+  if (*(slash+1) == '\0') {
+    // slash is the last character
+    return false;
+  }
+
+  char *end;
+  long int num = strtol(slash+1, &end, 10);
+  if (*end != '\0') {
+    // junk after the prefix_len
+    return false;
+  }
+  if (num < 0 || static_cast<unsigned long>(num) > ~0U) { // negative, or too large to store without wrapping
+    return false;
+  }
+  *prefix_len = num; // note: written even if the address part below fails to parse
+
+  // copy the part before slash to get nil termination
+  char *addr = (char*)alloca(slash-s + 1);
+  strncpy(addr, s, slash-s);
+  addr[slash-s] = '\0';
+
+  // caller expects ports etc to be zero
+  memset(network, 0, sizeof(*network));
+
+  // try parsing as ipv4
+  int ok;
+  ok = inet_pton(AF_INET, addr, &((struct sockaddr_in*)network)->sin_addr);
+  if (ok) {
+    network->ss_family = AF_INET;
+    return true;
+  }
+
+  // try parsing as ipv6
+  ok = inet_pton(AF_INET6, addr, &((struct sockaddr_in6*)network)->sin6_addr);
+  if (ok) {
+    network->ss_family = AF_INET6;
+    return true;
+  }
+
+  return false; // neither an IPv4 nor an IPv6 literal
+}
+
+bool parse_network(const char *s, // entity_addr_t flavour of parse_network()
+		   entity_addr_t *network,
+		   unsigned int *prefix_len)
+{
+  sockaddr_storage ss;
+  bool ret = parse_network(s, &ss, prefix_len); // delegate to the sockaddr_storage overload
+  if (ret) { // *network is only touched on success
+    network->set_type(entity_addr_t::TYPE_LEGACY);
+    network->set_sockaddr((sockaddr *)&ss);
+  }
+  return ret;
+}
+
+bool network_contains( // true if addr falls inside network/prefix_len
+  const struct entity_addr_t& network,
+  unsigned int prefix_len,
+  const struct entity_addr_t& addr)
+{
+  if (addr.get_family() != network.get_family()) { // different families never match
+    return false;
+  }
+  switch (network.get_family()) {
+  case AF_INET:
+    {
+      struct in_addr a, b; // a: masked network, b: masked addr
+      netmask_ipv4(
+	&((const sockaddr_in*)network.get_sockaddr())->sin_addr, prefix_len, &a);
+      netmask_ipv4(
+	&((const sockaddr_in*)addr.get_sockaddr())->sin_addr, prefix_len, &b);
+      if (memcmp(&a, &b, sizeof(a)) == 0) {
+	return true;
+      }
+    }
+    break;
+  case AF_INET6:
+    {
+      struct in6_addr a, b; // a: masked network, b: masked addr
+      netmask_ipv6(
+	&((const sockaddr_in6*)network.get_sockaddr())->sin6_addr, prefix_len, &a);
+      netmask_ipv6(
+	&((const sockaddr_in6*)addr.get_sockaddr())->sin6_addr, prefix_len, &b);
+      if (memcmp(&a, &b, sizeof(a)) == 0) {
+	return true;
+      }
+    }
+    break;
+  }
+  return false; // no match, or a family other than AF_INET/AF_INET6
+}
diff --git a/src/common/iso_8601.cc b/src/common/iso_8601.cc
new file mode 100644
index 000000000..21601379c
--- /dev/null
+++ b/src/common/iso_8601.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iomanip>
+#include <sstream>
+
+#include "iso_8601.h"
+#include "include/timegm.h"
+#include "include/ceph_assert.h"
+
+namespace ceph {
+using std::chrono::duration_cast;
+using std::chrono::nanoseconds;
+using std::chrono::seconds;
+using std::setw;
+using std::size_t;
+using std::stringstream;
+using std::string;
+using std::uint16_t;
+
+using boost::none;
+using boost::optional;
+using std::string_view;
+
+using ceph::real_clock;
+using ceph::real_time;
+
+using sriter = string_view::const_iterator;
+
+namespace {
+// This assumes a contiguous block of numbers in the correct order.
+uint16_t digit(char c) { // numeric value of an ASCII digit character
+  if (!(c >= '0' && c <= '9')) {
+    throw std::invalid_argument("Not a digit."); // caught inside from_iso_8601()
+  }
+  return static_cast<uint16_t>(c - '0');
+}
+
+optional<real_time> calculate(const tm& t, uint32_t n = 0) { // broken-down time t plus n nanoseconds -> real_time
+  ceph_assert(n < 1000000000); // n must be less than one second
+  time_t tt = internal_timegm(&t);
+  if (tt == static_cast<time_t>(-1)) { // -1 is treated as a conversion failure
+    return none;
+  }
+
+  return boost::make_optional<real_time>(real_clock::from_time_t(tt)
+                                         + nanoseconds(n));
+}
+}
+
+optional<real_time> from_iso_8601(const string_view s, // parse s; none on any malformed input
+				  const bool ws_terminates) noexcept {
+  auto end = s.cend();
+  auto read_digit = [end](sriter& c) mutable { // consume one digit or throw invalid_argument
+    if (c == end) {
+      throw std::invalid_argument("End of input.");
+    }
+    auto f = digit(*c);
+    ++c;
+    return f;
+  };
+
+  auto read_digits = [&read_digit](sriter& c, std::size_t n) { // consume exactly n digits as a decimal number
+    auto v = 0ULL;
+    for (auto i = 0U; i < n; ++i) {
+      auto d = read_digit(c);
+      v = (10ULL * v) + d;
+    }
+    return v;
+  };
+  // isspace() is undefined for negative char values, hence the unsigned char casts below
+  auto partial_date = [end, ws_terminates](sriter& c) {
+    return (c == end || (ws_terminates && std::isspace(static_cast<unsigned char>(*c))));
+  };
+  auto time_end = [end, ws_terminates](sriter& c) { // 'Z' followed by end of input (or whitespace)
+    return (c != end && *c == 'Z' &&
+	    ((c + 1) == end ||
+	     (ws_terminates && std::isspace(static_cast<unsigned char>(*(c + 1))))));
+  };
+  auto consume_delimiter = [end](sriter& c, char q) { // require and skip the separator q
+    if (c == end || *c != q) {
+      throw std::invalid_argument("Expected delimiter not found.");
+    } else {
+      ++c;
+    }
+  };
+
+  tm t = { 0, // tm_sec
+	   0, // tm_min
+	   0, // tm_hour
+	   1, // tm_mday
+	   0, // tm_mon
+	   70, // tm_year
+	   0, // tm_wday
+	   0, // tm_yday
+	   0, // tm_isdst
+  };
+  try {
+    auto c = s.cbegin();
+    {
+      auto y = read_digits(c, 4);
+      if (y < 1970) { // years before 1970 are rejected
+	return none;
+      }
+      t.tm_year = y - 1900;
+    }
+    if (partial_date(c)) { // input stops after the year: remaining fields keep their defaults
+      return calculate(t, 0);
+    }
+
+    consume_delimiter(c, '-');
+    t.tm_mon = (read_digits(c, 2) - 1); // tm_mon is zero-based
+    if (partial_date(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, '-');
+    t.tm_mday = read_digits(c, 2);
+    if (partial_date(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, 'T');
+    t.tm_hour = read_digits(c, 2);
+    if (time_end(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, ':');
+    t.tm_min = read_digits(c, 2);
+    if (time_end(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, ':');
+    t.tm_sec = read_digits(c, 2);
+    if (time_end(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, '.');
+
+    auto n = 0UL; // accumulated nanoseconds
+    auto multiplier = 100000000UL; // weight of the first fractional digit
+    for (auto i = 0U; i < 9U; ++i) { // between one and nine fractional digits
+      auto d = read_digit(c);
+      n += d * multiplier;
+      multiplier /= 10;
+      if (time_end(c)) {
+	return calculate(t, n);
+      }
+    }
+  } catch (const std::invalid_argument& e) {
+    // fallthrough
+  }
+  return none;
+}
+
+string to_iso_8601(const real_time t, // formats t (UTC, via gmtime_r) down to the precision selected by f
+		   const iso_8601_format f,
+                   std::string_view date_separator,
+                   std::string_view time_separator) noexcept {
+  ceph_assert(f >= iso_8601_format::Y &&
+	      f <= iso_8601_format::YMDhmsn);
+  stringstream out(std::ios_base::out);
+
+  auto sec = real_clock::to_time_t(t);
+  auto nsec = duration_cast<nanoseconds>(t.time_since_epoch() %
+					 seconds(1)).count(); // sub-second remainder in nanoseconds
+
+  struct tm bt;
+  gmtime_r(&sec, &bt);
+  out.fill('0'); // setw() fields below are zero-padded
+
+  out << 1900 + bt.tm_year;
+  if (f == iso_8601_format::Y) {
+    return out.str();
+  }
+
+  out << date_separator << setw(2) << bt.tm_mon + 1; // tm_mon is zero-based
+  if (f == iso_8601_format::YM) {
+    return out.str();
+  }
+
+  out << date_separator << setw(2) << bt.tm_mday;
+  if (f == iso_8601_format::YMD) {
+    return out.str();
+  }
+
+  out << 'T' << setw(2) << bt.tm_hour; // from here on every format ends with 'Z'
+  if (f == iso_8601_format::YMDh) {
+    out << 'Z';
+    return out.str();
+  }
+
+  out << time_separator << setw(2) << bt.tm_min;
+  if (f == iso_8601_format::YMDhm) {
+    out << 'Z';
+    return out.str();
+  }
+
+  out << time_separator << setw(2) << bt.tm_sec;
+  if (f == iso_8601_format::YMDhms) {
+    out << 'Z';
+    return out.str();
+  }
+  out << '.' << setw(9) << nsec << 'Z'; // YMDhmsn: nine-digit nanosecond field
+  return out.str();
+}
+
+}
diff --git a/src/common/iso_8601.h b/src/common/iso_8601.h
new file mode 100644
index 000000000..982148343
--- /dev/null
+++ b/src/common/iso_8601.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_ISO_8601_H
+#define CEPH_COMMON_ISO_8601_H
+
+#include <string_view>
+#include <boost/optional.hpp>
+
+#include "common/ceph_time.h"
+
+namespace ceph {
+
+// Here, we support the W3C profile of ISO 8601 with the following
+// restrictions:
+// -   Subsecond resolution is supported to nanosecond
+//     granularity. Any number of digits between 1 and 9 may be
+//     specified after the decimal point.
+// -   All times must be UTC.
+// -   All times must be representable as a sixty-four bit count of
+//     nanoseconds since the epoch.
+// -   Partial times are handled thus:
+//     *    If there are no subseconds, they are assumed to be zero.
+//     *    If there are no seconds, they are assumed to be zero.
+//     *    If there are no minutes, they are assumed to be zero.
+//     *    If there is no time, it is assumed to be midnight.
+//     *    If there is no day, it is assumed to be the first.
+//     *    If there is no month, it is assumed to be January.
+//
+// If a date is invalid, boost::none is returned.
+
+boost::optional<ceph::real_time> from_iso_8601(
+  std::string_view s, const bool ws_terminates = true) noexcept;
+
+enum class iso_8601_format {
+  Y, YM, YMD, YMDh, YMDhm, YMDhms, YMDhmsn
+};
+
+std::string to_iso_8601(const ceph::real_time t,
+			const iso_8601_format f = iso_8601_format::YMDhmsn,
+                        std::string_view date_separator = "-",
+                        std::string_view time_separator = ":")
+  noexcept;
+
+static inline std::string to_iso_8601_no_separators(const ceph::real_time t, // to_iso_8601() with empty separators
+                                                    const iso_8601_format f = iso_8601_format::YMDhmsn)
+  noexcept {
+    return to_iso_8601(t, f, "", ""); // empty date and time separators; 'T', '.' and 'Z' are still emitted
+  }
+}
+
+#endif
diff --git a/src/common/item_history.h b/src/common/item_history.h
new file mode 100644
index 000000000..87512a28c
--- /dev/null
+++ b/src/common/item_history.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <list>
+#include <mutex>
+
+/*
+
+Keep a history of item values so that readers can dereference the pointer to
+the latest value and continue using it as long as they want.  This container
+is only appropriate for values that are updated a handful of times over their
+total lifetime.
+
+*/
+
+template<class T>
+class safe_item_history { // append-only list of values plus a pointer to the newest one
+private:
+  std::mutex lock; // taken by operator= only
+  std::list<T> history; // values are only ever appended, so earlier elements stay at fixed addresses
+  T *current = nullptr; // points at history.back()
+
+public:
+  safe_item_history() { // starts with one default-constructed value
+    history.emplace_back(T());
+    current = &history.back();
+  }
+
+  // readers are lock-free
+  const T& operator*() const { // NOTE(review): `current` is a plain pointer read without the lock - confirm this is acceptable
+    return *current;
+  }
+  const T *operator->() const {
+    return current;
+  }
+
+  // writes are serialized
+  const T& operator=(const T& other) { // appends a copy of other and makes it the current value
+    std::lock_guard l(lock);
+    history.push_back(other);
+    current = &history.back();
+    return *current;
+  }
+
+};
diff --git a/src/common/likely.h b/src/common/likely.h
new file mode 100644
index 000000000..abaf2d2e2
--- /dev/null
+++ b/src/common/likely.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIKELY_DOT_H
+#define CEPH_LIKELY_DOT_H
+
+/*
+ * Likely / Unlikely macros
+ */
+#ifndef likely
+#define likely(x)       __builtin_expect((x),1)
+#endif
+#ifndef unlikely
+#define unlikely(x)     __builtin_expect((x),0)
+#endif
+#ifndef expect
+#define expect(x, hint) __builtin_expect((x),(hint))
+#endif
+
+#endif
diff --git a/src/common/linux_version.c b/src/common/linux_version.c
new file mode 100644
index 000000000..b83dc71e4
--- /dev/null
+++ b/src/common/linux_version.c
@@ -0,0 +1,25 @@
+#include "common/linux_version.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/utsname.h>
+
+int get_linux_version(void) /* running kernel as KERNEL_VERSION(a,b,c), or 0 if unknown */
+{
+	struct utsname ubuf;
+	int a, b, c;
+	int n;
+
+	if (uname(&ubuf) || strcmp(ubuf.sysname, "Linux")) /* uname failed or sysname is not "Linux" */
+		return 0;
+
+	n = sscanf(ubuf.release, "%d.%d.%d", &a, &b, &c); /* n = number of fields parsed */
+	switch (n) {
+	case 3:
+		return KERNEL_VERSION(a, b, c);
+	case 2: /* release string has only "a.b": third component taken as 0 */
+		return KERNEL_VERSION(a, b, 0);
+	default:
+		return 0;
+	}
+}
diff --git a/src/common/linux_version.h b/src/common/linux_version.h
new file mode 100644
index 000000000..5588c55ba
--- /dev/null
+++ b/src/common/linux_version.h
@@ -0,0 +1,22 @@
+#ifndef CEPH_LINUX_VERSION_H
+#define CEPH_LINUX_VERSION_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_LINUX_VERSION_H
+# include <linux/version.h>
+#endif
+
+#ifndef KERNEL_VERSION
+# define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
+#endif
+
+int get_linux_version(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LINUX_VERSION_H */
diff --git a/src/common/lockdep.cc b/src/common/lockdep.cc
new file mode 100644
index 000000000..aa7d9e0f0
--- /dev/null
+++ b/src/common/lockdep.cc
@@ -0,0 +1,400 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "lockdep.h"
+#include <bitset>
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/valgrind.h"
+
+/******* Constants **********/
+#define lockdep_dout(v) lsubdout(g_lockdep_ceph_ctx, lockdep, v)
+#define BACKTRACE_SKIP 2
+
+/******* Globals **********/
+bool g_lockdep;
+struct lockdep_stopper_t { // its only job is the destructor; see the static lockdep_stopper instance
+  // disable lockdep when this module destructs.
+  ~lockdep_stopper_t() {
+    g_lockdep = 0;
+  }
+};
+
+
+static pthread_mutex_t lockdep_mutex = PTHREAD_MUTEX_INITIALIZER;
+static CephContext *g_lockdep_ceph_ctx = NULL;
+static lockdep_stopper_t lockdep_stopper;
+static ceph::unordered_map<std::string, int> lock_ids;
+static std::map<int, std::string> lock_names;
+static std::map<int, int> lock_refs;
+static constexpr size_t MAX_LOCKS = 128 * 1024;   // increase me as needed
+static std::bitset<MAX_LOCKS> free_ids; // bit set = free
+static ceph::unordered_map<pthread_t, std::map<int,ceph::BackTrace*> > held;
+static constexpr size_t NR_LOCKS = 4096; // the initial number of locks
+static std::vector<std::bitset<MAX_LOCKS>> follows(NR_LOCKS); // follows[a][b] means b taken after a
+static std::vector<std::map<int,ceph::BackTrace *>> follows_bt(NR_LOCKS);
+// upper bound of lock id
+unsigned current_maxid;
+int last_freed_id = -1;
+static bool free_ids_inited;
+
+static bool lockdep_force_backtrace() // true if a context is registered and its lockdep_force_backtrace option is set
+{
+  return (g_lockdep_ceph_ctx != NULL &&
+          g_lockdep_ceph_ctx->_conf->lockdep_force_backtrace);
+}
+
+/******* Functions **********/
+void lockdep_register_ceph_context(CephContext *cct) // enables lockdep using cct; no-op if a context is already registered
+{
+  static_assert((MAX_LOCKS > 0) && (MAX_LOCKS % 8 == 0),                   
+    "lockdep's MAX_LOCKS needs to be divisible by 8 to operate correctly.");
+  pthread_mutex_lock(&lockdep_mutex);
+  if (g_lockdep_ceph_ctx == NULL) { // only the first registered context is used
+    ANNOTATE_BENIGN_RACE_SIZED(&g_lockdep_ceph_ctx, sizeof(g_lockdep_ceph_ctx),
+                               "lockdep cct");
+    ANNOTATE_BENIGN_RACE_SIZED(&g_lockdep, sizeof(g_lockdep),
+                               "lockdep enabled");
+    g_lockdep = true;
+    g_lockdep_ceph_ctx = cct;
+    lockdep_dout(1) << "lockdep start" << dendl;
+    if (!free_ids_inited) { // one-time initialization, kept across re-registration
+      free_ids_inited = true;
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      free_ids.set(); // a set bit means the id is free, so initially every id is free
+    }
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+}
+
+void lockdep_unregister_ceph_context(CephContext *cct) // disables lockdep if cct is the registered context
+{
+  pthread_mutex_lock(&lockdep_mutex);
+  if (cct == g_lockdep_ceph_ctx) { // any other context is ignored
+    lockdep_dout(1) << "lockdep stop" << dendl;
+    // this cct is going away; shut it down!
+    g_lockdep = false;
+    g_lockdep_ceph_ctx = NULL;
+
+    // blow away all of our state, too, in case it starts up again.
+    for (unsigned i = 0; i < current_maxid; ++i) {
+      // free only the backtraces actually recorded; indexing follows_bt[i][j]
+      // for every j would insert a null map entry per (i, j) pair.
+      for (auto& recorded : follows_bt[i]) delete recorded.second;
+    }
+
+    held.clear();
+    lock_names.clear();
+    lock_ids.clear();
+    std::for_each(follows.begin(), std::next(follows.begin(), current_maxid),
+                  [](auto& follow) { follow.reset(); });
+    std::for_each(follows_bt.begin(), std::next(follows_bt.begin(), current_maxid),
+                  [](auto& follow_bt) { follow_bt = {}; }); // drops the now-dangling pointers
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+}
+
+int lockdep_dump_locks() // logs every held lock, grouped by thread; always returns 0
+{
+  pthread_mutex_lock(&lockdep_mutex);
+  if (!g_lockdep) // nothing to report while lockdep is disabled
+    goto out;
+
+  for (auto p = held.begin(); p != held.end(); ++p) { // p->first: thread, p->second: its held locks
+    lockdep_dout(0) << "--- thread " << p->first << " ---" << dendl;
+    for (auto q = p->second.begin();
+	 q != p->second.end();
+	 ++q) {
+      lockdep_dout(0) << "  * " << lock_names[q->first] << "\n";
+      if (q->second) // a backtrace is stored only when one was captured at lock time
+	*_dout << *(q->second);
+      *_dout << dendl;
+    }
+  }
+out:
+  pthread_mutex_unlock(&lockdep_mutex);
+  return 0;
+}
+
+int lockdep_get_free_id(void) // claims a free lock id and returns it, or -1 if none is left
+{
+  // if there's id known to be freed lately, reuse it
+  if (last_freed_id >= 0 &&
+      free_ids.test(last_freed_id)) {
+    int tmp = last_freed_id;
+    last_freed_id = -1;
+    free_ids.reset(tmp); // a cleared bit marks the id as in use
+    lockdep_dout(1) << "lockdep reusing last freed id " << tmp << dendl;
+    return tmp;
+  }
+  
+  // otherwise scan the bitset from the start and claim the first
+  // free (set) bit.
+  for (size_t i = 0; i < free_ids.size(); ++i) {
+    if (free_ids.test(i)) {
+      free_ids.reset(i);
+      return i;
+    }
+  }
+  
+  // not found
+  lockdep_dout(0) << "failing miserably..." << dendl;
+  return -1;
+}
+
+static int _lockdep_register(const char *name) // id for name (allocated on first use); -1 if lockdep is off
+{
+  int id = -1;
+
+  if (!g_lockdep)
+    return id;
+  ceph::unordered_map<std::string, int>::iterator p = lock_ids.find(name);
+  if (p == lock_ids.end()) { // first time this name is seen
+    id = lockdep_get_free_id();
+    if (id < 0) { // no free id left: list the registered locks and abort
+      lockdep_dout(0) << "ERROR OUT OF IDS .. have 0"
+		      << " max " << MAX_LOCKS << dendl;
+      for (auto& p : lock_names) {
+	lockdep_dout(0) << "  lock " << p.first << " " << p.second << dendl;
+      }
+      ceph_abort();
+    }
+    if (current_maxid <= (unsigned)id) { // keep current_maxid one past the largest id handed out
+      current_maxid = (unsigned)id + 1;
+      if (current_maxid == follows.size()) { // grow the dependency tables one slot at a time
+        follows.resize(current_maxid + 1);
+        follows_bt.resize(current_maxid + 1);
+      }
+    }
+    lock_ids[name] = id;
+    lock_names[id] = name;
+    lockdep_dout(10) << "registered '" << name << "' as " << id << dendl;
+  } else {
+    id = p->second;
+    lockdep_dout(20) << "had '" << name << "' as " << id << dendl;
+  }
+
+  ++lock_refs[id]; // one reference per registration; dropped in lockdep_unregister()
+
+  return id;
+}
+
+// Locked wrapper: runs _lockdep_register() while holding lockdep_mutex.
+int lockdep_register(const char *name)
+{
+  pthread_mutex_lock(&lockdep_mutex);
+  // the id is captured before unlocking and returned afterwards
+  int registered_id = _lockdep_register(name);
+  pthread_mutex_unlock(&lockdep_mutex);
+  return registered_id;
+}
+
+void lockdep_unregister(int id) // drops one reference on id; the last one frees the id and its edges
+{
+  if (id < 0) { // negative ids were never registered
+    return;
+  }
+
+  pthread_mutex_lock(&lockdep_mutex);
+
+  std::string name;
+  auto p = lock_names.find(id);
+  if (p == lock_names.end())
+    name = "unknown" ;
+  else
+    name = p->second;
+
+  int &refs = lock_refs[id];
+  if (--refs == 0) {
+    if (p != lock_names.end()) {
+      // reset dependency ordering
+      follows[id].reset();
+      for (unsigned i=0; i<current_maxid; ++i) { // note: operator[] inserts a null entry where none existed
+        delete follows_bt[id][i];
+        follows_bt[id][i] = NULL;
+
+        delete follows_bt[i][id];
+        follows_bt[i][id] = NULL;
+        follows[i].reset(id);
+      }
+
+      lockdep_dout(10) << "unregistered '" << name << "' from " << id << dendl;
+      lock_ids.erase(p->second);
+      lock_names.erase(id);
+    }
+    lock_refs.erase(id);
+    free_ids.set(id); // a set bit marks the id as free again
+    last_freed_id = id; // lockdep_get_free_id() tries this id first
+  } else if (g_lockdep) {
+    lockdep_dout(20) << "have " << refs << " of '" << name << "' " <<
+			"from " << id << dendl;
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+}
+
+
+// does b follow a?
+static bool does_follow(int a, int b) // searches the follows graph for a path a -> b, logging it when found
+{
+  if (follows[a].test(b)) { // direct edge a -> b
+    lockdep_dout(0) << "\n";
+    *_dout << "------------------------------------" << "\n";
+    *_dout << "existing dependency " << lock_names[a] << " (" << a << ") -> "
+           << lock_names[b] << " (" << b << ") at:\n";
+    if (follows_bt[a][b]) {
+      follows_bt[a][b]->print(*_dout);
+    }
+    *_dout << dendl;
+    return true;
+  }
+
+  for (unsigned i=0; i<current_maxid; i++) { // otherwise recurse through every lock i taken after a
+    if (follows[a].test(i) &&
+	does_follow(i, b)) {
+      lockdep_dout(0) << "existing intermediate dependency " << lock_names[a]
+          << " (" << a << ") -> " << lock_names[i] << " (" << i << ") at:\n";
+      if (follows_bt[a][i]) {
+        follows_bt[a][i]->print(*_dout);
+      }
+      *_dout << dendl;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+int lockdep_will_lock(const char *name, int id, bool force_backtrace, // checks ordering before a lock is taken; returns the id
+		      bool recursive)
+{
+  pthread_t p = pthread_self();
+
+  pthread_mutex_lock(&lockdep_mutex);
+  if (!g_lockdep) {
+    pthread_mutex_unlock(&lockdep_mutex);
+    return id;
+  }
+
+  if (id < 0) // not registered yet: register by name now
+    id = _lockdep_register(name);
+
+  lockdep_dout(20) << "_will_lock " << name << " (" << id << ")" << dendl;
+
+  // check dependency graph
+  auto& m = held[p]; // locks currently held by this thread
+  for (auto p = m.begin(); p != m.end(); ++p) {
+    if (p->first == id) { // this thread already holds the same lock
+      if (!recursive) {
+	lockdep_dout(0) << "\n";
+	*_dout << "recursive lock of " << name << " (" << id << ")\n";
+	auto bt = new ceph::ClibBackTrace(BACKTRACE_SKIP);
+	bt->print(*_dout);
+	if (p->second) {
+	  *_dout << "\npreviously locked at\n";
+	  p->second->print(*_dout);
+	}
+	delete bt;
+	*_dout << dendl;
+	ceph_abort();
+      }
+    } else if (!follows[p->first].test(id)) {
+      // new dependency
+
+      // did we just create a cycle?
+      if (does_follow(id, p->first)) {
+        auto bt = new ceph::ClibBackTrace(BACKTRACE_SKIP);
+	lockdep_dout(0) << "new dependency " << lock_names[p->first]
+		<< " (" << p->first << ") -> " << name << " (" << id << ")"
+		<< " creates a cycle at\n";
+	bt->print(*_dout);
+	*_dout << dendl;
+
+	lockdep_dout(0) << "btw, i am holding these locks:" << dendl;
+	for (auto q = m.begin(); q != m.end(); ++q) {
+	  lockdep_dout(0) << "  " << lock_names[q->first] << " (" << q->first << ")" << dendl;
+	  if (q->second) {
+	    lockdep_dout(0) << " ";
+	    q->second->print(*_dout);
+	    *_dout << dendl;
+	  }
+	}
+
+	lockdep_dout(0) << "\n" << dendl;
+
+	// don't add this dependency, or we'll get a cycle in the graph, and
+	// does_follow() won't terminate.
+
+	ceph_abort();  // actually, we should just die here.
+      } else {
+	ceph::BackTrace* bt = NULL; // stays NULL unless backtraces are forced
+        if (force_backtrace || lockdep_force_backtrace()) {
+          bt = new ceph::ClibBackTrace(BACKTRACE_SKIP);
+        }
+        follows[p->first].set(id); // record: id is taken after p->first
+        follows_bt[p->first][id] = bt;
+	lockdep_dout(10) << lock_names[p->first] << " -> " << name << " at" << dendl;
+	//bt->print(*_dout);
+      }
+    }
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+  return id;
+}
+
+int lockdep_locked(const char *name, int id, bool force_backtrace) // records that this thread now holds the lock
+{
+  pthread_t p = pthread_self();
+
+  pthread_mutex_lock(&lockdep_mutex);
+  if (!g_lockdep)
+    goto out;
+  if (id < 0) // not registered yet: register by name now
+    id = _lockdep_register(name);
+
+  lockdep_dout(20) << "_locked " << name << dendl;
+  if (force_backtrace || lockdep_force_backtrace())
+    held[p][id] = new ceph::ClibBackTrace(BACKTRACE_SKIP); // freed in lockdep_will_unlock()
+  else
+    held[p][id] = 0; // held, but no backtrace captured
+out:
+  pthread_mutex_unlock(&lockdep_mutex);
+  return id;
+}
+
+int lockdep_will_unlock(const char *name, int id) // forgets that this thread holds the lock; returns id
+{
+  pthread_t p = pthread_self();
+
+  if (id < 0) { // never registered, so nothing was recorded for it
+    //id = lockdep_register(name);
+    ceph_assert(id == -1);
+    return id;
+  }
+
+  pthread_mutex_lock(&lockdep_mutex);
+  if (!g_lockdep)
+    goto out;
+  lockdep_dout(20) << "_will_unlock " << name << dendl;
+
+  // don't assert.. lockdep may be enabled at any point in time
+  //assert(held.count(p));
+  //assert(held[p].count(id));
+
+  delete held[p][id]; // backtrace captured in lockdep_locked(), or null
+  held[p].erase(id);
+out:
+  pthread_mutex_unlock(&lockdep_mutex);
+  return id;
+}
+
+
diff --git a/src/common/lockdep.h b/src/common/lockdep.h
new file mode 100644
index 000000000..f2376d0b1
--- /dev/null
+++ b/src/common/lockdep.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LOCKDEP_H
+#define CEPH_LOCKDEP_H
+
+#include "include/common_fwd.h"
+
+#ifdef CEPH_DEBUG_MUTEX
+
+extern bool g_lockdep;
+
+extern void lockdep_register_ceph_context(CephContext *cct);
+extern void lockdep_unregister_ceph_context(CephContext *cct);
+// lockdep tracks dependencies between multiple and different instances
+// of locks within a class denoted by `n`.
+// Caller is obliged to guarantee name uniqueness.
+extern int lockdep_register(const char *n);
+extern void lockdep_unregister(int id);
+extern int lockdep_will_lock(const char *n, int id, bool force_backtrace=false,
+			     bool recursive=false);
+extern int lockdep_locked(const char *n, int id, bool force_backtrace=false);
+extern int lockdep_will_unlock(const char *n, int id);
+extern int lockdep_dump_locks();
+
+#else
+
+static constexpr bool g_lockdep = false;
+#define lockdep_register(...) 0
+#define lockdep_unregister(...)
+#define lockdep_will_lock(...) 0
+#define lockdep_locked(...) 0
+#define lockdep_will_unlock(...) 0
+
+#endif	// CEPH_DEBUG_MUTEX
+
+
+#endif
diff --git a/src/common/lru_map.h b/src/common/lru_map.h
new file mode 100644
index 000000000..4c1c2dadb
--- /dev/null
+++ b/src/common/lru_map.h
@@ -0,0 +1,132 @@
+#ifndef CEPH_LRU_MAP_H
+#define CEPH_LRU_MAP_H
+
+#include "common/ceph_mutex.h"
+
+template <class K, class V>
+class lru_map {  // size-bounded map that evicts its least recently used key
+  struct entry {
+    V value;
+    typename std::list<K>::iterator lru_iter;  // this key's node in entries_lru
+  };
+
+  std::map<K, entry> entries;  // key -> value and recency position
+  std::list<K> entries_lru;    // keys, most recently used first
+
+  ceph::mutex lock = ceph::make_mutex("lru_map::lock");  // taken by find/find_and_update/add/erase
+
+  size_t max;  // eviction threshold: _add() trims until entries.size() <= max
+
+public:
+  class UpdateContext {  // callback that _find() applies to a value it found
+    public:
+      virtual ~UpdateContext() {}
+
+      /* update should return true if object is updated */
+      virtual bool update(V *v) = 0;
+  };
+  // Unlocked variants: they do not take `lock` themselves.
+  bool _find(const K& key, V *value, UpdateContext *ctx);
+  void _add(const K& key, V& value);
+
+public:
+  lru_map(int _max) : max(_max) {}  // _max is converted to size_t as-is
+  virtual ~lru_map() {}
+  // Copies the value stored for `key` into `value`; returns false when absent.
+  bool find(const K& key, V& value);
+
+  /*
+   * find_and_update()
+   *
+   * - will return true if object is found
+   * - if ctx is set will return true if object is found and updated
+   */
+  bool find_and_update(const K& key, V *value, UpdateContext *ctx);
+  void add(const K& key, V& value);   // insert or overwrite, then evict down to max
+  void erase(const K& key);           // no-op when key is absent
+};
+
+// Unlocked lookup.  On a hit the key becomes the most recently used one,
+// `ctx` (if any) is run on the stored value and the value is copied to
+// `*value` (if non-null).  Returns false on a miss, ctx's verdict when a
+// ctx was given, true otherwise.
+template <class K, class V>
+bool lru_map<K, V>::_find(const K& key, V *value, UpdateContext *ctx)
+{
+  auto found = entries.find(key);
+  if (found == entries.end())
+    return false;
+
+  entry& hit = found->second;
+  entries_lru.erase(hit.lru_iter);
+
+  // the update runs before the copy-out so the caller sees the new value
+  const bool updated = ctx ? ctx->update(&hit.value) : true;
+  if (value)
+    *value = hit.value;
+
+  entries_lru.push_front(key);
+  hit.lru_iter = entries_lru.begin();
+
+  return updated;
+}
+
+template <class K, class V>
+bool lru_map<K, V>::find(const K& key, V& value)
+{
+  std::lock_guard guard(lock);  // locked wrapper around _find(), no update callback
+  return _find(key, &value, nullptr);
+}
+
+template <class K, class V>
+bool lru_map<K, V>::find_and_update(const K& key, V *value, UpdateContext *ctx)
+{
+  std::lock_guard guard(lock);  // locked wrapper: the result is _find()'s
+  return _find(key, value, ctx);
+}
+
+// Unlocked insert-or-overwrite.  `key` becomes the most recently used entry,
+// then least recently used entries are dropped until at most `max` remain.
+template <class K, class V>
+void lru_map<K, V>::_add(const K& key, V& value)
+{
+  // an existing key gives up its old recency slot before being re-queued
+  auto existing = entries.find(key);
+  if (existing != entries.end())
+    entries_lru.erase(existing->second.lru_iter);
+
+  entries_lru.push_front(key);
+  entry& slot = entries[key];
+  slot.value = value;
+  slot.lru_iter = entries_lru.begin();
+
+  while (entries.size() > max) {
+    // the tail of entries_lru is the oldest key; both containers hold the
+    // same key set, so the lookup is expected to succeed
+    entries.erase(entries.find(entries_lru.back()));
+    entries_lru.pop_back();
+  }
+}
+
+
+template <class K, class V>
+void lru_map<K, V>::add(const K& key, V& value)
+{
+  std::lock_guard guard(lock);  // locked wrapper around _add()
+  _add(key, value);
+}
+
+// Remove `key` from the map and from the recency list; absent keys are ignored.
+template <class K, class V>
+void lru_map<K, V>::erase(const K& key)
+{
+  std::lock_guard guard(lock);
+  auto victim = entries.find(key);
+  if (victim != entries.end()) {
+    // unlink from the recency list first, while the stored iterator is valid
+    entries_lru.erase(victim->second.lru_iter);
+    entries.erase(victim);
+  }
+}
+
+#endif
diff --git a/src/common/mClockPriorityQueue.h b/src/common/mClockPriorityQueue.h
new file mode 100644
index 000000000..c1f9f3c25
--- /dev/null
+++ b/src/common/mClockPriorityQueue.h
@@ -0,0 +1,369 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+
+#include <functional>
+#include <map>
+#include <list>
+#include <cmath>
+
+#include "common/Formatter.h"
+#include "common/OpQueue.h"
+
+#include "dmclock/src/dmclock_server.h"
+
+// the following is done to unclobber _ASSERT_H so it returns to the
+// way ceph likes it
+#include "include/ceph_assert.h"
+
+
+namespace ceph {
+
+  namespace dmc = crimson::dmclock;
+
+  template <typename T, typename K>
+  class mClockQueue : public OpQueue <T, K> {
+    // dequeue() serves high_queue first, then queue_front, then the dmclock queue.
+    using priority_t = unsigned;
+    using cost_t = unsigned;
+
+    typedef std::list<std::pair<cost_t, T> > ListPairs;
+    // Erase every element of *l for which f returns true, visiting back to front.
+    static void filter_list_pairs(ListPairs *l,
+				  std::function<bool (T&&)> f) {
+      for (typename ListPairs::iterator i = l->end();
+	   i != l->begin();
+	   /* no inc */
+	) {
+	auto next = i;
+	--next;
+	if (f(std::move(next->second))) {
+	  l->erase(next);
+	} else {
+	  i = next;
+	}
+      }
+    }
+
+    struct SubQueue {  // one strict priority level: per-class FIFO lists, served one item per class in turn
+    private:
+      typedef std::map<K, ListPairs> Classes;
+      // client-class to ordered queue
+      Classes q;
+
+      unsigned tokens, max_tokens;  // put_tokens() caps tokens at max_tokens
+
+      typename Classes::iterator cur;  // class that front()/pop_front() operate on
+
+    public:
+
+      SubQueue(const SubQueue &other)
+	: q(other.q),
+	  tokens(other.tokens),
+	  max_tokens(other.max_tokens),
+	  cur(q.begin()) {}  // restart at the first class of our own copy of q
+
+      SubQueue()
+	: tokens(0),
+	  max_tokens(0),
+	  cur(q.begin()) {}
+
+      void set_max_tokens(unsigned mt) {
+	max_tokens = mt;
+      }
+
+      unsigned get_max_tokens() const {
+	return max_tokens;
+      }
+
+      unsigned num_tokens() const {
+	return tokens;
+      }
+      // Add t tokens, saturating at max_tokens.
+      void put_tokens(unsigned t) {
+	tokens += t;
+	if (tokens > max_tokens) {
+	  tokens = max_tokens;
+	}
+      }
+      // Remove t tokens, saturating at 0.
+      void take_tokens(unsigned t) {
+	if (tokens > t) {
+	  tokens -= t;
+	} else {
+	  tokens = 0;
+	}
+      }
+      // Append item to the back of class cl's list.
+      void enqueue(K cl, cost_t cost, T&& item) {
+	q[cl].emplace_back(cost, std::move(item));
+	if (cur == q.end())  // cur was parked at end(): start at the first class
+	  cur = q.begin();
+      }
+      // Put item at the front of class cl's list.
+      void enqueue_front(K cl, cost_t cost, T&& item) {
+	q[cl].emplace_front(cost, std::move(item));
+	if (cur == q.end())
+	  cur = q.begin();
+      }
+
+      const std::pair<cost_t, T>& front() const {
+	ceph_assert(!(q.empty()));
+	ceph_assert(cur != q.end());
+	return cur->second.front();
+      }
+
+      std::pair<cost_t, T>& front() {
+	ceph_assert(!(q.empty()));
+	ceph_assert(cur != q.end());
+	return cur->second.front();
+      }
+      // Drop the current class's first item, then advance cur to the next class (wrapping).
+      void pop_front() {
+	ceph_assert(!(q.empty()));
+	ceph_assert(cur != q.end());
+	cur->second.pop_front();
+	if (cur->second.empty()) {  // a class is removed once its list is drained
+	  auto i = cur;
+	  ++cur;
+	  q.erase(i);
+	} else {
+	  ++cur;
+	}
+	if (cur == q.end()) {
+	  cur = q.begin();
+	}
+      }
+      // Counts the items of every class list (walks all classes).
+      unsigned get_size_slow() const {
+	unsigned count = 0;
+	for (const auto& cls : q) {
+	  count += cls.second.size();
+	}
+	return count;
+      }
+
+      bool empty() const {
+	return q.empty();
+      }
+      // Remove every item for which f returns true; classes left empty are erased.
+      void remove_by_filter(std::function<bool (T&&)> f) {
+	for (typename Classes::iterator i = q.begin();
+	     i != q.end();
+	     /* no-inc */) {
+	  filter_list_pairs(&(i->second), f);
+	  if (i->second.empty()) {
+	    if (cur == i) {  // step cur off the class that is about to be erased
+	      ++cur;
+	    }
+	    i = q.erase(i);
+	  } else {
+	    ++i;
+	  }
+	}
+	if (cur == q.end()) cur = q.begin();
+      }
+      // Remove class k entirely; if out is given its items are moved to the front of *out.
+      void remove_by_class(K k, std::list<T> *out) {
+	typename Classes::iterator i = q.find(k);
+	if (i == q.end()) {
+	  return;
+	}
+	if (i == cur) {
+	  ++cur;
+	}
+	if (out) {
+	  for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
+	    out->push_front(std::move(j->second));  // reverse walk + push_front keeps the items' order
+	  }
+	}
+	q.erase(i);
+	if (cur == q.end()) cur = q.begin();
+      }
+
+      void dump(ceph::Formatter *f) const {
+	f->dump_int("size", get_size_slow());
+	f->dump_int("num_keys", q.size());
+      }
+    };
+
+    using SubQueues = std::map<priority_t, SubQueue>;
+
+    SubQueues high_queue;  // filled by enqueue_strict*(), keyed by priority
+
+    using Queue = dmc::PullPriorityQueue<K,T,false>;
+    Queue queue;  // filled by enqueue()
+
+    // when enqueue_front is called, rather than try to re-calc tags
+    // to put in mClock priority queue, we'll just keep a separate
+    // list from which we dequeue items first, and only when it's
+    // empty do we use queue.
+    std::list<std::pair<K,T>> queue_front;
+
+  public:
+
+    mClockQueue(
+      const typename Queue::ClientInfoFunc& info_func,
+      double anticipation_timeout = 0.0) :
+      queue(info_func, dmc::AtLimit::Allow, anticipation_timeout)
+    {
+      // empty
+    }
+    // Total number of queued items across queue_front, queue and high_queue.
+    unsigned get_size_slow() const {
+      unsigned total = 0;
+      total += queue_front.size();
+      total += queue.request_count();
+      for (auto i = high_queue.cbegin(); i != high_queue.cend(); ++i) {
+	ceph_assert(i->second.get_size_slow());  // emptied sub-queues are erased by the remove/dequeue paths
+	total += i->second.get_size_slow();
+      }
+      return total;
+    }
+
+    // be sure to do things in reverse priority order and push_front
+    // to the list so items end up on list in front-to-back priority
+    // order
+    void remove_by_filter(std::function<bool (T&&)> filter_accum) {
+      queue.remove_by_req_filter([&] (std::unique_ptr<T>&& r) {
+          return filter_accum(std::move(*r));
+        }, true);
+
+      for (auto i = queue_front.rbegin(); i != queue_front.rend(); /* no-inc */) {
+	if (filter_accum(std::move(i->second))) {
+	  i = decltype(i){ queue_front.erase(std::next(i).base()) };  // erase the element i refers to, keep walking in reverse
+	} else {
+	  ++i;
+	}
+      }
+
+      for (typename SubQueues::iterator i = high_queue.begin();
+	   i != high_queue.end();
+	   /* no-inc */ ) {
+	i->second.remove_by_filter(filter_accum);
+	if (i->second.empty()) {
+	  i = high_queue.erase(i);
+	} else {
+	  ++i;
+	}
+      }
+    }
+    // Remove all of class k's items from the three queues; if out is given they are pushed to its front.
+    void remove_by_class(K k, std::list<T> *out = nullptr) override final {
+      if (out) {
+	queue.remove_by_client(k,
+			       true,
+			       [&out] (std::unique_ptr<T>&& t) {
+				 out->push_front(std::move(*t));
+			       });
+      } else {
+	queue.remove_by_client(k, true);
+      }
+
+      for (auto i = queue_front.rbegin(); i != queue_front.rend(); /* no-inc */) {
+	if (k == i->first) {
+	  if (nullptr != out) out->push_front(std::move(i->second));
+	  i = decltype(i){ queue_front.erase(std::next(i).base()) };
+	} else {
+	  ++i;
+	}
+      }
+
+      for (auto i = high_queue.begin(); i != high_queue.end(); /* no-inc */) {
+	i->second.remove_by_class(k, out);
+	if (i->second.empty()) {
+	  i = high_queue.erase(i);
+	} else {
+	  ++i;
+	}
+      }
+    }
+
+    void enqueue_strict(K cl, unsigned priority, T&& item) override final {
+      high_queue[priority].enqueue(cl, 1, std::move(item));  // cost is fixed at 1
+    }
+
+    void enqueue_strict_front(K cl, unsigned priority, T&& item) override final {
+      high_queue[priority].enqueue_front(cl, 1, std::move(item));  // cost is fixed at 1
+    }
+
+    void enqueue(K cl, unsigned priority, unsigned cost, T&& item) override final {
+      // priority is ignored
+      queue.add_request(std::move(item), cl, cost);
+    }
+
+    void enqueue_front(K cl,
+		       unsigned priority,
+		       unsigned cost,
+		       T&& item) override final {
+      queue_front.emplace_front(std::pair<K,T>(cl, std::move(item)));  // priority and cost are not used
+    }
+
+    bool empty() const override final {
+      return queue.empty() && high_queue.empty() && queue_front.empty();
+    }
+    // Remove and return the next item; the queue must not be empty (asserted).
+    T dequeue() override final {
+      ceph_assert(!empty());
+
+      if (!high_queue.empty()) {  // rbegin() is the highest priority key
+	T ret = std::move(high_queue.rbegin()->second.front().second);
+	high_queue.rbegin()->second.pop_front();
+	if (high_queue.rbegin()->second.empty()) {
+	  high_queue.erase(high_queue.rbegin()->first);
+	}
+	return ret;
+      }
+
+      if (!queue_front.empty()) {  // then items added through enqueue_front()
+	T ret = std::move(queue_front.front().second);
+	queue_front.pop_front();
+	return ret;
+      }
+      // finally whatever the dmclock queue hands out next
+      auto pr = queue.pull_request();
+      ceph_assert(pr.is_retn());
+      auto& retn = pr.get_retn();
+      return std::move(*(retn.request));
+    }
+
+    void dump(ceph::Formatter *f) const override final {
+      f->open_array_section("high_queues");
+      for (typename SubQueues::const_iterator p = high_queue.begin();
+	   p != high_queue.end();
+	   ++p) {
+	f->open_object_section("subqueue");
+	f->dump_int("priority", p->first);
+	p->second.dump(f);
+	f->close_section();
+      }
+      f->close_section();
+
+      f->open_object_section("queue_front");
+      f->dump_int("size", queue_front.size());
+      f->close_section();
+
+      f->open_object_section("queue");
+      f->dump_int("size", queue.request_count());
+      f->close_section();
+    } // dump
+
+    void print(std::ostream &os) const final {
+      os << "mClockPriorityQueue";
+    }
+  };
+
+} // namespace ceph
diff --git a/src/common/map_cacher.hpp b/src/common/map_cacher.hpp
new file mode 100644
index 000000000..a83f924b6
--- /dev/null
+++ b/src/common/map_cacher.hpp
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef MAPCACHER_H
+#define MAPCACHER_H
+
+#include "include/Context.h"
+#include "common/sharedptr_registry.hpp"
+
+namespace MapCacher {
+/**
+ * Abstraction for ordering key updates
+ */
+template<typename K, typename V>
+class Transaction {
+public:
+  /// Set keys according to map
+  virtual void set_keys(
+    const std::map<K, V> &keys ///< [in] keys/values to set
+    ) = 0;
+
+  /// Remove keys
+  virtual void remove_keys(
+    const std::set<K> &to_remove ///< [in] keys to remove
+    ) = 0;
+
+  /// Add context to fire when data is readable
+  virtual void add_callback(
+    Context *c ///< [in] Context to fire on readable
+    ) = 0;
+  virtual ~Transaction() {}
+};
+
+/**
+ * Abstraction for fetching keys
+ */
+template<typename K, typename V>
+class StoreDriver {
+public:
+  /// Returns requested key values
+  virtual int get_keys(
+    const std::set<K> &keys,   ///< [in] keys requested
+    std::map<K, V> *got  ///< [out] values for keys obtained
+    ) = 0; ///< @return error value
+
+  /// Returns next key
+  virtual int get_next(
+    const K &key,       ///< [in] key after which to get next
+    std::pair<K, V> *next    ///< [out] first key after key
+    ) = 0; ///< @return 0 on success, -ENOENT if there is no next
+  /// Returns the entry at key if there is one, otherwise the next one
+  virtual int get_next_or_current(
+    const K &key,       ///< [in] key at-which-or-after to get
+    std::pair<K, V> *next_or_current ///< [out] entry at or after key
+    ) = 0; ///< @return 0 on success, -ENOENT if there is no next
+
+  virtual ~StoreDriver() {}
+};
+
+/**
+ * Uses SharedPtrRegistry to cache objects of in progress writes
+ * allowing the user to read/write a consistent view of the map
+ * without flushing writes.
+ */
+template<typename K, typename V>
+class MapCacher {
+private:
+  StoreDriver<K, V> *driver; ///< backing store; never deleted by this class
+
+  /// values of writes still in flight; an empty optional marks a pending removal
+  SharedPtrRegistry<K, boost::optional<V> > in_progress;
+  typedef typename SharedPtrRegistry<K, boost::optional<V> >::VPtr VPtr;
+  typedef ContainerContext<std::set<VPtr> > TransHolder;
+
+public:
+  MapCacher(StoreDriver<K, V> *driver) : driver(driver) {}
+
+  /// Fetch first key/value pair after specified key
+  int get_next(
+    K key,               ///< [in] key after which to get next
+    std::pair<K, V> *next     ///< [out] next key
+    ) {
+    while (true) {
+      std::pair<K, boost::optional<V> > cached;
+      std::pair<K, V> store;
+      bool got_cached = in_progress.get_next(key, &cached);
+
+      bool got_store = false;
+      int r = driver->get_next(key, &store);
+      if (r < 0 && r != -ENOENT) {
+	return r;
+      } else if (r == 0) {
+	got_store = true;
+      }
+
+      if (!got_cached && !got_store) {
+	return -ENOENT;
+      } else if (
+	got_cached &&
+	(!got_store || store.first >= cached.first)) {
+	// the in-progress entry wins when it sorts at or before the stored one
+	if (cached.second) {
+	  if (next)
+	    *next = std::make_pair(cached.first, cached.second.get());
+	  return 0;
+	} else {
+	  key = cached.first;
+	  continue; // value was cached as removed, retry from that key
+	}
+      } else {
+	if (next)
+	  *next = store;
+	return 0;
+      }
+    }
+    ceph_abort(); // not reachable
+    return -EINVAL;
+  } ///< @return error value, 0 on success, -ENOENT if no more entries
+
+  /// Adds operation setting keys to Transaction
+  void set_keys(
+    const std::map<K, V> &keys,  ///< [in] keys/values to set
+    Transaction<K, V> *t    ///< [out] transaction to use
+    ) {
+    std::set<VPtr> vptrs;
+    for (auto i = keys.begin(); i != keys.end(); ++i) {
+      VPtr ip = in_progress.lookup_or_create(i->first, i->second);
+      *ip = i->second;
+      vptrs.insert(ip);
+    }
+    t->set_keys(keys);
+    t->add_callback(new TransHolder(vptrs)); // the holder carries the VPtrs for this transaction
+  }
+
+  /// Adds operation removing keys to Transaction
+  void remove_keys(
+    const std::set<K> &keys,  ///< [in] keys to remove
+    Transaction<K, V> *t ///< [out] transaction to use
+    ) {
+    std::set<VPtr> vptrs;
+    for (auto i = keys.begin(); i != keys.end(); ++i) {
+      boost::optional<V> empty;
+      VPtr ip = in_progress.lookup_or_create(*i, empty);
+      *ip = empty;
+      vptrs.insert(ip);
+    }
+    t->remove_keys(keys);
+    t->add_callback(new TransHolder(vptrs));
+  }
+
+  /// Gets keys, uses cached values for unstable keys
+  int get_keys(
+    const std::set<K> &keys_to_get, ///< [in] set of keys to fetch
+    std::map<K, V> *got             ///< [out] keys gotten
+    ) {
+    std::set<K> to_get;
+    std::map<K, V> _got;
+    for (auto i = keys_to_get.begin();
+	 i != keys_to_get.end();
+	 ++i) {
+      VPtr val = in_progress.lookup(*i);
+      if (val) {
+	if (*val)
+	  got->insert(std::make_pair(*i, val->get()));
+	//else: value cached is empty, key doesn't exist
+      } else {
+	to_get.insert(*i);
+      }
+    }
+    int r = driver->get_keys(to_get, &_got);
+    if (r < 0)
+      return r;
+    for (auto i = _got.begin(); i != _got.end(); ++i) {
+      got->insert(*i);
+    }
+    return 0;
+  } ///< @return error value, 0 on success
+};
+} // namespace
+
+#endif
diff --git a/src/common/mempool.cc b/src/common/mempool.cc
new file mode 100644
index 000000000..79354f708
--- /dev/null
+++ b/src/common/mempool.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/mempool.h"
+#include "include/demangle.h"
+
+// Thread local variables should save index, not &shard[index],
+// because shard[] is defined in the class
+static thread_local size_t thread_shard_index = mempool::num_shards;
+
+// default to debug_mode off
+bool mempool::debug_mode = false;
+
+// --------------------------------------------------------------
+
+mempool::pool_t& mempool::get_pool(mempool::pool_index_t ix)
+{
+  // Function-local static: the table is initialized no later than the first
+  // call, so it is usable even from ctors in compilation units that are
+  // initialized before this one.  `ix` is used without a range check.
+  static mempool::pool_t table[num_pools];
+  return table[ix];
+}
+
+const char *mempool::get_pool_name(mempool::pool_index_t ix) {
+#define P(x) #x,
+  static const char *names[num_pools] = {  // P stringifies; presumably the helper applies it once per pool
+    DEFINE_MEMORY_POOLS_HELPER(P)
+  };
+#undef P
+  return names[ix];  // `ix` is used without a range check
+}
+
+// Dump every pool's stats under "by_pool" and their sum under "total".
+void mempool::dump(ceph::Formatter *f)
+{
+  stats_t grand_total;
+  f->open_object_section("mempool");  // (dummy?) topmost section: the JSON Formatter omits pool names without it
+  f->open_object_section("by_pool");
+  for (size_t n = 0; n != num_pools; ++n) {
+    const pool_t &p = mempool::get_pool((pool_index_t)n);
+    f->open_object_section(get_pool_name((pool_index_t)n));
+    p.dump(f, &grand_total);  // also adds this pool's totals to grand_total
+    f->close_section();
+  }
+  f->close_section();
+  f->dump_object("total", grand_total);
+  f->close_section();
+}
+
+void mempool::set_debug_mode(bool d)
+{
+  debug_mode = d;  // pool_t::get_stats() fills its per-type breakdown only while this is set
+}
+
+// --------------------------------------------------------------
+// pool_t
+
+// Bytes currently accounted to this pool, i.e. the sum of the per-shard
+// byte counters.  A negative sum is reported as 0.
+size_t mempool::pool_t::allocated_bytes() const
+{
+  ssize_t sum = 0;
+  for (size_t s = 0; s != num_shards; ++s)
+    sum += shard[s].bytes;
+
+  // the shards are read one by one without locking, so we may have raced
+  // with some unbalanced allocations/deallocations
+  return sum < 0 ? 0 : (size_t) sum;
+}
+
+// Items currently accounted to this pool, i.e. the sum of the per-shard
+// item counters.  A negative sum is reported as 0.
+size_t mempool::pool_t::allocated_items() const
+{
+  ssize_t sum = 0;
+  for (size_t s = 0; s != num_shards; ++s)
+    sum += shard[s].items;
+
+  // the shards are read one by one without locking, so we may have raced
+  // with some unbalanced allocations/deallocations
+  return sum < 0 ? 0 : (size_t) sum;
+}
+
+void mempool::pool_t::adjust_count(ssize_t items, ssize_t bytes)
+{
+  if (thread_shard_index == num_shards) thread_shard_index = pick_a_shard_int();  // this thread has no shard yet
+  shard[thread_shard_index].items += items;
+  shard[thread_shard_index].bytes += bytes;
+}
+
+// Add this pool's shard counters to *total.  In debug mode, additionally set
+// (*by_type)[demangled type name] for every entry of type_map.
+void mempool::pool_t::get_stats(
+  stats_t *total,
+  std::map<std::string, stats_t> *by_type) const
+{
+  for (size_t s = 0; s != num_shards; ++s) {
+    total->items += shard[s].items;
+    total->bytes += shard[s].bytes;
+  }
+  if (!debug_mode) return;
+  std::lock_guard type_guard(lock);  // type_map is only walked under the lock
+  for (auto &t : type_map) {
+    stats_t &out = (*by_type)[ceph_demangle(t.second.type_name)];
+    out.bytes = t.second.items * t.second.item_size;
+    out.items = t.second.items;
+  }
+}
+
+// Write this pool's totals (and a "by_type" section if get_stats() produced a
+// per-type breakdown) to `f`; the totals are also added to *ptotal if given.
+void mempool::pool_t::dump(ceph::Formatter *f, stats_t *ptotal) const
+{
+  stats_t mine;
+  std::map<std::string, stats_t> per_type;
+  get_stats(&mine, &per_type);
+  if (ptotal)
+    *ptotal += mine;
+  mine.dump(f);
+  if (per_type.empty()) return;
+  f->open_object_section("by_type");
+  for (auto &[type_name, type_stats] : per_type) {
+    f->open_object_section(type_name.c_str());
+    type_stats.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
diff --git a/src/common/mime.c b/src/common/mime.c
new file mode 100644
index 000000000..fe45123cc
--- /dev/null
+++ b/src/common/mime.c
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+
+#include <errno.h>
+#include <stdio.h>
+
+/* Encode the NUL-terminated `input` as quoted-printable: bytes with the
+ * high bit set, '=' and control characters are written as "=XX", all
+ * other bytes verbatim.  Returns the buffer size needed for the complete
+ * encoding, terminating NUL included.  Whenever outlen > 0 the output is
+ * NUL-terminated; if the buffer is too small it receives a prefix of the
+ * encoding that never ends in the middle of an escape.  Calling with
+ * (NULL, 0) only computes the size. */
+int mime_encode_as_qp(const char *input, char *output, int outlen)
+{
+	int ret = 1;	/* the terminating NUL */
+	char *o = output;
+	const unsigned char *i = (const unsigned char*)input;
+
+	if (outlen > 0)
+		*o = '\0';	/* stay terminated even if nothing below fits */
+
+	for (; *i != '\0'; ++i) {
+		int c = *i;
+		int esc = (c & 0x80) || (c == '=') || is_control_character(c);
+		int len = esc ? 3 : 1;
+
+		if (outlen > len) {	/* room for this unit and the NUL */
+			snprintf(o, outlen, esc ? "=%02X" : "%c", c);
+			o += len;
+			outlen -= len;
+		} else {
+			outlen = 0;	/* stop writing, keep counting */
+		}
+		ret += len;
+	}
+	return ret;
+}
+
+static inline signed int hexchar_to_int(unsigned int c)	/* hex digit -> 0..15, anything else -> -EDOM */
+{
+	switch(c) {
+	case '0':
+		return 0;
+	case '1':
+		return 1;
+	case '2':
+		return 2;
+	case '3':
+		return 3;
+	case '4':
+		return 4;
+	case '5':
+		return 5;
+	case '6':
+		return 6;
+	case '7':
+		return 7;
+	case '8':
+		return 8;
+	case '9':
+		return 9;
+	case 'A':	/* letters are accepted in either case */
+	case 'a':
+		return 10;
+	case 'B':
+	case 'b':
+		return 11;
+	case 'C':
+	case 'c':
+		return 12;
+	case 'D':
+	case 'd':
+		return 13;
+	case 'E':
+	case 'e':
+		return 14;
+	case 'F':
+	case 'f':
+		return 15;
+	case '\0':	/* end of string: handled like any other non-hex character */
+	default:
+	    return -EDOM;
+	}
+}
+
+/* Decode the NUL-terminated quoted-printable `input` into `output`.  Returns
+ * the buffer size needed for the decoded text (NUL included), -EDOM if a
+ * byte has its high bit set, -EINVAL if '=' is not followed by two hex
+ * digits.  Whenever outlen > 0 the output is left NUL-terminated. */
+int mime_decode_from_qp(const char *input, char *output, int outlen)
+{
+	int ret = 1;	/* the terminating NUL */
+	char *o = output;
+	const unsigned char *i = (const unsigned char*)input;
+
+	if (outlen > 0)
+		*o = '\0';	/* stay terminated even for empty input */
+	for (; *i != '\0'; ++i) {
+		unsigned int c = *i;
+		if (c & 0x80)	/* never set in quoted-printable encoding */
+			return -EDOM;
+		if (c == '=') {
+			int high = hexchar_to_int(*++i);
+			if (high < 0)
+				return -EINVAL;
+			int low = hexchar_to_int(*++i);
+			if (low < 0)
+				return -EINVAL;
+			c = (high << 4) + low;
+		}
+		if (outlen >= 1) {	/* with outlen == 1 only the NUL fits */
+			snprintf(o, outlen, "%c", c);
+			outlen -= 1;
+			o += 1;
+		}
+		ret += 1;
+	}
+	return ret;
+}
diff --git a/src/common/mime.h b/src/common/mime.h
new file mode 100644
index 000000000..f62040a22
--- /dev/null
+++ b/src/common/mime.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_MIME_H
+#define CEPH_COMMON_MIME_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Encode a buffer as quoted-printable.
+ *
+ * The input is a null-terminated string.
+ * The output is a null-terminated string representing the input encoded as
+ * a MIME quoted-printable.
+ *
+ * Returns the length of the buffer we would need to do the encoding.
+ * If we don't have enough buffer space, the output will be truncated.
+ *
+ * You may call mime_encode_as_qp(input, NULL, 0) to find the size of the
+ * buffer you will need.
+ */
+signed int mime_encode_as_qp(const char *input, char *output, int outlen);
+
+/* Decode a quoted-printable buffer.
+ *
+ * The input is a null-terminated string encoded as a MIME quoted-printable.
+ * The output is a null-terminated string representing the input decoded.
+ *
+ * Returns a negative error code if the input is not a valid quoted-printable
+ * buffer.
+ * Returns the length of the buffer we would need to do the decoding.
+ * If we don't have enough buffer space, the output will be truncated.
+ *
+ * You may call mime_decode_from_qp(input, NULL, 0) to find the size of the
+ * buffer you will need. The output will never be longer than the input for
+ * this function.
+ */
+signed int mime_decode_from_qp(const char *input, char *output, int outlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/module.c b/src/common/module.c
new file mode 100644
index 000000000..a2a468ac3
--- /dev/null
+++ b/src/common/module.c
@@ -0,0 +1,95 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+#include "include/compat.h"
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(__FreeBSD__)
+#include <sys/wait.h>
+#endif 
+
+#ifndef _WIN32
+/*
+ * TODO: Switch to libkmod when we abandon older platforms.  The APIs
+ * we want are:
+ *
+ * - kmod_module_new_from_name() for obtaining handles;
+ * - kmod_module_probe_insert_module() for module_load();
+ * - kmod_module_get_info(), kmod_module_info_get_{key,value}() for
+ *   module_has_param().
+ */
+
+/*
+ * Return command's exit status or -1 on error.
+ */
+static int run_command(const char *command)
+{
+	const int status = system(command);
+
+	if (status < 0) {
+		/* system() reported failure: describe errno */
+		char error_buf[80];
+		fprintf(stderr, "couldn't run '%s': %s\n", command,
+			ceph_strerror_r(errno, error_buf, sizeof(error_buf)));
+		return -1;
+	}
+	if (WIFEXITED(status))
+		return WEXITSTATUS(status);
+
+	/* the command did not exit normally: report how it ended */
+	if (WIFSIGNALED(status))
+		fprintf(stderr, "'%s' killed by signal %d\n", command,
+			WTERMSIG(status));
+	else
+		fprintf(stderr, "weird status from '%s': %d\n", command,
+			status);
+	return -1;
+}
+
+/* Return 1 if modinfo lists `param` as a parameter of `module`, else 0. */
+int module_has_param(const char *module, const char *param)
+{
+	char command[128];
+	int n = snprintf(command, sizeof(command),
+			 "/sbin/modinfo -F parm %s | /bin/grep -q ^%s:",
+			 module, param);
+	/* never hand a truncated (hence different) command line to system() */
+	return n >= 0 && (size_t)n < sizeof(command) && run_command(command) == 0;
+}
+
+/* Run "modprobe module [options]"; return its exit status, or -1 on error. */
+int module_load(const char *module, const char *options)
+{
+	char command[128];
+	int n = snprintf(command, sizeof(command), "/sbin/modprobe %s %s",
+			 module, (options ? options : ""));
+	/* refuse to execute a truncated (hence different) command line */
+	return (n < 0 || (size_t)n >= sizeof(command)) ? -1 : run_command(command);
+}
+
+#else
+
+// We're stubbing out those functions, for now.
+int module_has_param(const char *module, const char *param)
+{
+	return -1;	/* NOTE(review): the non-_WIN32 version returns 0/1; -1 reads as true in a boolean test - confirm */
+}
+
+int module_load(const char *module, const char *options)
+{
+	return -1;	/* _WIN32 stub: nothing is loaded; -1 is also the non-_WIN32 error value */
+}
+
+#endif /* _WIN32 */
diff --git a/src/common/module.h b/src/common/module.h
new file mode 100644
index 000000000..d5fa6a1a4
--- /dev/null
+++ b/src/common/module.h
@@ -0,0 +1,27 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MODULE_H
+#define CEPH_MODULE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int module_has_param(const char *module, const char *param);
+int module_load(const char *module, const char *options);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_MODULE_H */
diff --git a/src/common/mutex_debug.cc b/src/common/mutex_debug.cc
new file mode 100644
index 000000000..b832d211e
--- /dev/null
+++ b/src/common/mutex_debug.cc
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/mutex_debug.h"
+#include "common/perf_counters.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+
+namespace ceph {
+namespace mutex_debug_detail {
+enum {  // NOTE(review): none of the l_mutex_* values are referenced elsewhere in mutex_debug.cc
+  l_mutex_first = 999082,
+  l_mutex_wait,
+  l_mutex_last
+};
+
+// Store the lockdep group name and the two tracking flags, then register
+// with lockdep if tracking is enabled for this mutex.
+mutex_debugging_base::mutex_debugging_base(std::string group, bool ld, bool bt)
+  : group(std::move(group)), lockdep(ld), backtrace(bt)
+{
+  if (!_enable_lockdep())
+    return;
+  _register();
+}
+
+// Asserts that the mutex is not held, then drops the lockdep
+// registration if tracking is enabled.
+mutex_debugging_base::~mutex_debugging_base()
+{
+  ceph_assert(nlock == 0);
+  if (!_enable_lockdep())
+    return;
+  lockdep_unregister(id);
+}
+
+void mutex_debugging_base::_register() {  // obtain the lockdep id for this mutex's group name
+  id = lockdep_register(group.c_str());
+}
+void mutex_debugging_base::_will_lock(bool recursive) { // about to lock: report the pending acquisition and keep the id lockdep returns
+  id = lockdep_will_lock(group.c_str(), id, backtrace, recursive);
+}
+void mutex_debugging_base::_locked() {    // just locked: report the acquisition and keep the id lockdep returns
+  id = lockdep_locked(group.c_str(), id, backtrace);
+}
+void mutex_debugging_base::_will_unlock() {  // about to unlock: report the release and keep the id lockdep returns
+  id = lockdep_will_unlock(group.c_str(), id);
+}
+
+} // namespace mutex_debug_detail
+} // namespace ceph
diff --git a/src/common/mutex_debug.h b/src/common/mutex_debug.h
new file mode 100644
index 000000000..c1a4ff2a4
--- /dev/null
+++ b/src/common/mutex_debug.h
@@ -0,0 +1,209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_MUTEX_DEBUG_H
+#define CEPH_COMMON_MUTEX_DEBUG_H
+
+#include <atomic>
+#include <system_error>
+#include <thread>
+
+#include <pthread.h>
+
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+
+#include "ceph_time.h"
+#include "likely.h"
+#include "lockdep.h"
+
+namespace ceph {
+namespace mutex_debug_detail {
+/**
+ * State shared by the debugging mutexes: the lockdep group name and id,
+ * the hold count and the id of the thread that last took the lock.
+ * Constructor and destructor are protected, so it is only usable as a
+ * base class.
+ */
+class mutex_debugging_base
+{
+protected:
+  std::string group;  // name handed to the lockdep_* calls
+  int id = -1;  // lockdep id; assigned by _register() and refreshed by the other hooks
+  bool lockdep;   // track this mutex using lockdep_*
+  bool backtrace; // gather backtrace on lock acquisition
+
+  std::atomic<int> nlock = 0;  // hold count; is_locked() tests it for > 0
+  std::thread::id locked_by = {};  // compared against the calling thread in is_locked_by_me()
+
+  bool _enable_lockdep() const {  // per-mutex flag AND the global g_lockdep switch
+    return lockdep && g_lockdep;
+  }
+  void _register();
+  void _will_lock(bool recursive=false); // about to lock
+  void _locked(); // just locked
+  void _will_unlock(); // about to unlock
+
+  mutex_debugging_base(std::string group, bool ld = true, bool bt = false);
+  ~mutex_debugging_base();
+
+public:
+  bool is_locked() const {  // true while any thread holds the mutex
+    return (nlock > 0);
+  }
+  bool is_locked_by_me() const {  // true only when the calling thread is the recorded owner
+    return nlock.load(std::memory_order_acquire) > 0 && locked_by == std::this_thread::get_id();
+  }
+  operator bool() const {  // same answer as is_locked_by_me()
+    return is_locked_by_me();
+  }
+};
+
+// Since this is a /debugging/ mutex just define it in terms of the
+// pthread error check mutex (or the pthread recursive mutex when
+// Recursive is true).  On top of the pthread mutex it records the owning
+// thread and the hold count, and reports lock/unlock events to lockdep
+// unless that is disabled.
+template<bool Recursive>
+class mutex_debug_impl : public mutex_debugging_base
+{
+private:
+  pthread_mutex_t m;
+
+  // Initialize m as a RECURSIVE or ERRORCHECK pthread mutex, depending on
+  // the template parameter.
+  void _init() {
+    pthread_mutexattr_t a;
+    int r = pthread_mutexattr_init(&a);
+    ceph_assert(r == 0);
+    if (recursive)
+      r = pthread_mutexattr_settype(&a, PTHREAD_MUTEX_RECURSIVE);
+    else
+      r = pthread_mutexattr_settype(&a, PTHREAD_MUTEX_ERRORCHECK);
+    ceph_assert(r == 0);
+    r = pthread_mutex_init(&m, &a);
+    ceph_assert(r == 0);
+    // the attribute object is not needed once the mutex exists; destroy
+    // it so an implementation that allocates for it does not leak
+    r = pthread_mutexattr_destroy(&a);
+    ceph_assert(r == 0);
+  }
+
+  // lockdep is never used for recursive mutexes, nor when the caller
+  // opts out for this particular call.
+  bool enable_lockdep(bool no_lockdep) const {
+    if (recursive) {
+      return false;
+    } else if (no_lockdep) {
+      return false;
+    } else {
+      return _enable_lockdep();
+    }
+  }
+
+public:
+  static constexpr bool recursive = Recursive;
+
+  // group: lockdep group name; ld: track with lockdep; bt: gather a
+  // backtrace on acquisition.
+  mutex_debug_impl(std::string group, bool ld = true, bool bt = false)
+    : mutex_debugging_base(group, ld, bt) {
+    _init();
+  }
+
+  // Mutex is Destructible
+  ~mutex_debug_impl() {
+    int r = pthread_mutex_destroy(&m);
+    ceph_assert(r == 0);
+  }
+
+  // Mutex concept is non-Copyable
+  mutex_debug_impl(const mutex_debug_impl&) = delete;
+  mutex_debug_impl& operator =(const mutex_debug_impl&) = delete;
+
+  // Mutex concept is non-Movable
+  mutex_debug_impl(mutex_debug_impl&&) = delete;
+  mutex_debug_impl& operator =(mutex_debug_impl&&) = delete;
+
+  // Blocking acquire of the pthread mutex; throws std::system_error for
+  // EPERM, EDEADLK and EBUSY and asserts on any other failure.
+  void lock_impl() {
+    int r = pthread_mutex_lock(&m);
+    // Allowed error codes for Mutex concept
+    if (unlikely(r == EPERM ||
+		 r == EDEADLK ||
+		 r == EBUSY)) {
+      throw std::system_error(r, std::generic_category());
+    }
+    ceph_assert(r == 0);
+  }
+
+  void unlock_impl() noexcept {
+    int r = pthread_mutex_unlock(&m);
+    ceph_assert(r == 0);
+  }
+
+  // Non-blocking acquire: true on success, false on EBUSY, throws
+  // std::system_error for any other error code.
+  bool try_lock_impl() {
+    int r = pthread_mutex_trylock(&m);
+    switch (r) {
+    case 0:
+      return true;
+    case EBUSY:
+      return false;
+    default:
+      throw std::system_error(r, std::generic_category());
+    }
+  }
+  pthread_mutex_t* native_handle() {
+    return &m;
+  }
+
+  // Bookkeeping after the pthread mutex has been acquired.
+  void _post_lock() {
+    if (!recursive)
+      ceph_assert(nlock == 0);
+    locked_by = std::this_thread::get_id();
+    nlock.fetch_add(1, std::memory_order_release);
+  }
+
+  // Bookkeeping before the pthread mutex is released; asserts that the
+  // caller is the recorded owner.
+  void _pre_unlock() {
+    if (recursive) {
+      ceph_assert(nlock > 0);
+    } else {
+      ceph_assert(nlock == 1);
+    }
+    ceph_assert(locked_by == std::this_thread::get_id());
+    if (nlock == 1)
+      locked_by = std::thread::id();
+    nlock.fetch_sub(1, std::memory_order_release);
+  }
+
+  bool try_lock(bool no_lockdep = false) {
+    bool locked = try_lock_impl();
+    if (locked) {
+      if (enable_lockdep(no_lockdep))
+	_locked();
+      _post_lock();
+    }
+    return locked;
+  }
+
+  void lock(bool no_lockdep = false) {
+    if (enable_lockdep(no_lockdep))
+      _will_lock(recursive);
+
+    // fast path: uncontended acquire
+    if (try_lock(no_lockdep))
+      return;
+
+    lock_impl();
+    if (enable_lockdep(no_lockdep))
+      _locked();
+    _post_lock();
+  }
+
+  void unlock(bool no_lockdep = false) {
+    _pre_unlock();
+    if (enable_lockdep(no_lockdep))
+      _will_unlock();
+    unlock_impl();
+  }
+
+};
+
+
+} // namespace mutex_debug_detail
+// Non-recursive and recursive flavours of the debugging mutex.
+using mutex_debug = mutex_debug_detail::mutex_debug_impl<false>;
+using mutex_recursive_debug = mutex_debug_detail::mutex_debug_impl<true>;
+} // namespace ceph
+
+#endif
diff --git a/src/common/numa.cc b/src/common/numa.cc
new file mode 100644
index 000000000..f62f0d010
--- /dev/null
+++ b/src/common/numa.cc
@@ -0,0 +1,260 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "numa.h"
+
+#include <cstring>
+#include <errno.h>
+#include <iostream>
+
+#include "include/stringify.h"
+#include "common/safe_io.h"
+
+using namespace std::literals;
+
+using std::set;
+
+
+// list
+#if defined(__linux__)
+// Parse a cpu list such as "0-3,8,10-11" (the format of sysfs cpulist
+// files) into cpu_set.
+//
+// On success returns 0 and sets *cpu_set_size to one more than the
+// highest cpu id named in the list (0 for an empty list), regardless of
+// the order of the entries.  Returns -EINVAL for malformed input: a
+// missing number, a negative id, a range whose end is below its start,
+// or an unexpected separator.  Ids that do not fit in a cpu_set_t are
+// counted in *cpu_set_size but cannot be recorded in cpu_set.
+int parse_cpu_set_list(const char *s,
+		       size_t *cpu_set_size,
+		       cpu_set_t *cpu_set)
+{
+  CPU_ZERO(cpu_set);
+  *cpu_set_size = 0;
+  while (*s) {
+    char *end;
+    long first = strtol(s, &end, 10);
+    if (end == s || first < 0) {
+      return -EINVAL;
+    }
+    long last = first;
+    if (*end == '-') {
+      s = end + 1;
+      last = strtol(s, &end, 10);
+      if (end == s || last < first) {
+	return -EINVAL;
+      }
+    }
+    // bound the loop by the capacity of cpu_set_t so a huge range end
+    // cannot make us spin
+    for (long cpu = first; cpu <= last && cpu < CPU_SETSIZE; ++cpu) {
+      CPU_SET(cpu, cpu_set);
+    }
+    // keep the maximum, so that "4-7,0" reports 8 rather than 1
+    size_t needed = static_cast<size_t>(last) + 1;
+    if (needed > *cpu_set_size) {
+      *cpu_set_size = needed;
+    }
+    if (*end == 0) {
+      break;
+    }
+    if (*end != ',') {
+      return -EINVAL;
+    }
+    s = end + 1;
+  }
+  return 0;
+}
+
+// Render the cpus set among the first cpu_set_size ids of cpu_set as a
+// comma separated list in which runs of consecutive ids are collapsed
+// to "first-last", e.g. "0-3,8".
+std::string cpu_set_to_str_list(size_t cpu_set_size,
+				const cpu_set_t *cpu_set)
+{
+  std::string out;
+  unsigned first = 0;
+  for (;;) {
+    // advance to the next cpu that is set
+    for (; first < cpu_set_size; ++first) {
+      if (CPU_ISSET(first, cpu_set))
+	break;
+    }
+    if (first >= cpu_set_size)
+      return out;
+
+    // extend the run as far as it goes
+    unsigned last = first;
+    while (last + 1 < cpu_set_size && CPU_ISSET(last + 1, cpu_set))
+      ++last;
+
+    if (!out.empty())
+      out += ",";
+    out += stringify(first);
+    if (last > first) {
+      out += "-";
+      out += stringify(last);
+    }
+    first = last + 1;
+  }
+}
+
+// Collect the ids of all cpus set among the first cpu_set_size ids of
+// cpu_set.
+std::set<int> cpu_set_to_set(size_t cpu_set_size,
+			     const cpu_set_t *cpu_set)
+{
+  std::set<int> cpus;
+  for (unsigned cpu = 0; cpu < cpu_set_size; ++cpu) {
+    if (CPU_ISSET(cpu, cpu_set))
+      cpus.insert(cpu);
+  }
+  return cpus;
+}
+
+
+// Read /sys/devices/system/node/node<node>/cpulist and parse it with
+// parse_cpu_set_list() into cpu_set / *cpu_set_size.
+//
+// Returns 0 on success, -errno if the file cannot be opened, or the
+// negative result of the read or the parse.
+int get_numa_node_cpu_set(
+  int node,
+  size_t *cpu_set_size,
+  cpu_set_t *cpu_set)
+{
+  std::string fn = "/sys/devices/system/node/node";
+  fn += stringify(node);
+  fn += "/cpulist";
+  int fd = ::open(fn.c_str(), O_RDONLY);
+  if (fd < 0) {
+    return -errno;
+  }
+  char buf[1024];
+  // read at most sizeof(buf) - 1 bytes so the terminator written below
+  // always lands inside the buffer
+  int r = safe_read(fd, buf, sizeof(buf) - 1);
+  if (r >= 0) {
+    buf[r] = 0;
+    // strip trailing whitespace (the file ends with a newline)
+    while (r > 0 && ::isspace(static_cast<unsigned char>(buf[--r]))) {
+      buf[r] = 0;
+    }
+    r = parse_cpu_set_list(buf, cpu_set_size, cpu_set);
+    if (r > 0) {
+      r = 0;
+    }
+  }
+  ::close(fd);
+  return r;
+}
+
+// Insert the names of all entries of dir, except "." and "..", into *out.
+// Returns 0 on success or -errno if the directory cannot be opened.
+static int easy_readdir(const std::string& dir, std::set<std::string> *out)
+{
+  DIR *dirp = ::opendir(dir.c_str());
+  if (dirp == nullptr) {
+    return -errno;
+  }
+  for (struct dirent *ent = ::readdir(dirp);
+       ent != nullptr;
+       ent = ::readdir(dirp)) {
+    const char *entry = ent->d_name;
+    const bool is_dot_entry =
+      strcmp(entry, ".") == 0 || strcmp(entry, "..") == 0;
+    if (!is_dot_entry) {
+      out->insert(entry);
+    }
+  }
+  closedir(dirp);
+  return 0;
+}
+
+#ifdef HAVE_DPDK
+// Return the name of thread tid of this process as read from
+// /proc/self/task/<tid>/comm, without the trailing newline.  Returns an
+// empty string if the file cannot be opened or nothing can be read.
+static std::string get_task_comm(pid_t tid)
+{
+  // fixed size: large enough for the path with any int-sized tid
+  char comm_name[64];
+  snprintf(comm_name, sizeof(comm_name), "/proc/self/task/%d/comm", tid);
+  int fd = open(comm_name, O_CLOEXEC | O_RDONLY);
+  if (fd == -1) {
+    return "";
+  }
+  // see linux/sched.h
+  static constexpr int TASK_COMM_LEN = 16;
+  // one extra byte so a full-length read can still be terminated
+  char name[TASK_COMM_LEN + 1];
+  ssize_t n = safe_read(fd, name, TASK_COMM_LEN);
+  close(fd);
+  if (n <= 0) {
+    // error or empty file; also keeps name[n - 1] below in bounds
+    return "";
+  }
+  if (name[n - 1] == '\n') {
+    name[n - 1] = '\0';
+  } else {
+    name[n] = '\0';
+  }
+  return name;
+}
+#endif
+
+// Apply cpu_set as the cpu affinity of the calling process and of every
+// thread listed under /proc/<pid>/task.
+//
+// Returns 0 on success, -errno if sched_setaffinity() fails, or the
+// negative result of listing the task directory.  A thread that has
+// already exited by the time its affinity is set (ESRCH) is skipped.
+int set_cpu_affinity_all_threads(size_t cpu_set_size, cpu_set_t *cpu_set)
+{
+  // first set my affinity
+  int r = sched_setaffinity(getpid(), cpu_set_size, cpu_set);
+  if (r < 0) {
+    return -errno;
+  }
+
+  // make 2 passes here so that we (hopefully) catch racing threads creating
+  // threads.
+  for (unsigned pass = 0; pass < 2; ++pass) {
+    // enumerate all child threads from /proc
+    std::set<std::string> ls;
+    std::string path = "/proc/"s + stringify(getpid()) + "/task";
+    r = easy_readdir(path, &ls);
+    if (r < 0) {
+      return r;
+    }
+    for (auto& i : ls) {
+      pid_t tid = atoll(i.c_str());
+      if (!tid) {
+	continue;  // not a numeric entry
+      }
+      #ifdef HAVE_DPDK
+      std::string thread_name = get_task_comm(tid);
+      static const char *dpdk_worker_name = "lcore-worker";
+      if (!thread_name.compare(0, strlen(dpdk_worker_name), dpdk_worker_name)) {
+	// ignore dpdk reactor thread, as it takes care of numa by itself
+        continue;
+      }
+      #endif
+      r = sched_setaffinity(tid, cpu_set_size, cpu_set);
+      if (r < 0) {
+	if (errno == ESRCH) {
+	  // the thread exited after we listed it; nothing left to pin
+	  continue;
+	}
+	return -errno;
+      }
+    }
+  }
+  return 0;
+}
+
+#else
+// Fallback for non-Linux builds: cpu lists are not supported.
+int parse_cpu_set_list(const char * /* s */,
+		       size_t * /* cpu_set_size */,
+		       cpu_set_t * /* cpu_set */)
+{
+  return -ENOTSUP;
+}
+
+// Fallback for non-Linux builds: always yields an empty string.
+std::string cpu_set_to_str_list(size_t /* cpu_set_size */,
+				const cpu_set_t * /* cpu_set */)
+{
+  return std::string();
+}
+
+// Fallback for non-Linux builds: always yields an empty set.
+std::set<int> cpu_set_to_set(size_t /* cpu_set_size */,
+			     const cpu_set_t * /* cpu_set */)
+{
+  return std::set<int>();
+}
+
+// Fallback for non-Linux builds: NUMA topology is not available.
+int get_numa_node_cpu_set(int /* node */,
+                          size_t * /* cpu_set_size */,
+                          cpu_set_t * /* cpu_set */)
+{
+  return -ENOTSUP;
+}
+
+// Fallback for non-Linux builds: thread affinity is not supported.
+int set_cpu_affinity_all_threads(size_t /* cpu_set_size */,
+				 cpu_set_t * /* cpu_set */)
+{
+  return -ENOTSUP;
+}
+
+#endif
diff --git a/src/common/numa.h b/src/common/numa.h
new file mode 100644
index 000000000..78851deef
--- /dev/null
+++ b/src/common/numa.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <include/compat.h>
+#include <sched.h>
+#include <ostream>
+#include <set>
+/* CPU-set and NUMA helpers; the non-Linux build of numa.cc implements
+ * them as stubs that return -ENOTSUP or an empty value. */
+int parse_cpu_set_list(const char *s,
+		       size_t *cpu_set_size,
+		       cpu_set_t *cpu_set);  // parse a list such as "0-3,8" into cpu_set; 0 on success, -EINVAL if malformed
+std::string cpu_set_to_str_list(size_t cpu_set_size,
+				const cpu_set_t *cpu_set);  // inverse of the above: "0-3,8" style text for the first cpu_set_size ids
+std::set<int> cpu_set_to_set(size_t cpu_set_size,
+			     const cpu_set_t *cpu_set);  // ids of the cpus set among the first cpu_set_size ids
+
+int get_numa_node_cpu_set(int node,
+			  size_t *cpu_set_size,
+			  cpu_set_t *cpu_set);  // read and parse /sys/devices/system/node/node<node>/cpulist; 0 or a negative error
+
+int set_cpu_affinity_all_threads(size_t cpu_set_size,
+				 cpu_set_t *cpu_set);  // apply cpu_set to the process and to every thread under /proc/<pid>/task
diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
new file mode 100644
index 000000000..32ecc9586
--- /dev/null
+++ b/src/common/obj_bencher.cc
@@ -0,0 +1,1440 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ * Series of functions to test your rados installation. Notice
+ * that this code is not terribly robust -- for instance, if you
+ * try and bench on a pool you don't have permission to access
+ * it will just loop forever.
+ */
+#include "include/compat.h"
+#include <pthread.h>
+#include "common/ceph_mutex.h"
+#include "common/Clock.h"
+#include "obj_bencher.h"
+
+using std::ostream;
+using std::cerr;
+using std::cout;
+using std::setfill;
+using std::setprecision;
+using std::setw;
+using std::string;
+using std::unique_lock;
+using std::unique_ptr;
+
+const std::string BENCH_LASTRUN_METADATA = "benchmark_last_metadata";  // metadata object name used by aio_bench when run_name is empty
+const std::string BENCH_PREFIX = "benchmark_data";  // leading component of every benchmark object name
+const std::string BENCH_OBJ_NAME = BENCH_PREFIX + "_%s_%d_object%d";  // printf format: hostname, pid, object number
+
+static char cached_hostname[30] = {0};  // filled lazily by the name generators below; empty means "not looked up yet"
+int cached_pid = 0;  // pid used in object names; 0 means "not chosen yet"
+
+// Build "<BENCH_PREFIX>_<hostname>", looking the hostname up on first use
+// and caching it in cached_hostname.
+static std::string generate_object_prefix_nopid() {
+  if (!cached_hostname[0]) {
+    gethostname(cached_hostname, sizeof(cached_hostname)-1);
+    cached_hostname[sizeof(cached_hostname)-1] = 0;
+  }
+
+  std::string prefix = BENCH_PREFIX;
+  prefix += "_";
+  prefix += cached_hostname;
+  return prefix;
+}
+
+// Build "<BENCH_PREFIX>_<hostname>_<pid>".  A non-zero pid replaces the
+// cached pid; otherwise the cached pid is used, falling back to getpid()
+// the first time.
+static std::string generate_object_prefix(int pid = 0) {
+  if (pid || !cached_pid) {
+    cached_pid = pid ? pid : getpid();
+  }
+
+  std::ostringstream prefix;
+  prefix << generate_object_prefix_nopid() << "_" << cached_pid;
+  return prefix.str();
+}
+
+// Format the name of benchmark object objnum with a single snprintf()
+// using BENCH_OBJ_NAME.  This is 8x faster than the previous
+// implementation, which was based on chained, deduped function calls.
+// Hostname and pid are cached in cached_hostname / cached_pid; a non-zero
+// pid replaces the cached pid.
+static std::string generate_object_name_fast(int objnum, int pid = 0)
+{
+  if (!cached_hostname[0]) {
+    gethostname(cached_hostname, sizeof(cached_hostname)-1);
+    cached_hostname[sizeof(cached_hostname)-1] = 0;
+  }
+
+  if (pid || !cached_pid) {
+    cached_pid = pid ? pid : getpid();
+  }
+
+  char buf[512];
+  const int len = snprintf(buf, sizeof(buf), BENCH_OBJ_NAME.c_str(),
+			   cached_hostname, cached_pid, objnum);
+  ceph_assert(len > 0 && len < (int)sizeof(buf));
+  return std::string(buf, (size_t)len);
+}
+// Overwrite the first length bytes of data->object_contents with 'z'.
+static void sanitize_object_contents (bench_data *data, size_t length) {
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(data->object_contents, 'z', length);
+}
+
+// Return os for further output; when show_time is set, first write the
+// local time of t followed by a space.
+ostream& ObjBencher::out(ostream& os, utime_t& t)
+{
+  if (!show_time)
+    return os;
+  return t.localtime(os) << " ";
+}
+
+// Same as out(os, t), using the current time.
+ostream& ObjBencher::out(ostream& os)
+{
+  utime_t now = ceph_clock_now();
+  return out(os, now);
+}
+
+// Thread entry point that reports progress roughly once per second until
+// data.done is set.  _bencher is the ObjBencher to report on.
+//
+// Without a formatter a table row is written to cout (with a header
+// every 20 rows); with a formatter one "data" object per interval is
+// dumped into a "datas" array and flushed to the bencher's outstream.
+// The running bandwidth/IOPS statistics in data.idata are updated along
+// the way.  bencher->lock is held except while sleeping between
+// intervals.  Always returns NULL.
+void *ObjBencher::status_printer(void *_bencher) {
+  ObjBencher *bencher = static_cast<ObjBencher *>(_bencher);
+  bench_data& data = bencher->data;
+  Formatter *formatter = bencher->formatter;
+  ostream *outstream = bencher->outstream;
+  // local condition variable: only used as an interruptible-by-nobody
+  // one second sleep that releases the lock while waiting
+  ceph::condition_variable cond;
+  int i = 0;
+  int previous_writes = 0;
+  int cycleSinceChange = 0;
+  double bandwidth;
+  int iops = 0;
+  mono_clock::duration ONE_SECOND = std::chrono::seconds(1);
+  std::unique_lock locker{bencher->lock};
+  if (formatter)
+    formatter->open_array_section("datas");
+  while(!data.done) {
+    mono_time cur_time = mono_clock::now();
+    utime_t t = ceph_clock_now();
+
+    if (i % 20 == 0 && !formatter) {
+      if (i > 0)
+        t.localtime(cout)
+          << " min lat: " << data.min_latency
+          << " max lat: " << data.max_latency
+          << " avg lat: " << data.avg_latency << std::endl;
+      //I'm naughty and don't reset the fill
+      bencher->out(cout, t) << setfill(' ')
+          << setw(5) << "sec"
+          << setw(8) << "Cur ops"
+          << setw(10) << "started"
+          << setw(10) << "finished"
+          << setw(10) << "avg MB/s"
+          << setw(10) << "cur MB/s"
+          << setw(12) << "last lat(s)"
+          << setw(12) << "avg lat(s)" << std::endl;
+    }
+    // bandwidth since the last interval in which ops completed; -1 marks
+    // "no sample" (first pass, or ops completed in the previous interval)
+    if (cycleSinceChange)
+      bandwidth = (double)(data.finished - previous_writes)
+        * (data.op_size)
+        / (1024*1024)
+        / cycleSinceChange;
+    else
+      bandwidth = -1;
+
+    if (!std::isnan(bandwidth) && bandwidth > -1) {
+      if (bandwidth > data.idata.max_bandwidth)
+        data.idata.max_bandwidth = bandwidth;
+      if (bandwidth < data.idata.min_bandwidth)
+        data.idata.min_bandwidth = bandwidth;
+
+      // running mean and sum of squared differences
+      ++data.idata.bandwidth_cycles;
+      double delta = bandwidth - data.idata.avg_bandwidth;
+      data.idata.avg_bandwidth += delta / data.idata.bandwidth_cycles;
+      data.idata.bandwidth_diff_sum += delta * (bandwidth - data.idata.avg_bandwidth);
+    }
+
+    if (cycleSinceChange)
+      iops = (double)(data.finished - previous_writes)
+        / cycleSinceChange;
+    else
+      iops = -1;
+
+    if (!std::isnan(iops) && iops > -1) {
+      if (iops > data.idata.max_iops)
+        data.idata.max_iops = iops;
+      if (iops < data.idata.min_iops)
+        data.idata.min_iops = iops;
+
+      ++data.idata.iops_cycles;
+      double delta = iops - data.idata.avg_iops;
+      data.idata.avg_iops += delta / data.idata.iops_cycles;
+      data.idata.iops_diff_sum += delta * (iops - data.idata.avg_iops);
+    }
+
+    if (formatter)
+      formatter->open_object_section("data");
+
+    // elapsed will be in seconds, by default
+    std::chrono::duration<double> elapsed = cur_time - data.start_time;
+    double avg_bandwidth = (double) (data.op_size) * (data.finished)
+      / elapsed.count() / (1024*1024);
+    if (previous_writes != data.finished) {
+      // ops completed during this interval
+      previous_writes = data.finished;
+      cycleSinceChange = 0;
+      if (!formatter) {
+        bencher->out(cout, t)
+	  << setfill(' ')
+          << setw(5) << i
+	  << ' ' << setw(7) << data.in_flight
+          << ' ' << setw(9) << data.started
+          << ' ' << setw(9) << data.finished
+          << ' ' << setw(9) << avg_bandwidth
+          << ' ' << setw(9) << bandwidth
+          << ' ' << setw(11) << (double)data.cur_latency.count()
+          << ' ' << setw(11) << data.avg_latency << std::endl;
+      } else {
+        formatter->dump_format("sec", "%d", i);
+        formatter->dump_format("cur_ops", "%d", data.in_flight);
+        formatter->dump_format("started", "%d", data.started);
+        formatter->dump_format("finished", "%d", data.finished);
+        formatter->dump_format("avg_bw", "%f", avg_bandwidth);
+        formatter->dump_format("cur_bw", "%f", bandwidth);
+        formatter->dump_format("last_lat", "%f", (double)data.cur_latency.count());
+        formatter->dump_format("avg_lat", "%f", data.avg_latency);
+      }
+    }
+    else {
+      // nothing completed: report zero current bandwidth / no latency
+      if (!formatter) {
+        bencher->out(cout, t)
+	  << setfill(' ')
+          << setw(5) << i
+	  << ' ' << setw(7) << data.in_flight
+          << ' ' << setw(9) << data.started
+          << ' ' << setw(9) << data.finished
+          << ' ' << setw(9) << avg_bandwidth
+	  << ' ' << setw(9) << '0'
+          << ' ' << setw(11) << '-'
+          << ' '<< setw(11) << data.avg_latency << std::endl;
+      } else {
+        formatter->dump_format("sec", "%d", i);
+        formatter->dump_format("cur_ops", "%d", data.in_flight);
+        formatter->dump_format("started", "%d", data.started);
+        formatter->dump_format("finished", "%d", data.finished);
+        formatter->dump_format("avg_bw", "%f", avg_bandwidth);
+        // "%f" consumes a double; passing the int literal 0 through the
+        // varargs is undefined behaviour
+        formatter->dump_format("cur_bw", "%f", 0.0);
+        formatter->dump_format("last_lat", "%f", 0.0);
+        formatter->dump_format("avg_lat", "%f", data.avg_latency);
+      }
+    }
+    if (formatter) {
+      formatter->close_section(); // data
+      formatter->flush(*outstream);
+    }
+    ++i;
+    ++cycleSinceChange;
+    cond.wait_for(locker, ONE_SECOND);
+  }
+  if (formatter)
+    formatter->close_section(); //datas
+  if (iops < 0) {
+    // the last pass had no IOPS sample: fall back to the overall average
+    std::chrono::duration<double> runtime = mono_clock::now() - data.start_time;
+    data.idata.min_iops = data.idata.max_iops = data.finished / runtime.count();
+  }
+  return NULL;
+}
+/**
+ * Run one benchmark selected by operation (write, sequential read or
+ * random read) and, for a write run with cleanup set, delete the
+ * benchmark objects and the metadata object afterwards.
+ *
+ * For reads, and for writes with reuse_bench, op_size and object_size are
+ * replaced by the values stored in the metadata object of a previous run.
+ * Returns -EINVAL if concurrentios <= 0; otherwise 0 or the first failing
+ * result of the steps it runs.
+ */
+int ObjBencher::aio_bench(
+  int operation, int secondsToRun,
+  int concurrentios,
+  uint64_t op_size, uint64_t object_size,
+  unsigned max_objects,
+  bool cleanup, bool hints,
+  const std::string& run_name, bool reuse_bench, bool no_verify) {
+
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  int num_ops = 0;
+  int num_objects = 0;
+  int r = 0;
+  int prev_pid = 0;  // stays 0 unless previous-run metadata is loaded below
+  std::chrono::duration<double> timePassed;
+
+  // default metadata object is used if user does not specify one
+  const std::string run_name_meta = (run_name.empty() ? BENCH_LASTRUN_METADATA : run_name);
+
+  //get data from previous write run, if available
+  if (operation != OP_WRITE || reuse_bench) {
+    uint64_t prev_op_size, prev_object_size;
+    r = fetch_bench_metadata(run_name_meta, &prev_op_size, &prev_object_size,
+			     &num_ops, &num_objects, &prev_pid);
+    if (r < 0) {
+      if (r == -ENOENT) {
+        if (reuse_bench)
+          cerr << "Must write data before using reuse_bench for a write benchmark!" << std::endl;
+        else
+          cerr << "Must write data before running a read benchmark!" << std::endl;
+      }
+      return r;  // nothing has been allocated yet, so no cleanup is needed
+    }
+    object_size = prev_object_size;   // the stored sizes override the arguments
+    op_size = prev_op_size;           
+  }
+
+  char* contentsChars = new char[op_size];  // released at the "out" label below
+  lock.lock();  // reset the shared bench_data under the bencher lock
+  data.done = false;
+  data.hints = hints;
+  data.object_size = object_size;
+  data.op_size = op_size;
+  data.in_flight = 0;
+  data.started = 0;
+  data.finished = 0;
+  data.min_latency = 9999.0; // this better be higher than initial latency!
+  data.max_latency = 0;
+  data.avg_latency = 0;
+  data.latency_diff_sum = 0;
+  data.object_contents = contentsChars;
+  lock.unlock();
+
+  //fill in contentsChars deterministically so we can check returns
+  sanitize_object_contents(&data, data.op_size);
+
+  if (formatter)
+    formatter->open_object_section("bench");
+
+  if (OP_WRITE == operation) {
+    r = write_bench(secondsToRun, concurrentios, run_name_meta, max_objects, prev_pid);
+    if (r != 0) goto out;
+  }
+  else if (OP_SEQ_READ == operation) {
+    r = seq_read_bench(secondsToRun, num_ops, num_objects, concurrentios, prev_pid, no_verify);
+    if (r != 0) goto out;
+  }
+  else if (OP_RAND_READ == operation) {
+    r = rand_read_bench(secondsToRun, num_ops, num_objects, concurrentios, prev_pid, no_verify);
+    if (r != 0) goto out;
+  }
+
+  if (OP_WRITE == operation && cleanup) {
+    r = fetch_bench_metadata(run_name_meta, &op_size, &object_size,
+                            &num_ops, &num_objects, &prev_pid);  // re-read the metadata of the run that just finished
+    if (r < 0) {
+      if (r == -ENOENT)
+        cerr << "Should never happen: bench metadata missing for current run!" << std::endl;
+      goto out;
+    }
+
+    data.start_time = mono_clock::now();  // start_time is reused to time the cleanup
+    out(cout) << "Cleaning up (deleting benchmark objects)" << std::endl;
+
+    r = clean_up(num_objects, prev_pid, concurrentios);
+    if (r != 0) goto out;
+
+    timePassed = mono_clock::now() - data.start_time;
+    out(cout) << "Clean up completed and total clean up time :" << timePassed.count() << std::endl;
+
+    // lastrun file
+    r = sync_remove(run_name_meta);
+    if (r != 0) goto out;
+  }
+
+ out:  // common exit once contentsChars exists: close the formatter section and free the buffer
+  if (formatter) {
+    formatter->close_section(); // bench
+    formatter->flush(*outstream);
+    *outstream << std::endl;
+  }
+  delete[] contentsChars;  // NOTE(review): data.object_contents still holds this pointer afterwards
+  return r;
+}
+// Pairs a pointer to an externally owned mutex with a condition variable; _aio_cb() takes the mutex and notifies cond.
+struct lock_cond {
+  explicit lock_cond(ceph::mutex *_lock) : lock(_lock) {}
+  ceph::mutex *lock;  // not owned
+  ceph::condition_variable cond;
+};
+
+// Completion callback: arg points to a lock_cond; wake every waiter on
+// its condition variable while holding its mutex.  cb is unused.
+void _aio_cb(void *cb, void *arg) {
+  auto *lc = static_cast<lock_cond *>(arg);
+  std::unique_lock guard{*lc->lock};
+  lc->cond.notify_all();
+}
+
+// Read the metadata object metadata_file written by a previous run and
+// decode object size, op count and pid from it, plus the op size when
+// present (otherwise the op size defaults to the object size).
+// *num_objects is derived as the op count divided by the ops per object,
+// rounded up.
+//
+// Returns 0 on success, -ENOENT if the object is empty, or the negative
+// result of sync_read().
+int ObjBencher::fetch_bench_metadata(const std::string& metadata_file,
+				     uint64_t *op_size, uint64_t* object_size,
+				     int* num_ops, int* num_objects, int* prevPid) {
+  bufferlist object_data;
+  const int r = sync_read(metadata_file, object_data,
+			  sizeof(int) * 2 + sizeof(size_t) * 2);
+  if (r == 0) {
+    // treat an empty file as a file that does not exist
+    return -ENOENT;
+  }
+  if (r < 0) {
+    return r;
+  }
+
+  auto it = object_data.cbegin();
+  decode(*object_size, it);
+  decode(*num_ops, it);
+  decode(*prevPid, it);
+  if (it.end()) {
+    // no op size stored: one op per object
+    *op_size = *object_size;
+  } else {
+    decode(*op_size, it);
+  }
+
+  // make sure *op_size value is reasonable before dividing by it
+  const bool several_ops = *op_size > 0 && *object_size > *op_size;
+  const unsigned ops_per_object = several_ops ? *object_size / *op_size : 1;
+  *num_objects = (*num_ops + ops_per_object - 1) / ops_per_object;
+
+  return 0;
+}
+
+int ObjBencher::write_bench(int secondsToRun,
+			    int concurrentios, const string& run_name_meta,
+			    unsigned max_objects, int prev_pid) {
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  if (!formatter) {
+    out(cout) << "Maintaining " << concurrentios << " concurrent writes of "
+	      << data.op_size << " bytes to objects of size "
+	      << data.object_size << " for up to "
+	      << secondsToRun << " seconds or "
+	      << max_objects << " objects"
+	      << std::endl;
+  } else {
+    formatter->dump_format("concurrent_ios", "%d", concurrentios);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("op_size", "%d", data.op_size);
+    formatter->dump_format("seconds_to_run", "%d", secondsToRun);
+    formatter->dump_format("max_objects", "%d", max_objects);
+  }
+  bufferlist* newContents = 0;
+
+  std::string prefix = prev_pid ? generate_object_prefix(prev_pid) : generate_object_prefix();
+  if (!formatter)
+    out(cout) << "Object prefix: " << prefix << std::endl;
+  else
+    formatter->dump_string("object_prefix", prefix);
+
+  std::vector<string> name(concurrentios);
+  std::string newName;
+  unique_ptr<bufferlist> contents[concurrentios];
+  int r = 0;
+  bufferlist b_write;
+  lock_cond lc(&lock);
+  double total_latency = 0;
+  std::vector<mono_time> start_times(concurrentios);
+  mono_time stopTime;
+  std::chrono::duration<double> timePassed;
+
+  unsigned writes_per_object = 1;
+  if (data.op_size)
+    writes_per_object = data.object_size / data.op_size;
+
+  r = completions_init(concurrentios);
+
+  //set up writes so I can start them together
+  for (int i = 0; i<concurrentios; ++i) {
+    name[i] = generate_object_name_fast(i / writes_per_object);
+    contents[i] = std::make_unique<bufferlist>();
+    snprintf(data.object_contents, data.op_size, "I'm the %16dth op!", i);
+    contents[i]->append(data.object_contents, data.op_size);
+  }
+
+  pthread_t print_thread;
+
+  pthread_create(&print_thread, NULL, ObjBencher::status_printer, (void *)this);
+  ceph_pthread_setname(print_thread, "write_stat");
+  std::unique_lock locker{lock};
+  data.finished = 0;
+  data.start_time = mono_clock::now();
+  locker.unlock();
+  for (int i = 0; i<concurrentios; ++i) {
+    start_times[i] = mono_clock::now();
+    r = create_completion(i, _aio_cb, (void *)&lc);
+    if (r < 0)
+      goto ERR;
+    r = aio_write(name[i], i, *contents[i], data.op_size,
+		  data.op_size * (i % writes_per_object));
+    if (r < 0) {
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+  }
+
+  //keep on adding new writes as old ones complete until we've passed minimum time
+  int slot;
+
+  //don't need locking for reads because other thread doesn't write
+
+  stopTime = data.start_time + std::chrono::seconds(secondsToRun);
+  slot = 0;
+  locker.lock();
+  while (data.finished < data.started) {
+    bool found = false;
+    while (1) {
+      int old_slot = slot;
+      do {
+        if (completion_is_done(slot)) {
+            found = true;
+            break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found)
+        break;
+      lc.cond.wait(locker);
+    }
+    locker.unlock();
+
+    completion_wait(slot);
+    locker.lock();
+    r = completion_ret(slot);
+    if (r != 0) {
+      locker.unlock();
+      goto ERR;
+    }
+    data.cur_latency = mono_clock::now() - start_times[slot];
+    total_latency += data.cur_latency.count();
+    if( data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    double delta = data.cur_latency.count() - data.avg_latency;
+    data.avg_latency = total_latency / data.finished;
+    data.latency_diff_sum += delta * (data.cur_latency.count() - data.avg_latency);
+    --data.in_flight;
+    locker.unlock();
+    release_completion(slot);
+
+    if (!secondsToRun || mono_clock::now() >= stopTime) {
+      locker.lock();
+      continue;
+    }
+
+    if (data.op_size && max_objects &&
+        data.started >=
+            (int)((data.object_size * max_objects + data.op_size - 1) /
+                  data.op_size)) {
+      locker.lock();
+      continue;
+    }
+
+    //write new stuff to backend
+
+    //create new contents and name on the heap, and fill them
+    newName = generate_object_name_fast(data.started / writes_per_object);
+    newContents = contents[slot].get();
+    snprintf(newContents->c_str(), data.op_size, "I'm the %16dth op!", data.started);
+    // we wrote to buffer, going around internal crc cache, so invalidate it now.
+    newContents->invalidate_crc();
+
+    start_times[slot] = mono_clock::now();
+    r = create_completion(slot, _aio_cb, &lc);
+    if (r < 0)
+      goto ERR;
+    r = aio_write(newName, slot, *newContents, data.op_size,
+		  data.op_size * (data.started % writes_per_object));
+    if (r < 0) {
+      goto ERR;
+    }
+    name[slot] = newName;
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+  }
+  locker.unlock();
+
+  timePassed = mono_clock::now() - data.start_time;
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+
+  pthread_join(print_thread, NULL);
+
+  double bandwidth;
+  bandwidth = ((double)data.finished)*((double)data.op_size) /
+       timePassed.count();
+  bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
+
+  double bandwidth_stddev;
+  double iops_stddev;
+  double latency_stddev;
+  if (data.idata.bandwidth_cycles > 1) {
+    bandwidth_stddev = std::sqrt(data.idata.bandwidth_diff_sum / (data.idata.bandwidth_cycles - 1));
+  } else {
+    bandwidth_stddev = 0;
+  }
+  if (data.idata.iops_cycles > 1) {
+    iops_stddev = std::sqrt(data.idata.iops_diff_sum / (data.idata.iops_cycles - 1));
+  } else {
+    iops_stddev = 0;
+  }
+  if (data.finished > 1) {
+    latency_stddev = std::sqrt(data.latency_diff_sum / (data.finished - 1));
+  } else {
+    latency_stddev = 0;
+  }
+
+  if (!formatter) {
+    out(cout) << "Total time run:         " << timePassed.count() << std::endl
+       << "Total writes made:      " << data.finished << std::endl
+       << "Write size:             " << data.op_size << std::endl
+       << "Object size:            " << data.object_size << std::endl
+       << "Bandwidth (MB/sec):     " << setprecision(6) << bandwidth << std::endl
+       << "Stddev Bandwidth:       " << bandwidth_stddev << std::endl
+       << "Max bandwidth (MB/sec): " << data.idata.max_bandwidth << std::endl
+       << "Min bandwidth (MB/sec): " << data.idata.min_bandwidth << std::endl
+       << "Average IOPS:           " << (int)(data.finished/timePassed.count()) << std::endl
+       << "Stddev IOPS:            " << iops_stddev << std::endl
+       << "Max IOPS:               " << data.idata.max_iops << std::endl
+       << "Min IOPS:               " << data.idata.min_iops << std::endl
+       << "Average Latency(s):     " << data.avg_latency << std::endl
+       << "Stddev Latency(s):      " << latency_stddev << std::endl
+       << "Max latency(s):         " << data.max_latency << std::endl
+       << "Min latency(s):         " << data.min_latency << std::endl;
+  } else {
+    formatter->dump_format("total_time_run", "%f", timePassed.count());
+    formatter->dump_format("total_writes_made", "%d", data.finished);
+    formatter->dump_format("write_size", "%d", data.op_size);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("stddev_bandwidth", "%f", bandwidth_stddev);
+    formatter->dump_format("max_bandwidth", "%f", data.idata.max_bandwidth);
+    formatter->dump_format("min_bandwidth", "%f", data.idata.min_bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/timePassed.count()));
+    formatter->dump_format("stddev_iops", "%d", iops_stddev);
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("stddev_latency", "%f", latency_stddev);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
+  //write object size/number data for read benchmarks
+  encode(data.object_size, b_write);
+  encode(data.finished, b_write);
+  encode(prev_pid ? prev_pid : getpid(),  b_write);
+  encode(data.op_size, b_write);
+
+  // persist meta-data for further cleanup or read
+  sync_write(run_name_meta, b_write, sizeof(int)*3);
+
+  completions_done();
+
+  return 0;
+
+ ERR:
+  locker.lock();
+  data.done = 1;
+  locker.unlock();
+  pthread_join(print_thread, NULL);
+  return r;
+}
+
+int ObjBencher::seq_read_bench( // sequential-read benchmark; objects are named via generate_object_name_fast(n, pid)
+  int seconds_to_run, int num_ops, int num_objects, // time budget in seconds; cap on reads issued; num_objects is unused here
+  int concurrentios, int pid, bool no_verify) { // number of completion slots; pid for object names; true skips the content check
+  // Returns -EINVAL if concurrentios <= 0, a failed step's negative code, -EIO on any content mismatch, else 0.
+  lock_cond lc(&lock); // handed to each completion as the _aio_cb argument; the reap loop waits on lc.cond
+
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  std::vector<string> name(concurrentios); // object name being read in each slot
+  std::string newName;
+  std::vector<std::unique_ptr<bufferlist>> contents(concurrentios); // per-slot read buffer; a vector, since VLAs are not standard C++
+  std::vector<int> index(concurrentios); // op number read in each slot, needed to rebuild the expected contents
+  int errors = 0;
+  double total_latency = 0;
+  int r = 0;
+  std::vector<mono_time> start_times(concurrentios);
+  mono_clock::duration time_to_run = std::chrono::seconds(seconds_to_run);
+  std::chrono::duration<double> timePassed;
+  sanitize_object_contents(&data, data.op_size); //clean it up once; subsequent
+  //changes will be safe because string length should remain the same
+
+  unsigned reads_per_object = 1;
+  if (data.op_size && data.object_size >= data.op_size) // stay at 1 rather than 0: this value is a divisor below
+    reads_per_object = data.object_size / data.op_size;
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r;
+
+  //set up initial reads
+  for (int i = 0; i < concurrentios; ++i) {
+    name[i] = generate_object_name_fast(i / reads_per_object, pid);
+    contents[i] = std::make_unique<bufferlist>();
+  }
+
+  std::unique_lock locker{lock};
+  data.finished = 0;
+  data.start_time = mono_clock::now();
+  locker.unlock();
+
+  pthread_t print_thread;
+  pthread_create(&print_thread, NULL, status_printer, (void *)this);
+  ceph_pthread_setname(print_thread, "seq_read_stat");
+
+  mono_time finish_time = data.start_time + time_to_run;
+  //start initial reads
+  for (int i = 0; i < concurrentios; ++i) {
+    index[i] = i;
+    start_times[i] = mono_clock::now();
+    r = create_completion(i, _aio_cb, (void *)&lc);
+    if (r >= 0) // never hand a slot whose completion could not be created to aio_read
+      r = aio_read(name[i], i, contents[i].get(), data.op_size, data.op_size * (i % reads_per_object));
+    if (r < 0) {
+      cerr << "r = " << r << std::endl;
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+  }
+
+  //keep on adding new reads as old ones complete
+  int slot;
+  bufferlist *cur_contents;
+
+  slot = 0;
+  while (data.finished < data.started) {
+    locker.lock();
+    int old_slot = slot;
+    bool found = false;
+    while (1) { // scan the slots round-robin from the last one used; wait on lc.cond when none is done
+      do {
+        if (completion_is_done(slot)) {
+          found = true;
+          break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found) {
+        break;
+      }
+      lc.cond.wait(locker);
+    }
+
+    // calculate latency here, so memcmp doesn't inflate it
+    data.cur_latency = mono_clock::now() - start_times[slot];
+
+    cur_contents = contents[slot].get();
+    int current_index = index[slot];
+
+    // invalidate internal crc cache
+    cur_contents->invalidate_crc();
+
+    if (!no_verify) { // rebuild the text expected for this op number and compare it with what was read
+      snprintf(data.object_contents, data.op_size, "I'm the %16dth op!", current_index);
+      if ( (cur_contents->length() != data.op_size) ||
+           (memcmp(data.object_contents, cur_contents->c_str(), data.op_size) != 0) ) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    }
+
+    bool start_new_read = (seconds_to_run && mono_clock::now() < finish_time) &&
+                          num_ops > data.started;
+    if (start_new_read) {
+      newName = generate_object_name_fast(data.started / reads_per_object, pid);
+      index[slot] = data.started;
+    }
+
+    locker.unlock();
+    completion_wait(slot);
+    locker.lock();
+    r = completion_ret(slot);
+    if (r < 0) {
+      cerr << "read got " << r << std::endl;
+      locker.unlock();
+      goto ERR;
+    }
+    total_latency += data.cur_latency.count();
+    if (data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    data.avg_latency = total_latency / data.finished;
+    --data.in_flight;
+    locker.unlock();
+    release_completion(slot);
+
+    if (!start_new_read)
+      continue;
+
+    //start the next read in the slot that was just released
+    start_times[slot] = mono_clock::now();
+    r = create_completion(slot, _aio_cb, (void *)&lc);
+    if (r >= 0) // same guard as for the initial reads
+      r = aio_read(newName, slot, contents[slot].get(), data.op_size, data.op_size * (data.started % reads_per_object));
+    if (r < 0) {
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+    name[slot] = newName;
+  }
+
+  timePassed = mono_clock::now() - data.start_time;
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+
+  pthread_join(print_thread, NULL);
+
+  double bandwidth;
+  bandwidth = ((double)data.finished)*((double)data.op_size)/timePassed.count();
+  bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
+
+  double iops_stddev;
+  if (data.idata.iops_cycles > 1) {
+    iops_stddev = std::sqrt(data.idata.iops_diff_sum / (data.idata.iops_cycles - 1));
+  } else {
+    iops_stddev = 0;
+  }
+
+  if (!formatter) {
+    out(cout) << "Total time run:       " << timePassed.count() << std::endl
+       << "Total reads made:     " << data.finished << std::endl
+       << "Read size:            " << data.op_size << std::endl
+       << "Object size:          " << data.object_size << std::endl
+       << "Bandwidth (MB/sec):   " << setprecision(6) << bandwidth << std::endl
+       << "Average IOPS:         " << (int)(data.finished/timePassed.count()) << std::endl
+       << "Stddev IOPS:          " << iops_stddev << std::endl
+       << "Max IOPS:             " << data.idata.max_iops << std::endl
+       << "Min IOPS:             " << data.idata.min_iops << std::endl
+       << "Average Latency(s):   " << data.avg_latency << std::endl
+       << "Max latency(s):       " << data.max_latency << std::endl
+       << "Min latency(s):       " << data.min_latency << std::endl;
+  } else {
+    formatter->dump_format("total_time_run", "%f", timePassed.count());
+    formatter->dump_format("total_reads_made", "%d", data.finished);
+    formatter->dump_format("read_size", "%d", data.op_size);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/timePassed.count()));
+    formatter->dump_format("stddev_iops", "%f", iops_stddev);
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
+
+  completions_done();
+
+  return (errors > 0 ? -EIO : 0);
+
+ ERR: // error exit: flag the run as done so the status printer can be joined, then report r
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+  pthread_join(print_thread, NULL);
+  return r;
+}
+
+int ObjBencher::rand_read_bench( // random-read benchmark; objects are named via generate_object_name_fast(n, pid)
+  int seconds_to_run, int num_ops, int num_objects, // time budget in seconds; random op ids are taken modulo num_ops; num_objects is unused here
+  int concurrentios, int pid, bool no_verify) { // number of completion slots; pid for object names; true skips the content check
+  // Returns -EINVAL if concurrentios <= 0, a failed step's negative code, -EIO on any content mismatch, else 0.
+  lock_cond lc(&lock); // handed to each completion as the _aio_cb argument; the reap loop waits on lc.cond
+
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  std::vector<string> name(concurrentios); // object name being read in each slot
+  std::string newName;
+  std::vector<std::unique_ptr<bufferlist>> contents(concurrentios); // per-slot read buffer; a vector, since VLAs are not standard C++
+  std::vector<int> index(concurrentios); // op number read in each slot, needed to rebuild the expected contents
+  int errors = 0;
+  int r = 0;
+  double total_latency = 0;
+  std::vector<mono_time> start_times(concurrentios);
+  mono_clock::duration time_to_run = std::chrono::seconds(seconds_to_run);
+  std::chrono::duration<double> timePassed;
+  sanitize_object_contents(&data, data.op_size); //clean it up once; subsequent
+  //changes will be safe because string length should remain the same
+
+  unsigned reads_per_object = 1;
+  if (data.op_size && data.object_size >= data.op_size) // stay at 1 rather than 0: this value is a divisor below
+    reads_per_object = data.object_size / data.op_size;
+
+  srand (time(NULL));
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r;
+
+  //set up initial reads
+  for (int i = 0; i < concurrentios; ++i) {
+    name[i] = generate_object_name_fast(i / reads_per_object, pid);
+    contents[i] = std::make_unique<bufferlist>();
+  }
+
+  unique_lock locker{lock};
+  data.finished = 0;
+  data.start_time = mono_clock::now();
+  locker.unlock();
+
+  pthread_t print_thread;
+  pthread_create(&print_thread, NULL, status_printer, (void *)this);
+  ceph_pthread_setname(print_thread, "rand_read_stat");
+
+  mono_time finish_time = data.start_time + time_to_run;
+  //start initial reads
+  for (int i = 0; i < concurrentios; ++i) {
+    index[i] = i;
+    start_times[i] = mono_clock::now();
+    r = create_completion(i, _aio_cb, (void *)&lc);
+    if (r >= 0) // never hand a slot whose completion could not be created to aio_read
+      r = aio_read(name[i], i, contents[i].get(), data.op_size, data.op_size * (i % reads_per_object));
+    if (r < 0) {
+      cerr << "r = " << r << std::endl;
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+  }
+
+  //keep on adding new reads as old ones complete
+  int slot;
+  bufferlist *cur_contents;
+  int rand_id;
+
+  slot = 0;
+  while (data.finished < data.started) {
+    locker.lock();
+    int old_slot = slot;
+    bool found = false;
+    while (1) { // scan the slots round-robin from the last one used; wait on lc.cond when none is done
+      do {
+        if (completion_is_done(slot)) {
+          found = true;
+          break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found) {
+        break;
+      }
+      lc.cond.wait(locker);
+    }
+
+    // calculate latency here, so memcmp doesn't inflate it
+    data.cur_latency = mono_clock::now() - start_times[slot];
+
+    locker.unlock();
+
+    int current_index = index[slot];
+    cur_contents = contents[slot].get();
+    completion_wait(slot);
+    locker.lock();
+    r = completion_ret(slot);
+    if (r < 0) {
+      cerr << "read got " << r << std::endl;
+      locker.unlock();
+      goto ERR;
+    }
+
+    total_latency += data.cur_latency.count();
+    if (data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    data.avg_latency = total_latency / data.finished;
+    --data.in_flight;
+
+    if (!no_verify) { // rebuild the text expected for this op number and compare it with what was read
+      snprintf(data.object_contents, data.op_size, "I'm the %16dth op!", current_index);
+      if ((cur_contents->length() != data.op_size) ||
+          (memcmp(data.object_contents, cur_contents->c_str(), data.op_size) != 0)) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    }
+
+    locker.unlock();
+    release_completion(slot);
+
+    if (!seconds_to_run || mono_clock::now() >= finish_time)
+      continue;
+
+    //start a new read of a randomly chosen op in the slot that was just released
+
+    rand_id = num_ops ? rand() % num_ops : 0; // a zero num_ops would otherwise be a division by zero
+    newName = generate_object_name_fast(rand_id / reads_per_object, pid);
+    index[slot] = rand_id;
+
+    // invalidate internal crc cache
+    cur_contents->invalidate_crc();
+
+    start_times[slot] = mono_clock::now();
+    r = create_completion(slot, _aio_cb, (void *)&lc);
+    if (r >= 0) // same guard as for the initial reads
+      r = aio_read(newName, slot, contents[slot].get(), data.op_size, data.op_size * (rand_id % reads_per_object));
+    if (r < 0) {
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+    name[slot] = newName;
+  }
+
+  timePassed = mono_clock::now() - data.start_time;
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+
+  pthread_join(print_thread, NULL);
+
+  double bandwidth;
+  bandwidth = ((double)data.finished)*((double)data.op_size)/timePassed.count();
+  bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
+
+  double iops_stddev;
+  if (data.idata.iops_cycles > 1) {
+    iops_stddev = std::sqrt(data.idata.iops_diff_sum / (data.idata.iops_cycles - 1));
+  } else {
+    iops_stddev = 0;
+  }
+
+  if (!formatter) {
+    out(cout) << "Total time run:       " << timePassed.count() << std::endl
+       << "Total reads made:     " << data.finished << std::endl
+       << "Read size:            " << data.op_size << std::endl
+       << "Object size:          " << data.object_size << std::endl
+       << "Bandwidth (MB/sec):   " << setprecision(6) << bandwidth << std::endl
+       << "Average IOPS:         " << (int)(data.finished/timePassed.count()) << std::endl
+       << "Stddev IOPS:          " << iops_stddev << std::endl
+       << "Max IOPS:             " << data.idata.max_iops << std::endl
+       << "Min IOPS:             " << data.idata.min_iops << std::endl
+       << "Average Latency(s):   " << data.avg_latency << std::endl
+       << "Max latency(s):       " << data.max_latency << std::endl
+       << "Min latency(s):       " << data.min_latency << std::endl;
+  } else {
+    formatter->dump_format("total_time_run", "%f", timePassed.count());
+    formatter->dump_format("total_reads_made", "%d", data.finished);
+    formatter->dump_format("read_size", "%d", data.op_size);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/timePassed.count()));
+    formatter->dump_format("stddev_iops", "%f", iops_stddev);
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
+  completions_done();
+
+  return (errors > 0 ? -EIO : 0);
+
+ ERR: // error exit: flag the run as done so the status printer can be joined, then report r
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+  pthread_join(print_thread, NULL);
+  return r;
+}
+
+int ObjBencher::clean_up(const std::string& orig_prefix, int concurrentios, const std::string& run_name) {
+  // Cleans every namespace that holds objects matching the prefix: through the
+  // run's metadata object where one exists, through the slow prefix scan otherwise.
+  // Yields -EINVAL for a prefix outside BENCH_PREFIX, a failing step's code, or 0.
+
+  // fall back to the default metadata object / generated prefix when none is given
+  const std::string run_name_meta = run_name.empty() ? BENCH_LASTRUN_METADATA : run_name;
+  const std::string prefix = orig_prefix.empty() ? generate_object_prefix_nopid() : orig_prefix;
+
+  const bool prefix_ok = prefix.compare(0, BENCH_PREFIX.length(), BENCH_PREFIX) == 0;
+  if (!prefix_ok) {
+    cerr << "Specified --prefix invalid, it must begin with \"" << BENCH_PREFIX << "\"" << std::endl;
+    return -EINVAL;
+  }
+
+  std::list<Object> listing;
+  std::set<std::string> with_metadata, to_clean;
+
+  // One pass over the listing (which may span several namespaces): remember which
+  // namespaces contain the metadata object and which contain benchmark objects.
+  while (get_objects(&listing, 20)) {
+    for (const Object& obj : listing) {
+      const std::string& oid = obj.first;
+      const std::string& nspace = obj.second;
+      if (oid == run_name_meta) {
+        with_metadata.insert(nspace);
+      }
+      if (oid.compare(0, prefix.length(), prefix) == 0) {
+        to_clean.insert(nspace);
+      }
+    }
+  }
+
+  for (const std::string& nspace : to_clean) {
+    set_namespace(nspace);
+
+    // no metadata object in this namespace: fall back to the linear prefix search
+    if (with_metadata.count(nspace) == 0) {
+      const int slow_r = clean_up_slow(prefix, concurrentios);
+      if (slow_r < 0) {
+        cerr << "clean_up_slow error r= " << slow_r << std::endl;
+        return slow_r;
+      }
+      continue;
+    }
+
+    // metadata object present: it supplies the object count and pid for the fast clean-up
+    uint64_t op_size, object_size;
+    int num_ops, num_objects;
+    int prevPid;
+    int r = fetch_bench_metadata(run_name_meta, &op_size, &object_size, &num_ops, &num_objects, &prevPid);
+    if (r < 0) {
+      return r;
+    }
+
+    r = clean_up(num_objects, prevPid, concurrentios);
+    if (r != 0) return r;
+
+    r = sync_remove(run_name_meta);
+    if (r != 0) return r;
+  }
+
+  return 0;
+}
+
+int ObjBencher::clean_up(int num_objects, int prevPid, int concurrentios) { // removes objects 0..num_objects-1, named via generate_object_name_fast(n, prevPid)
+  lock_cond lc(&lock); // handed to each completion as the _aio_cb argument; the reap loop waits on lc.cond
+  // Returns -EINVAL if concurrentios <= 0, 0 when there is nothing to remove, the code of a failed step, else 0.
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  std::vector<string> name(concurrentios);
+  std::string newName;
+  int r = 0;
+  int slot = 0;
+
+  unique_lock locker{lock};
+  data.done = false;
+  data.in_flight = 0;
+  data.started = 0;
+  data.finished = 0;
+  locker.unlock();
+
+  // don't start more completions than files
+  if (num_objects <= 0) { // also covers a negative count, which must not become the slot count below
+    return 0;
+  } else if (num_objects < concurrentios) {
+    concurrentios = num_objects;
+  }
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r;
+
+  //set up initial removes
+  for (int i = 0; i < concurrentios; ++i) {
+    name[i] = generate_object_name_fast(i, prevPid);
+  }
+
+  //start initial removes
+  for (int i = 0; i < concurrentios; ++i) {
+    r = create_completion(i, _aio_cb, (void *)&lc);
+    r = (r < 0) ? r : aio_remove(name[i], i); // no remove without a completion
+    if (r < 0) { //naughty, doesn't clean up heap
+      cerr << "r = " << r << std::endl;
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+  }
+
+  //keep on adding new removes as old ones complete
+  while (data.finished < data.started) {
+    locker.lock();
+    int old_slot = slot;
+    bool found = false;
+    while (1) { // scan the slots round-robin from the last one used; wait on lc.cond when none is done
+      do {
+        if (completion_is_done(slot)) {
+          found = true;
+          break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found) {
+        break;
+      }
+      lc.cond.wait(locker);
+    }
+    locker.unlock();
+    completion_wait(slot);
+    locker.lock();
+    r = completion_ret(slot);
+    if (r != 0 && r != -ENOENT) { // file does not exist
+      cerr << "remove got " << r << std::endl;
+      locker.unlock();
+      goto ERR;
+    }
+    ++data.finished;
+    --data.in_flight;
+    locker.unlock();
+    release_completion(slot);
+
+    if (data.started >= num_objects)
+      continue;
+
+    //start the next remove in the slot that was just released
+    newName = generate_object_name_fast(data.started, prevPid);
+    r = create_completion(slot, _aio_cb, (void *)&lc);
+    r = (r < 0) ? r : aio_remove(newName, slot); // no remove without a completion
+    if (r < 0) {
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+    name[slot] = newName;
+  }
+
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+
+  completions_done();
+
+  out(cout) << "Removed " << data.finished << " object" << (data.finished != 1 ? "s" : "") << std::endl;
+
+  return 0;
+
+ ERR: // error exit: flag the run as done and report the failing code
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+  return r;
+}
+
+/**
+ * Fetch the next batch of stored objects whose name starts with a prefix.
+ *
+ * The output list is emptied first; listing batches are then pulled via
+ * get_objects() until one of them yields a match or the listing is
+ * exhausted, so a true return always comes with a non-empty list.
+ *
+ * @param prefix name prefix an object must start with to be kept
+ * @param objects [out] receives the matching objects
+ * @returns true if matches were stored in objects, false once
+ * get_objects() reports that nothing is left
+ */
+bool ObjBencher::more_objects_matching_prefix(const std::string& prefix, std::list<Object>* objects) {
+  std::list<Object> batch;
+
+  objects->clear();
+
+  do {
+    if (!get_objects(&batch, 20)) {
+      return false;
+    }
+
+    for (const Object& candidate : batch) {
+      // compare() checks the leading characters in place, without a temporary substring
+      if (candidate.first.compare(0, prefix.length(), prefix) == 0) {
+        objects->push_back(candidate);
+      }
+    }
+  } while (objects->empty());
+
+  return true;
+}
+
+int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) { // removes every listed object whose name starts with prefix
+  lock_cond lc(&lock); // handed to each completion as the _aio_cb argument; the reap loop waits on lc.cond
+  // Returns -EINVAL if concurrentios <= 0, a negative error code if any step fails, else 0.
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  std::vector<Object> name(concurrentios); // <oid, namespace> being removed in each slot
+  Object newName;
+  int r = 0;
+  int slot = 0;
+  std::list<Object> objects; // matches fetched but not yet issued
+  bool objects_remain = true;
+
+  std::unique_lock locker{lock};
+  data.done = false;
+  data.in_flight = 0;
+  data.started = 0;
+  data.finished = 0;
+  locker.unlock();
+
+  out(cout) << "Warning: using slow linear search" << std::endl;
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r;
+
+  //set up initial removes
+  for (int i = 0; i < concurrentios; ++i) {
+    if (objects.empty()) {
+      // if there are fewer objects than concurrent ios, don't generate extras
+      bool objects_found = more_objects_matching_prefix(prefix, &objects);
+      if (!objects_found) {
+        concurrentios = i;
+        objects_remain = false;
+        break;
+      }
+    }
+
+    name[i] = objects.front();
+    objects.pop_front();
+  }
+
+  //start initial removes
+  for (int i = 0; i < concurrentios; ++i) {
+    r = create_completion(i, _aio_cb, (void *)&lc);
+    set_namespace(name[i].second);
+    r = (r < 0) ? r : aio_remove(name[i].first, i); // no remove without a completion
+    if (r < 0) { //naughty, doesn't clean up heap
+      cerr << "r = " << r << std::endl;
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+  }
+
+  //keep on adding new removes as old ones complete
+  while (objects_remain) {
+    locker.lock();
+    int old_slot = slot;
+    bool found = false;
+    while (1) { // scan the slots round-robin from the last one used; wait on lc.cond when none is done
+      do {
+        if (completion_is_done(slot)) {
+          found = true;
+          break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found) {
+        break;
+      }
+      lc.cond.wait(locker);
+    }
+    locker.unlock();
+
+    // get more objects if necessary
+    if (objects.empty()) {
+      objects_remain = more_objects_matching_prefix(prefix, &objects);
+      // quit if there are no more
+      if (!objects_remain) {
+        break;
+      }
+    }
+
+    // get the next object
+    newName = objects.front();
+    objects.pop_front();
+
+    completion_wait(slot);
+    locker.lock();
+    r = completion_ret(slot);
+    if (r != 0 && r != -ENOENT) { // file does not exist
+      cerr << "remove got " << r << std::endl;
+      locker.unlock();
+      goto ERR;
+    }
+    ++data.finished;
+    --data.in_flight;
+    locker.unlock();
+    release_completion(slot);
+
+    //start the next remove in the slot that was just released
+    r = create_completion(slot, _aio_cb, (void *)&lc);
+    set_namespace(newName.second);
+    r = (r < 0) ? r : aio_remove(newName.first, slot); // no remove without a completion
+    if (r < 0) {
+      goto ERR;
+    }
+    locker.lock();
+    ++data.started;
+    ++data.in_flight;
+    locker.unlock();
+    name[slot] = newName;
+  }
+
+  //wait for final removes to complete
+  while (data.finished < data.started) {
+    slot = data.finished % concurrentios;
+    completion_wait(slot);
+    locker.lock();
+    r = completion_ret(slot);
+    if (r != 0 && r != -ENOENT) { // file does not exist
+      cerr << "remove got " << r << std::endl;
+      locker.unlock();
+      goto ERR;
+    }
+    ++data.finished;
+    --data.in_flight;
+    release_completion(slot);
+    locker.unlock();
+  }
+
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+
+  completions_done();
+
+  out(cout) << "Removed " << data.finished << " object" << (data.finished != 1 ? "s" : "") << std::endl;
+
+  return 0;
+
+ ERR: // error exit: flag the run as done and report the failure
+  locker.lock();
+  data.done = true;
+  locker.unlock();
+  return (r < 0) ? r : -EIO; // pass the real error code on; -EIO only if r is not a negative code
+}
diff --git a/src/common/obj_bencher.h b/src/common/obj_bencher.h
new file mode 100644
index 000000000..96589db27
--- /dev/null
+++ b/src/common/obj_bencher.h
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJ_BENCHER_H
+#define CEPH_OBJ_BENCHER_H
+
+#include "common/ceph_context.h"
+#include "common/Formatter.h"
+#include "ceph_time.h"
+#include <cfloat>
+
+using ceph::mono_clock;
+
+struct bench_interval_data { // statistics kept per reporting interval; embedded in bench_data as idata
+  double min_bandwidth = DBL_MAX; // printed as "Min bandwidth (MB/sec)"; starts at the largest double
+  double max_bandwidth = 0; // printed as "Max bandwidth (MB/sec)"
+  double avg_bandwidth = 0; // NOTE(review): presumably the running mean behind bandwidth_diff_sum - confirm against the status printer
+  int bandwidth_cycles = 0; // number of bandwidth samples; the stddev is only computed when this exceeds 1
+  double bandwidth_diff_sum = 0; // divided by (bandwidth_cycles - 1) and square-rooted for the bandwidth stddev
+  int min_iops = INT_MAX; // printed as "Min IOPS"; starts at the largest int
+  int max_iops = 0; // printed as "Max IOPS"
+  double avg_iops = 0; // NOTE(review): presumably the running mean behind iops_diff_sum - confirm against the status printer
+  int iops_cycles = 0; // number of IOPS samples; the stddev is only computed when this exceeds 1
+  double iops_diff_sum = 0; // divided by (iops_cycles - 1) and square-rooted for the IOPS stddev
+};
+
+struct bench_data { // state of one benchmark or clean-up run, shared between the worker loop and the status printer
+  bool done; //whether the benchmark is done; set under the bencher lock when a run ends
+  uint64_t object_size; //the size of the objects
+  uint64_t op_size;     // the size of the read/write ops
+  bool hints;
+  // NOTE(review): the original remark "same as object_size for write tests" sits here detached from any field - confirm which one it describes
+  int in_flight; //number of reads/writes being waited on
+  int started; //number of ops issued so far
+  int finished; //number of ops whose completion has been reaped so far
+  double min_latency; //smallest cur_latency seen, in seconds
+  double max_latency; //largest cur_latency seen, in seconds
+  double avg_latency; //total latency divided by finished
+  struct bench_interval_data idata; // data that is updated by time intervals and not by events
+  double latency_diff_sum; //divided by (finished - 1) and square-rooted for the latency stddev
+  std::chrono::duration<double> cur_latency; //latency of last completed transaction - in seconds by default
+  mono_time start_time; //start time for benchmark - use the monotonic clock as we'll measure the passage of time
+  char *object_contents; //buffer holding the object payload; the read benchmarks rebuild the expected text in it for verification
+};
+
+constexpr int OP_WRITE     = 1;
+constexpr int OP_SEQ_READ  = 2;
+constexpr int OP_RAND_READ = 3;
+
+// An Object is an <oid, namespace> pair.
+using Object = std::pair<std::string, std::string>;
+
+class ObjBencher { // backend-independent benchmark driver; subclasses supply the I/O hooks declared as pure virtuals below
+  bool show_time; // set through set_show_time()
+  Formatter *formatter = NULL; // when non-null, benchmark summaries go through it instead of being printed as plain text
+  std::ostream *outstream = NULL; // set through set_outstream()
+public:
+  CephContext *cct;
+protected:
+  ceph::mutex lock = ceph::make_mutex("ObjBencher::lock"); // taken around updates of data in the benchmark and clean-up loops
+
+  static void *status_printer(void *bencher); // pthread entry point started by the read benchmarks with this as argument
+
+  struct bench_data data; // shared run state: counters, latencies, sizes
+
+  int fetch_bench_metadata(const std::string& metadata_file, uint64_t* op_size,
+			   uint64_t* object_size, int* num_ops, int* num_objects, int* prev_pid); // fills the out-pointers; callers treat a negative return as failure
+
+  int write_bench(int secondsToRun, int concurrentios, const std::string& run_name_meta, unsigned max_objects, int prev_pid);
+  int seq_read_bench(int secondsToRun, int num_ops, int num_objects, int concurrentios, int writePid, bool no_verify=false);
+  int rand_read_bench(int secondsToRun, int num_ops, int num_objects, int concurrentios, int writePid, bool no_verify=false);
+
+  int clean_up(int num_objects, int prevPid, int concurrentios); // removes the objects of one run by generated name
+  bool more_objects_matching_prefix(const std::string& prefix, std::list<Object>* name); // next batch of objects starting with prefix; false when none are left
+
+  virtual int completions_init(int concurrentios) = 0; // called once with the slot count before any completion is created; negative return aborts the run
+  virtual void completions_done() = 0; // called once a run has finished successfully
+
+  virtual int create_completion(int i, void (*cb)(void *, void*), void *arg) = 0; // prepares slot i; cb and arg are the callback and its argument
+  virtual void release_completion(int slot) = 0; // called after the slot's result has been read
+
+  virtual bool completion_is_done(int slot) = 0; // polled under the lock to find a finished slot
+  virtual int completion_wait(int slot) = 0; // called before completion_ret() on the same slot
+  virtual int completion_ret(int slot) = 0; // result of the slot's operation; negative values are treated as errors
+
+  virtual int aio_read(const std::string& oid, int slot, bufferlist *pbl, size_t len, size_t offset) = 0; // negative return aborts the benchmark
+  virtual int aio_write(const std::string& oid, int slot, bufferlist& bl, size_t len, size_t offset) = 0; // negative return aborts the benchmark
+  virtual int aio_remove(const std::string& oid, int slot) = 0; // negative return aborts the clean-up
+  virtual int sync_read(const std::string& oid, bufferlist& bl, size_t len) = 0;
+  virtual int sync_write(const std::string& oid, bufferlist& bl, size_t len) = 0;
+  virtual int sync_remove(const std::string& oid) = 0; // callers treat any non-zero return as failure
+
+  virtual bool get_objects(std::list< std::pair<std::string, std::string> >* objects, int num) = 0; // lists <oid, namespace> pairs; false ends the callers' listing loops
+  virtual void set_namespace(const std::string&) {} // no-op unless a subclass overrides it
+
+  std::ostream& out(std::ostream& os);
+  std::ostream& out(std::ostream& os, utime_t& t);
+public:
+  explicit ObjBencher(CephContext *cct_) : show_time(false), cct(cct_), data() {}
+  virtual ~ObjBencher() {}
+  int aio_bench(
+    int operation, int secondsToRun,
+    int concurrentios, uint64_t op_size, uint64_t object_size, unsigned max_objects,
+    bool cleanup, bool hints, const std::string& run_name, bool reuse_bench, bool no_verify=false);
+  int clean_up(const std::string& prefix, int concurrentios, const std::string& run_name); // cleans every namespace holding objects that match prefix
+
+  void set_show_time(bool dt) {
+    show_time = dt;
+  }
+  void set_formatter(Formatter *f) {
+    formatter = f;
+  }
+  void set_outstream(std::ostream& os) {
+    outstream = &os;
+  }
+  int clean_up_slow(const std::string& prefix, int concurrentios); // removes matching objects found by listing, without using run metadata
+};
+
+
+#endif
diff --git a/src/common/openssl_opts_handler.cc b/src/common/openssl_opts_handler.cc
new file mode 100644
index 000000000..81d0c4786
--- /dev/null
+++ b/src/common/openssl_opts_handler.cc
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2020 Huawei Technologies Co., Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "openssl_opts_handler.h"
+
+#include <openssl/bio.h>
+#include <openssl/conf.h>
+#include <openssl/engine.h>
+#include <mutex>
+#include <vector>
+#include <algorithm>
+
+#include "common/debug.h"
+#include "global/global_context.h"
+#include "include/str_list.h"
+#include "include/scope_guard.h"
+
+using std::string;
+using std::ostream;
+using std::vector;
+
+// -----------------------------------------------------------------------------
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_common
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+// Prefix attached to every dout/derr line emitted from this file.
+static ostream &_prefix(std::ostream *_dout) {
+  return (*_dout) << "OpenSSLOptsHandler: ";
+}
+// -----------------------------------------------------------------------------
+
+// Turn the openssl_engine_opts string into OpenSSL config text: one numbered
+// [engineN_section] per entry of get_str_vec(opts, ":"), ',' becoming '\n'.
+string construct_engine_conf(const string &opts)
+{
+  // Fixed preamble that points OpenSSL at the generated engine list.
+  string listing =
+    "openssl_conf=openssl_def\n[openssl_def]\n"
+    "engines=engine_section\n[engine_section]\n";
+  string bodies;
+
+  vector<string> groups = get_str_vec(opts, ":");
+  for (size_t pos = 0; pos < groups.size(); ++pos) {
+    // Sections are numbered from 1: engine1_section, engine2_section, ...
+    const string section = "engine" + std::to_string(pos + 1) + "_section";
+    // Index entry such as "engine1=engine1_section".
+    listing += "engine" + std::to_string(pos + 1) + "=" + section + '\n';
+
+    // The OpenSSL parser wants one setting per line under its section.
+    string settings = groups[pos];
+    std::replace(settings.begin(), settings.end(), ',', '\n');
+    bodies += "[" + section + "]" + '\n';
+    bodies += settings + '\n';
+  }
+
+  return listing + bodies;
+}
+
+// Drain the OpenSSL error queue into a string (empty if nothing is queued).
+string get_openssl_error() {
+  BIO *bio = BIO_new(BIO_s_mem());
+  if (bio == nullptr) {
+    return "failed to create BIO for more error printing";
+  }
+  ERR_print_errors(bio);
+  char* buf = nullptr;  // stays null if the BIO hands nothing back
+  const long len = BIO_get_mem_data(bio, &buf);  // signed: may be <= 0
+  string ret = (buf != nullptr && len > 0) ? string(buf, len) : string();
+  BIO_free(bio);
+  return ret;
+}
+
+void log_error(const string &err)  // report an engine-setup failure via derr
+{
+  derr << "Intended OpenSSL engine acceleration failed.\n"
+       << "set by openssl_engine_opts = "
+       << g_ceph_context->_conf->openssl_engine_opts  // the configured value
+       << "\ndetail error information:\n" << err << dendl;  // caller's detail
+}
+
+void load_module(const string &engine_conf)  // apply config text; errors are logged
+{
+  BIO *mem = BIO_new_mem_buf(engine_conf.c_str(), engine_conf.size());
+  if (mem == nullptr) {
+    log_error("failed to new BIO memory");
+    return;
+  }
+  auto sg_mem = make_scope_guard([&mem] { BIO_free(mem); });
+  // Empty CONF object that the config text is parsed into.
+  CONF *conf = NCONF_new(nullptr);
+  if (conf == nullptr) {
+    log_error("failed to new OpenSSL CONF");
+    return;
+  }
+  auto sg_conf = make_scope_guard([&conf] { NCONF_free(conf); });
+  // Parse the in-memory text; stop (after logging) if OpenSSL rejects it.
+  if (NCONF_load_bio(conf, mem, nullptr) <= 0) {
+    log_error("failed to load CONF from BIO:\n" + get_openssl_error());
+    return;
+  }
+  // Register built-in modules and engines; the pragmas mute deprecation warnings.
+  OPENSSL_load_builtin_modules();
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+  ENGINE_load_builtin_engines();
+#pragma clang diagnostic pop
+#pragma GCC diagnostic pop
+  // Apply the parsed configuration; a failure here is logged, nothing more.
+  if (CONF_modules_load(
+          conf, nullptr,
+          CONF_MFLAGS_DEFAULT_SECTION | CONF_MFLAGS_IGNORE_MISSING_FILE) <= 0) {
+    log_error("failed to load modules from CONF:\n" + get_openssl_error());
+  }
+}
+
+// Build the engine config from openssl_engine_opts and load it into OpenSSL.
+// Leaving the option empty skips engine setup entirely.
+void init_engine()
+{
+  const string configured = g_ceph_context->_conf->openssl_engine_opts;
+  if (!configured.empty()) {
+    load_module(construct_engine_conf(configured));
+  }
+}
+
+// May be called repeatedly: std::call_once lets init_engine() run only once.
+void ceph::crypto::init_openssl_engine_once() {
+  static std::once_flag engine_init_guard;
+  std::call_once(engine_init_guard, &init_engine);
+}
diff --git a/src/common/openssl_opts_handler.h b/src/common/openssl_opts_handler.h
new file mode 100644
index 000000000..cad9060c9
--- /dev/null
+++ b/src/common/openssl_opts_handler.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2020 Huawei Technologies Co., Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CEPH_OPENSSL_OPTS_HANDLER_H
+#define CEPH_OPENSSL_OPTS_HANDLER_H
+
+namespace ceph {
+  namespace crypto {
+    void init_openssl_engine_once();  // engine setup; only the first call acts
+  }
+}
+
+#endif
diff --git a/src/common/options.cc b/src/common/options.cc
new file mode 100644
index 000000000..a68e2474a
--- /dev/null
+++ b/src/common/options.cc
@@ -0,0 +1,340 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+#include "options.h"
+#include "common/Formatter.h"
+#include "common/options/build_options.h"
+
+// Helpers for validators
+#include "include/stringify.h"
+#include "include/common_fwd.h"
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+#include <regex>
+
+// Definitions for enums
+#include "common/perf_counters.h"
+
+// rbd feature and io operation validation
+#include "librbd/Features.h"
+#include "librbd/io/IoOperations.h"
+
+using std::ostream;
+using std::ostringstream;
+
+using ceph::Formatter;
+using ceph::parse_timespan;
+
+namespace {
+// std::visit visitor that writes the active Option::value_t alternative.
+class printer {
+  ostream& out;
+public:
+  explicit printer(ostream& os) : out(os) {}
+  // Fallback: alternatives that already stream the way we want.
+  template<typename T>
+  void operator()(const T& v) const {
+    out << v;
+  }
+  // An unset value prints nothing.
+  void operator()(std::monostate) const {}
+  void operator()(bool v) const {
+    out << (v ? "true" : "false");
+  }
+  // Doubles use fixed notation. The stream's own format flags are restored
+  // afterwards rather than being forced to std::defaultfloat.
+  void operator()(double v) const {
+    const std::ios_base::fmtflags saved = out.flags();
+    out << std::fixed << v;
+    out.flags(saved);
+  }
+  // Option::size_t is unwrapped to its integer value.
+  void operator()(const Option::size_t& v) const { out << v.value; }
+  // Durations are written as bare tick counts.
+  void operator()(const std::chrono::seconds v) const { out << v.count(); }
+  void operator()(const std::chrono::milliseconds v) const { out << v.count(); }
+};
+}
+
+// Stream an option value by visiting the variant with a printer bound to os.
+ostream& operator<<(ostream& os, const Option::value_t& v)
+{
+  std::visit(printer(os), v);
+  return os;
+}
+
+// Emit v under field_name, using the Formatter call that matches this
+// option's declared type; all other types go through operator<<.
+void Option::dump_value(const char *field_name,
+    const Option::value_t &v, Formatter *f) const
+{
+  if (v == value_t{}) {
+    // Unset: Formatter has no nil, so an empty string stands in for it.
+    f->dump_string(field_name, "");
+  } else if (type == TYPE_INT) {
+    f->dump_int(field_name, std::get<int64_t>(v));
+  } else if (type == TYPE_UINT) {
+    f->dump_unsigned(field_name, std::get<uint64_t>(v));
+  } else if (type == TYPE_STR) {
+    f->dump_string(field_name, std::get<std::string>(v));
+  } else if (type == TYPE_FLOAT) {
+    f->dump_float(field_name, std::get<double>(v));
+  } else if (type == TYPE_BOOL) {
+    f->dump_bool(field_name, std::get<bool>(v));
+  } else {
+    // No dedicated Formatter call is used for the remaining types.
+    f->dump_stream(field_name) << v;
+  }
+}
+
+// Run the option's custom validator, if one is installed. The validator may
+// rewrite *new_value; its return code is passed through unchanged.
+int Option::pre_validate(std::string *new_value, std::string *err) const
+{
+  if (!validator) {
+    return 0;  // nothing registered: every string passes this stage
+  }
+  return validator(new_value, err);
+}
+
+// Check an already-typed value against the option's generic constraints
+// (min, max, permitted enum strings). Returns 0, or -EINVAL with *err set.
+int Option::validate(const Option::value_t &new_value, std::string *err) const
+{
+  // Lower bound, only when one has been configured.
+  if (min != value_t{} && new_value < min) {
+    std::ostringstream msg;
+    msg << "Value '" << new_value << "' is below minimum " << min;
+    *err = msg.str();
+    return -EINVAL;
+  }
+
+  // Upper bound, only when one has been configured.
+  if (max != value_t{} && new_value > max) {
+    std::ostringstream msg;
+    msg << "Value '" << new_value << "' exceeds maximum " << max;
+    *err = msg.str();
+    return -EINVAL;
+  }
+
+  // Enumerated strings: the value has to match one of the allowed entries.
+  if (type != Option::TYPE_STR || enum_allowed.empty()) {
+    return 0;
+  }
+  const auto &candidate = std::get<std::string>(new_value);
+  const auto hit = std::find(enum_allowed.begin(), enum_allowed.end(),
+                             candidate);
+  if (hit != enum_allowed.end()) {
+    return 0;
+  }
+
+  std::ostringstream msg;
+  msg << "'" << new_value << "' is not one of the permitted "
+             "values: " << joinify(enum_allowed.begin(),
+                                   enum_allowed.end(),
+                                   std::string(", "));
+  *err = msg.str();
+  return -EINVAL;
+}
+
+// Parse raw_val according to this option's type and validate the result.
+// Returns 0 with the typed value in *out (and its text form in
+// *normalized_value if non-null); otherwise nonzero, reason in *error_message.
+int Option::parse_value(
+  const std::string& raw_val,
+  value_t *out,
+  std::string *error_message,
+  std::string *normalized_value) const
+{
+  std::string val = raw_val;
+  int r = pre_validate(&val, error_message);  // validator may rewrite val
+  if (r != 0) {
+    return r;
+  }
+  if (type == Option::TYPE_INT) {
+    int64_t f = strict_si_cast<int64_t>(val, error_message);
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = f;
+  } else if (type == Option::TYPE_UINT) {
+    uint64_t f = strict_si_cast<uint64_t>(val, error_message);
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = f;
+  } else if (type == Option::TYPE_STR) {
+    *out = val;
+  } else if (type == Option::TYPE_FLOAT) {
+    double f = strict_strtod(val.c_str(), error_message);
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = f;
+  } else if (type == Option::TYPE_BOOL) {
+    bool b = strict_strtob(val.c_str(), error_message);
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = b;
+  } else if (type == Option::TYPE_ADDR) {
+    entity_addr_t addr;
+    if (!addr.parse(val)){
+      *error_message = "unable to parse address '" + val + "'";
+      return -EINVAL;
+    }
+    *out = addr;
+  } else if (type == Option::TYPE_ADDRVEC) {
+    entity_addrvec_t addr;
+    if (!addr.parse(val.c_str())){
+      *error_message = "unable to parse address vector '" + val + "'";
+      return -EINVAL;
+    }
+    *out = addr;
+  } else if (type == Option::TYPE_UUID) {
+    uuid_d uuid;
+    if (!uuid.parse(val.c_str())) {
+      *error_message = "unable to parse uuid '" + val + "'";
+      return -EINVAL;
+    }
+    *out = uuid;
+  } else if (type == Option::TYPE_SIZE) {
+    Option::size_t sz{strict_iecstrtoll(val, error_message)};
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = sz;
+  } else if (type == Option::TYPE_SECS) {
+    try {
+      *out = parse_timespan(val);
+    } catch (const std::invalid_argument& e) {
+      *error_message = e.what();
+      return -EINVAL;
+    }
+  } else if (type == Option::TYPE_MILLISECS) {
+    try {
+      *out = std::chrono::milliseconds(std::stoull(val));
+    } catch (const std::logic_error& e) {
+      *error_message = e.what();
+      return -EINVAL;
+    }
+  } else {
+    ceph_abort();
+  }
+  r = validate(*out, error_message);
+  if (r != 0) {
+    return r;
+  }
+  if (normalized_value) {
+    *normalized_value = to_str(*out);
+  }
+  return 0;
+}
+
+// Describe this option through a Formatter. Output order: identity and
+// descriptions, defaults, the tag/service/see_also lists, enum values,
+// bounds, the runtime-update indicator and finally the flags.
+void Option::dump(Formatter *f) const
+{
+  // Emits one array section whose entries all use the same item name.
+  auto dump_list = [f](const char *section, const char *item,
+                       const std::vector<const char*>& entries) {
+    f->open_array_section(section);
+    for (const char *entry : entries) {
+      f->dump_string(item, entry);
+    }
+    f->close_section();
+  };
+
+  f->dump_string("name", name);
+
+  f->dump_string("type", type_to_str(type));
+
+  f->dump_string("level", level_to_str(level));
+
+  f->dump_string("desc", desc);
+  f->dump_string("long_desc", long_desc);
+
+  // Defaults: the general one and the daemon-specific one.
+  dump_value("default", value, f);
+  dump_value("daemon_default", daemon_value, f);
+
+  dump_list("tags", "tag", tags);
+  dump_list("services", "service", services);
+  dump_list("see_also", "see_also", see_also);
+
+  // enum_values is only emitted for string options.
+  if (type == TYPE_STR) {
+    dump_list("enum_values", "enum_value", enum_allowed);
+  }
+
+  dump_value("min", min, f);
+  dump_value("max", max, f);
+
+  f->dump_bool("can_update_at_runtime", can_update_at_runtime());
+
+  // Flags that are reported, in output order. FLAG_MGR and
+  // FLAG_MINIMAL_CONF are not part of the dump.
+  const struct {
+    flag_t bit;
+    const char *label;
+  } reported[] = {
+    {FLAG_RUNTIME, "runtime"},
+    {FLAG_NO_MON_UPDATE, "no_mon_update"},
+    {FLAG_STARTUP, "startup"},
+    {FLAG_CLUSTER_CREATE, "cluster_create"},
+    {FLAG_CREATE, "create"},
+  };
+
+  f->open_array_section("flags");
+  for (const auto &entry : reported) {
+    if (has_flag(entry.bit)) {
+      f->dump_string("option", entry.label);
+    }
+  }
+  f->close_section();
+}
+
+std::string Option::to_str(const Option::value_t& v)  // text form of v
+{
+  return stringify(v);  // helper from include/stringify.h
+}
+
+// Write a human-readable, multi-line description of this option to *out.
+void Option::print(ostream *out) const
+{
+  *out << name << " - " << desc << "\n";
+  *out << "  (" << type_to_str(type) << ", " << level_to_str(level) << ")\n";
+  // A separate daemon default is only shown when one has been set.
+  if (daemon_value != value_t{}) {
+    *out << "  Default (non-daemon): " << stringify(value) << "\n";
+    *out << "  Default (daemon): " << stringify(daemon_value) << "\n";
+  } else {
+    *out << "  Default: " << stringify(value) << "\n";
+  }
+  if (!enum_allowed.empty()) {
+    *out << "  Possible values: ";
+    for (auto& i : enum_allowed) {
+      *out << " " << stringify(i);
+    }
+    *out << "\n";
+  }
+  // Each bound is reported only if it is set: set_min() leaves max unset, and
+  // an unset bound would otherwise show up as an empty "Maximum:" line.
+  if (min != value_t{}) {
+    *out << "  Minimum: " << stringify(min) << "\n";
+  }
+  if (max != value_t{}) {
+    *out << "  Maximum: " << stringify(max) << "\n";
+  }
+  *out << "  Can update at runtime: "
+       << (can_update_at_runtime() ? "true" : "false") << "\n";
+  if (!services.empty()) { *out << "  Services: " << services << "\n"; }
+  if (!tags.empty()) { *out << "  Tags: " << tags << "\n"; }
+  if (!see_also.empty()) { *out << "  See also: " << see_also << "\n"; }
+  // The long description, when present, follows after a blank line.
+  if (long_desc.size()) {
+    *out << "\n" << long_desc << "\n";
+  }
+}
+
+const std::vector<Option> ceph_options = build_options();  // filled during static initialization
diff --git a/src/common/options.h b/src/common/options.h
new file mode 100644
index 000000000..e1d4ec16e
--- /dev/null
+++ b/src/common/options.h
@@ -0,0 +1,424 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <chrono>
+#include <string>
+#include <variant>
+#include <vector>
+#include "include/str_list.h"
+#include "msg/msg_types.h"
+#include "include/uuid.h"
+
+struct Option {  // one configuration option: name, type, level, defaults, metadata
+  enum type_t {  // kind of value the option holds (explicit numeric values)
+    TYPE_UINT = 0,
+    TYPE_INT = 1,
+    TYPE_STR = 2,
+    TYPE_FLOAT = 3,
+    TYPE_BOOL = 4,
+    TYPE_ADDR = 5,
+    TYPE_ADDRVEC = 6,
+    TYPE_UUID = 7,
+    TYPE_SIZE = 8,
+    TYPE_SECS = 9,
+    TYPE_MILLISECS = 10,
+  };
+  // Name associated with t on the C++ side; falls back to "unknown".
+  static const char *type_to_c_type_str(type_t t) {
+    switch (t) {
+    case TYPE_UINT: return "uint64_t";
+    case TYPE_INT: return "int64_t";
+    case TYPE_STR: return "std::string";
+    case TYPE_FLOAT: return "double";
+    case TYPE_BOOL: return "bool";
+    case TYPE_ADDR: return "entity_addr_t";
+    case TYPE_ADDRVEC: return "entity_addrvec_t";
+    case TYPE_UUID: return "uuid_d";
+    case TYPE_SIZE: return "uint64_t";
+    case TYPE_SECS: return "secs";
+    case TYPE_MILLISECS: return "millisecs";
+    default: return "unknown";
+    }
+  }
+  static const char *type_to_str(type_t t) {  // short name; see str_to_type()
+    switch (t) {
+    case TYPE_UINT: return "uint";
+    case TYPE_INT: return "int";
+    case TYPE_STR: return "str";
+    case TYPE_FLOAT: return "float";
+    case TYPE_BOOL: return "bool";
+    case TYPE_ADDR: return "addr";
+    case TYPE_ADDRVEC: return "addrvec";
+    case TYPE_UUID: return "uuid";
+    case TYPE_SIZE: return "size";
+    case TYPE_SECS: return "secs";
+    case TYPE_MILLISECS: return "millisecs";
+    default: return "unknown";
+    }
+  }
+  static int str_to_type(const std::string& s) {  // type_t value, or -1 if unknown
+    if (s == "uint") {
+      return TYPE_UINT;
+    }
+    if (s == "int") {
+      return TYPE_INT;
+    }
+    if (s == "str") {
+      return TYPE_STR;
+    }
+    if (s == "float") {
+      return TYPE_FLOAT;
+    }
+    if (s == "bool") {
+      return TYPE_BOOL;
+    }
+    if (s == "addr") {
+      return TYPE_ADDR;
+    }
+    if (s == "addrvec") {
+      return TYPE_ADDRVEC;
+    }
+    if (s == "uuid") {
+      return TYPE_UUID;
+    }
+    if (s == "size") {
+      return TYPE_SIZE;
+    }
+    if (s == "secs") {
+      return TYPE_SECS;
+    }
+    if (s == "millisecs") {
+      return TYPE_MILLISECS;
+    }
+    return -1;
+  }
+
+  /**
+   * Basic: for users, configures some externally visible functional aspect
+   * Advanced: for users, configures some internal behaviour
+   * Development: not for users.  May be dangerous, may not be documented.
+   */
+  enum level_t {
+    LEVEL_BASIC = 0,
+    LEVEL_ADVANCED = 1,
+    LEVEL_DEV = 2,
+    LEVEL_UNKNOWN = 3,
+  };
+
+  static const char *level_to_str(level_t l) {  // "unknown" unless basic/advanced/dev
+    switch (l) {
+      case LEVEL_BASIC: return "basic";
+      case LEVEL_ADVANCED: return "advanced";
+      case LEVEL_DEV: return "dev";
+      default: return "unknown";
+    }
+  }
+
+  enum flag_t {  // bit flags, OR-ed together into Option::flags
+    FLAG_RUNTIME = 0x1,         ///< option can be changed at runtime
+    FLAG_NO_MON_UPDATE = 0x2,   ///< option cannot be changed via mon config
+    FLAG_STARTUP = 0x4,         ///< option can only take effect at startup
+    FLAG_CLUSTER_CREATE = 0x8,  ///< option only has effect at cluster creation
+    FLAG_CREATE = 0x10,         ///< option only has effect at daemon creation
+    FLAG_MGR = 0x20,            ///< option is a mgr module option
+    FLAG_MINIMAL_CONF = 0x40,   ///< option should go in a minimal ceph.conf
+  };
+  // Wrapper giving sizes their own value_t alternative, distinct from uint64_t.
+  struct size_t {
+    std::uint64_t value;
+    operator uint64_t() const {
+      return static_cast<uint64_t>(value);
+    }
+    bool operator==(const size_t& rhs) const {
+      return value == rhs.value;
+    }
+  };
+  // Every type a value can have; std::monostate means "not set".
+  using value_t = std::variant<
+    std::monostate,
+    std::string,
+    uint64_t,
+    int64_t,
+    double,
+    bool,
+    entity_addr_t,
+    entity_addrvec_t,
+    std::chrono::seconds,
+    std::chrono::milliseconds,
+    size_t,
+    uuid_d>;
+  const std::string name;
+  const type_t type;
+  const level_t level;
+
+  std::string desc;
+  std::string long_desc;
+
+  unsigned flags = 0;  // bitwise OR of flag_t values
+
+  int subsys = -1; // if >= 0, we are a subsys debug level
+
+  value_t value;  // default value; the constructor sets it to match `type`
+  value_t daemon_value;  // stays unset unless set_daemon_default() is called
+
+  static std::string to_str(const value_t& v);
+
+  // Items like mon, osd, rgw, rbd, ceph-fuse.  This is advisory metadata
+  // for presentation layers (like web dashboards, or generated docs), so that
+  // they know which options to display where.
+  // Additionally: "common" for settings that exist in any Ceph code.  Do
+  // not use common for settings that are just shared some places: for those
+  // places, list them.
+  std::vector<const char*> services;
+
+  // Topics like:
+  // "service": a catchall for the boring stuff like log/asok paths.
+  // "network"
+  // "performance": a setting that may need adjustment depending on
+  //                environment/workload to get best performance.
+  std::vector<const char*> tags;
+
+  std::vector<const char*> see_also;
+
+  value_t min, max;  // optional bounds; unset (monostate) means "no bound"
+  std::vector<const char*> enum_allowed;  // permitted strings, if non-empty
+
+  /**
+   * Return nonzero and set second argument to error string if the
+   * value is invalid.
+   *
+   * These callbacks are more than just validators, as they can also
+   * modify the value as it passes through.
+   */
+  typedef std::function<int(std::string *, std::string *)> validator_fn_t;
+  validator_fn_t validator;
+  // Creates an option whose default is the zero/empty value of type t.
+  Option(std::string const &name, type_t t, level_t l)
+    : name(name), type(t), level(l)
+  {
+    // While value_t is nullable (via std::monostate), we don't ever
+    // want it set that way in an Option instance: within an instance,
+    // the type of ::value should always match the declared type.
+    switch (type) {
+    case TYPE_INT:
+      value = int64_t(0); break;
+    case TYPE_UINT:
+      value = uint64_t(0); break;
+    case TYPE_STR:
+      value = std::string(""); break;
+    case TYPE_FLOAT:
+      value = 0.0; break;
+    case TYPE_BOOL:
+      value = false; break;
+    case TYPE_ADDR:
+      value = entity_addr_t(); break;
+    case TYPE_ADDRVEC:
+      value = entity_addrvec_t(); break;
+    case TYPE_UUID:
+      value = uuid_d(); break;
+    case TYPE_SIZE:
+      value = size_t{0}; break;
+    case TYPE_SECS:
+      value = std::chrono::seconds{0}; break;
+    case TYPE_MILLISECS:
+      value = std::chrono::milliseconds{0}; break;
+    default:
+      ceph_abort();
+    }
+  }
+
+  void dump_value(const char *field_name, const value_t &v, ceph::Formatter *f) const;
+
+  // Validate and potentially modify incoming string value
+  int pre_validate(std::string *new_value, std::string *err) const;
+
+  // Validate properly typed value against bounds
+  int validate(const Option::value_t &new_value, std::string *err) const;
+
+  // const char * must be explicit to avoid it being treated as an int
+  Option& set_value(value_t& v, const char *new_value) {
+    v = std::string(new_value);
+    return *this;
+  }
+
+  // bool is an integer, but we don't think so. teach it the hard way.
+  template<typename T>
+  using is_not_integer_t =
+      std::enable_if_t<!std::is_integral_v<T> || std::is_same_v<T, bool>, int>;
+  template<typename T>
+  using is_integer_t =
+      std::enable_if_t<std::is_integral_v<T> && !std::is_same_v<T, bool>, int>;
+  template<typename T, typename = is_not_integer_t<T>>
+  Option& set_value(value_t& v, const T& new_value) {
+    v = new_value;
+    return *this;
+  }
+
+  // For potentially ambiguous types, inspect Option::type and
+  // do some casting.  This is necessary to make sure that setting
+  // a float option to "0" actually sets the double part of variant.
+  template<typename T, typename = is_integer_t<T>>
+  Option& set_value(value_t& v, T new_value) {
+    switch (type) {
+    case TYPE_INT:
+      v = int64_t(new_value); break;
+    case TYPE_UINT:
+      v = uint64_t(new_value); break;
+    case TYPE_FLOAT:
+      v = double(new_value); break;
+    case TYPE_BOOL:
+      v = bool(new_value); break;
+    case TYPE_SIZE:
+      v = size_t{static_cast<std::uint64_t>(new_value)}; break;
+    case TYPE_SECS:
+      v = std::chrono::seconds{new_value}; break;
+    case TYPE_MILLISECS:
+      v = std::chrono::milliseconds{new_value}; break;
+    default:
+      std::cerr << "Bad type in set_value: " << name << ": "
+                << typeid(T).name() << std::endl;
+      ceph_abort();
+    }
+    return *this;
+  }
+
+  /// parse and validate a string input
+  int parse_value(
+    const std::string& raw_val,
+    value_t *out,
+    std::string *error_message,
+    std::string *normalized_value=nullptr) const;
+  // Set the default (integral arguments are converted to match `type`).
+  template<typename T>
+  Option& set_default(const T& v) {
+    return set_value(value, v);
+  }
+  // Set daemon_value; same conversions as set_default().
+  template<typename T>
+  Option& set_daemon_default(const T& v) {
+    return set_value(daemon_value, v);
+  }
+  Option& add_tag(const char* tag) {  // the pointer is stored, not the text
+    tags.push_back(tag);
+    return *this;
+  }
+  Option& add_tag(const std::initializer_list<const char*>& ts) {
+    tags.insert(tags.end(), ts);
+    return *this;
+  }
+  Option& add_service(const char* service) {
+    services.push_back(service);
+    return *this;
+  }
+  Option& add_service(const std::initializer_list<const char*>& ss) {
+    services.insert(services.end(), ss);
+    return *this;
+  }
+  Option& add_see_also(const char* t) {
+    see_also.push_back(t);
+    return *this;
+  }
+  Option& add_see_also(const std::initializer_list<const char*>& ts) {
+    see_also.insert(see_also.end(), ts);
+    return *this;
+  }
+  Option& set_description(const char* new_desc) {
+    desc = new_desc;
+    return *this;
+  }
+  Option& set_long_description(const char* new_desc) {
+    long_desc = new_desc;
+    return *this;
+  }
+  // Lower bound only; max is left as it was.
+  template<typename T>
+  Option& set_min(const T& mi) {
+    set_value(min, mi);
+    return *this;
+  }
+  // Both bounds at once (the only setter here that writes max).
+  template<typename T>
+  Option& set_min_max(const T& mi, const T& ma) {
+    set_value(min, mi);
+    set_value(max, ma);
+    return *this;
+  }
+
+  Option& set_enum_allowed(const std::vector<const char*>& allowed)
+  {
+    enum_allowed = allowed;
+    return *this;
+  }
+  // set_flag() and set_flags() are identical: both OR f into flags.
+  Option &set_flag(flag_t f) {
+    flags |= f;
+    return *this;
+  }
+  Option &set_flags(flag_t f) {
+    flags |= f;
+    return *this;
+  }
+
+  Option &set_validator(const validator_fn_t  &validator_)
+  {
+    validator = validator_;
+    return *this;
+  }
+
+  Option &set_subsys(int s) {
+    subsys = s;
+    return *this;
+  }
+
+  void dump(ceph::Formatter *f) const;
+  void print(std::ostream *out) const;
+  // True if any bit of f is set in flags.
+  bool has_flag(flag_t f) const {
+    return flags & f;
+  }
+
+  /**
+   * A crude indicator of whether the value may be
+   * modified safely at runtime -- should be replaced
+   * with proper locking!
+   */
+  bool can_update_at_runtime() const
+  {
+    return
+      (has_flag(FLAG_RUNTIME)
+        || (!has_flag(FLAG_MGR)
+          && (type == TYPE_BOOL || type == TYPE_INT
+            || type == TYPE_UINT || type == TYPE_FLOAT
+            || type == TYPE_SIZE || type == TYPE_SECS
+            || type == TYPE_MILLISECS)))
+      && !has_flag(FLAG_STARTUP)
+      && !has_flag(FLAG_CLUSTER_CREATE)
+      && !has_flag(FLAG_CREATE);
+  }
+};
+
+// Time suffixes: scale minutes, hours or days to seconds (2_min == 120).
+constexpr unsigned long long operator"" _min (unsigned long long minutes) {
+  return 60 * minutes;
+}
+constexpr unsigned long long operator"" _hr (unsigned long long hours) {
+  return 3600 * hours;
+}
+constexpr unsigned long long operator"" _day (unsigned long long days) {
+  return 86400 * days;
+}
+
+// Binary multipliers: _K = 2^10, _M = 2^20, _G = 2^30, _T = 2^40
+// (so 4_K == 4096).
+constexpr unsigned long long operator"" _K (unsigned long long n)
+{ return n * (1ULL << 10); }
+constexpr unsigned long long operator"" _M (unsigned long long n)
+{ return n * (1ULL << 20); }
+constexpr unsigned long long operator"" _G (unsigned long long n)
+{ return n * (1ULL << 30); }
+constexpr unsigned long long operator"" _T (unsigned long long n)
+{ return n * (1ULL << 40); }
+
+extern const std::vector<Option> ceph_options;
diff --git a/src/common/options/CMakeLists.txt b/src/common/options/CMakeLists.txt
new file mode 100644
index 000000000..f12a5513a
--- /dev/null
+++ b/src/common/options/CMakeLists.txt
@@ -0,0 +1,112 @@
+set(common_options_srcs build_options.cc)
+set(legacy_options_headers)
+set(options_yamls)
+
+# to mimic the behavior of file(CONFIGURE ...)
+file(GENERATE OUTPUT configure_file.cmake
+  CONTENT "configure_file(\${input_file} \${output_file} @ONLY)")
+function(file_configure input_file output_file)  # build-time configure_file()
+  set(cmake_defs
+    -D input_file=${input_file}
+    -D output_file=${output_file})
+  file(STRINGS ${input_file} subvars REGEX "@[^@]+@")  # lines holding @VAR@
+  foreach(line ${subvars})
+    string(REGEX REPLACE ".*@([^@]+)@.*" "\\1"
+      var "${line}")  # greedy match: keeps the last @VAR@ name on the line
+    set(value ${${var}})  # value that variable has right now
+    list(APPEND cmake_defs -D ${var}=${value})
+  endforeach()
+  add_custom_command(OUTPUT ${output_file}
+    COMMAND ${CMAKE_COMMAND} ${cmake_defs} -P configure_file.cmake
+    DEPENDS ${input_file}
+    VERBATIM)  # regenerated whenever the template changes
+endfunction()
+
+# Add rules producing <name>.yaml, <name>_options.cc and its legacy header.
+function(add_options name)
+  set(yaml_in_file ${CMAKE_CURRENT_SOURCE_DIR}/${name}.yaml.in)
+  set(yaml_file ${CMAKE_CURRENT_BINARY_DIR}/${name}.yaml)
+  file_configure("${yaml_in_file}" "${yaml_file}")
+  list(APPEND options_yamls ${yaml_file})
+  set(options_yamls ${options_yamls} PARENT_SCOPE)  # export to the caller
+  set(cc_file "${name}_options.cc")
+  set(h_file "${PROJECT_BINARY_DIR}/include/${name}_legacy_options.h")
+  # PRE_BUILD/PRE_LINK/POST_BUILD belong to the TARGET form only.
+  add_custom_command(OUTPUT ${cc_file} ${h_file}
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/y2c.py
+      --input ${yaml_file}
+      --output ${cc_file}
+      --legacy ${h_file}
+      --name ${name}
+      DEPENDS ${yaml_file})
+  list(APPEND common_options_srcs ${cc_file})
+  set(common_options_srcs ${common_options_srcs} PARENT_SCOPE)
+  list(APPEND legacy_options_headers ${h_file})
+  set(legacy_options_headers ${legacy_options_headers} PARENT_SCOPE)
+endfunction()
+
+set(osd_erasure_code_plugins "jerasure" "lrc")
+if(WITH_EC_ISA_PLUGIN)
+  list(APPEND osd_erasure_code_plugins "isa")
+endif()
+string(REPLACE ";" " " osd_erasure_code_plugins "${osd_erasure_code_plugins}")
+
+set(keyring_paths
+  "/etc/ceph/$cluster.$name.keyring"
+  "/etc/ceph/$cluster.keyring"
+  "/etc/ceph/keyring"
+  "/etc/ceph/keyring.bin")
+if(FREEBSD)
+  list(APPEND keyring_paths
+    "/usr/local/etc/ceph/$cluster.$name.keyring"
+    "/usr/local/etc/ceph/$cluster.keyring"
+    "/usr/local/etc/ceph/keyring"
+    "/usr/local/etc/ceph/keyring.bin")
+endif()
+string(REPLACE ";" "," keyring_paths "${keyring_paths}")
+
+set(ms_bind_retry_count 3)
+set(ms_bind_retry_delay 5)
+if(FREEBSD)
+  # FreeBSD does not use SO_REUSEADDR, so allow a bit more time by default
+  set(ms_bind_retry_count 6)
+  set(ms_bind_retry_delay 6)
+endif()
+
+set(mgr_disabled_modules "")
+if(WITH_MGR)
+  # https://tracker.ceph.com/issues/45147
+  if(Python3_VERSION VERSION_EQUAL 3.8)
+    set(mgr_disabled_modules "diskprediction_local")
+    message(STATUS "mgr module disabled for ${Python3_VERSION}: ${mgr_disabled_modules}")
+  endif()
+endif()
+
+add_options(global)
+add_options(cephfs-mirror)
+add_options(crimson)
+add_options(mgr)
+add_options(mds)
+add_options(mds-client)
+add_options(mon)
+add_options(osd)
+add_options(rbd)
+add_options(rbd-mirror)
+add_options(immutable-object-cache)
+add_options(ceph-exporter)
+
+# if set to empty string, system default luarocks package location (if exist) will be used
+set(rgw_luarocks_location "")
+if(WITH_RADOSGW_LUA_PACKAGES)
+  set(rgw_luarocks_location "/tmp/luarocks")
+endif()
+add_options(rgw)
+
+add_library(common-options-objs OBJECT
+  ${common_options_srcs})
+add_custom_target(legacy-option-headers
+  DEPENDS ${legacy_options_headers})
+
+include(AddCephTest)
+add_ceph_test(validate-options
+  ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/validate-options.py ${options_yamls})
diff --git a/src/common/options/build_options.cc b/src/common/options/build_options.cc
new file mode 100644
index 000000000..867fc2efd
--- /dev/null
+++ b/src/common/options/build_options.cc
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "build_options.h"
+
+#include <algorithm>
+#include <cstring>
+
+std::vector<Option> get_global_options();
+std::vector<Option> get_mgr_options();
+std::vector<Option> get_mon_options();
+std::vector<Option> get_crimson_options();
+std::vector<Option> get_osd_options();
+std::vector<Option> get_rgw_options();
+std::vector<Option> get_rbd_options();
+std::vector<Option> get_rbd_mirror_options();
+std::vector<Option> get_immutable_object_cache_options();
+std::vector<Option> get_mds_options();
+std::vector<Option> get_mds_client_options();
+std::vector<Option> get_cephfs_mirror_options();
+std::vector<Option> get_ceph_exporter_options();
+
+// Build the complete option table: the global options as-is, followed by
+// every component's options, each tagged with its service name if missing.
+std::vector<Option> build_options()
+{
+  std::vector<Option> result = get_global_options();
+
+  // Component option sets, in the order they are appended to the result.
+  const struct { std::vector<Option> (*load)(); const char* svc; } sources[] = {
+    {get_crimson_options, "osd"},
+    {get_mgr_options, "mgr"},
+    {get_mon_options, "mon"},
+    {get_osd_options, "osd"},
+    {get_rgw_options, "rgw"},
+    {get_rbd_options, "rbd"},
+    {get_rbd_mirror_options, "rbd-mirror"},
+    {get_immutable_object_cache_options, "immutable-object-cache"},
+    {get_mds_options, "mds"},
+    {get_mds_client_options, "mds_client"},
+    {get_cephfs_mirror_options, "cephfs-mirror"},
+    {get_ceph_exporter_options, "ceph-exporter"},
+  };
+  for (const auto& src : sources) {
+    for (auto& o : src.load()) {
+      const bool known = std::any_of(o.services.begin(), o.services.end(),
+        [&src](const char* s) { return std::strcmp(s, src.svc) == 0; });
+      if (!known) { o.add_service(src.svc); }
+      result.push_back(std::move(o));
+    }
+  }
+  return result;
+}
diff --git a/src/common/options/build_options.h b/src/common/options/build_options.h
new file mode 100644
index 000000000..6689e5e72
--- /dev/null
+++ b/src/common/options/build_options.h
@@ -0,0 +1,8 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include <vector>
+#include "common/options.h"
+
+std::vector<Option> build_options();  // global + per-component option tables
diff --git a/src/common/options/ceph-exporter.yaml.in b/src/common/options/ceph-exporter.yaml.in
new file mode 100644
index 000000000..798a185e9
--- /dev/null
+++ b/src/common/options/ceph-exporter.yaml.in
@@ -0,0 +1,54 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: exporter_sock_dir
+  type: str
+  level: advanced
+  desc: The path to ceph daemons socket files dir
+  default: /var/run/ceph/
+  services:
+  - ceph-exporter
+  flags:
+  - runtime
+- name: exporter_addr
+  type: str
+  level: advanced
+  desc: Host ip address where exporter is deployed
+  default: 0.0.0.0
+  services:
+  - ceph-exporter
+- name: exporter_http_port
+  type: int
+  level: advanced
+  desc: Port to deploy exporter on. Default is 9926
+  default: 9926
+  services:
+  - ceph-exporter
+- name: exporter_prio_limit
+  type: int
+  level: advanced
+  desc: Only perf counters greater than or equal to exporter_prio_limit are fetched
+  default: 5
+  services:
+  - ceph-exporter
+  flags:
+  - runtime
+- name: exporter_stats_period
+  type: int
+  level: advanced
+  desc: Time to wait before sending requests again to exporter server (seconds)
+  default: 5
+  services:
+  - ceph-exporter
+  flags:
+  - runtime
+- name: exporter_sort_metrics
+  type: bool
+  level: advanced
+  desc: If true it will sort the metrics and group them.
+  default: true
+  services:
+  - ceph-exporter
+  flags:
+  - runtime
diff --git a/src/common/options/cephfs-mirror.yaml.in b/src/common/options/cephfs-mirror.yaml.in
new file mode 100644
index 000000000..78f86dfb1
--- /dev/null
+++ b/src/common/options/cephfs-mirror.yaml.in
@@ -0,0 +1,94 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: cephfs_mirror_max_concurrent_directory_syncs
+  type: uint
+  level: advanced
+  desc: maximum number of concurrent snapshot synchronization threads
+  long_desc: maximum number of directory snapshots that can be synchronized concurrently
+    by cephfs-mirror daemon. Controls the number of synchronization threads.
+  default: 3
+  services:
+  - cephfs-mirror
+  min: 1
+- name: cephfs_mirror_action_update_interval
+  type: secs
+  level: advanced
+  desc: interval for driving asynchronous mirror actions
+  long_desc: Interval in seconds to process pending mirror update actions.
+  default: 2
+  services:
+  - cephfs-mirror
+  min: 1
+- name: cephfs_mirror_restart_mirror_on_blocklist_interval
+  type: secs
+  level: advanced
+  desc: interval to restart blocklisted instances
+  long_desc: Interval in seconds to restart blocklisted mirror instances. Setting
+    to zero (0) disables restarting blocklisted instances.
+  default: 30
+  services:
+  - cephfs-mirror
+  min: 0
+- name: cephfs_mirror_max_snapshot_sync_per_cycle
+  type: uint
+  level: advanced
+  desc: number of snapshots to mirror in one cycle
+  long_desc: maximum number of snapshots to mirror when a directory is picked up for
+    mirroring by worker threads.
+  default: 3
+  services:
+  - cephfs-mirror
+  min: 1
+- name: cephfs_mirror_directory_scan_interval
+  type: uint
+  level: advanced
+  desc: interval to scan directories to mirror snapshots
+  long_desc: interval in seconds to scan configured directories for snapshot mirroring.
+  default: 10
+  services:
+  - cephfs-mirror
+  min: 1
+- name: cephfs_mirror_max_consecutive_failures_per_directory
+  type: uint
+  level: advanced
+  desc: consecutive failed directory synchronization attempts before marking a directory
+    as "failed"
+  long_desc: number of consecutive snapshot synchronization failures to mark a directory
+    as "failed". failed directories are retried for synchronization less frequently.
+  default: 10
+  services:
+  - cephfs-mirror
+  min: 0
+- name: cephfs_mirror_retry_failed_directories_interval
+  type: uint
+  level: advanced
+  desc: failed directory retry interval for synchronization
+  long_desc: interval in seconds to retry synchronization for failed directories.
+  default: 60
+  services:
+  - cephfs-mirror
+  min: 1
+- name: cephfs_mirror_restart_mirror_on_failure_interval
+  type: secs
+  level: advanced
+  desc: interval to restart failed mirror instances
+  long_desc: Interval in seconds to restart failed mirror instances. Setting to zero
+    (0) disables restarting failed mirror instances.
+  default: 20
+  services:
+  - cephfs-mirror
+  min: 0
+- name: cephfs_mirror_mount_timeout
+  type: secs
+  level: advanced
+  desc: timeout for mounting primary/secondary ceph file system
+  long_desc: Timeout in seconds for mounting primary or secondary (remote) ceph file system
+    by the cephfs-mirror daemon. Setting this to a higher value could result in the mirror
+    daemon getting stalled when mounting a file system if the cluster is not reachable. This
+    option is used to override the usual client_mount_timeout.
+  default: 10
+  services:
+  - cephfs-mirror
+  min: 0
+\ No newline at end of file
diff --git a/src/common/options/crimson.yaml.in b/src/common/options/crimson.yaml.in
new file mode 100644
index 000000000..1007998fa
--- /dev/null
+++ b/src/common/options/crimson.yaml.in
@@ -0,0 +1,119 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: crimson_osd_obc_lru_size
+  type: uint
+  level: advanced
+  desc: Number of obcs to cache
+  default: 10
+- name: crimson_osd_scheduler_concurrency
+  type: uint
+  level: advanced
+  desc: The maximum number of concurrent IO operations, 0 for unlimited
+  default: 0
+- name: crimson_alien_op_num_threads
+  type: uint
+  level: advanced
+  desc: The number of threads for serving alienized ObjectStore
+  default: 6
+  flags:
+  - startup
+- name: crimson_seastar_smp
+  type: uint
+  level: advanced
+  desc: Number of seastar reactor threads to use for the osd
+  default: 1
+  flags:
+  - startup
+- name: crimson_alien_thread_cpu_cores
+  type: str
+  level: advanced
+  desc: CPU cores on which alienstore threads will run in cpuset(7) format
+- name: seastore_segment_size
+  type: size
+  desc: Segment size to use for SegmentManager
+  level: advanced
+  default: 64_M
+- name: seastore_device_size
+  type: size
+  desc: Total size to use for SegmentManager block file if created
+  level: dev
+  default: 50_G
+- name: seastore_block_create
+  type: bool
+  level: dev
+  desc: Create SegmentManager file if it doesn't exist
+  default: true
+  see_also:
+  - seastore_device_size
+- name: seastore_journal_batch_capacity
+  type: uint
+  level: dev
+  desc: The number limit of records in a journal batch
+  default: 16
+- name: seastore_journal_batch_flush_size
+  type: size
+  level: dev
+  desc: The size threshold to force flush a journal batch
+  default: 16_M
+- name: seastore_journal_iodepth_limit
+  type: uint
+  level: dev
+  desc: The io depth limit to submit journal records
+  default: 5
+- name: seastore_journal_batch_preferred_fullness
+  type: float
+  level: dev
+  desc: The record fullness threshold to flush a journal batch
+  default: 0.95
+- name: seastore_default_max_object_size
+  type: uint
+  level: dev
+  desc: default logical address space reservation for seastore objects' data
+  default: 16777216
+- name: seastore_default_object_metadata_reservation
+  type: uint
+  level: dev
+  desc: default logical address space reservation for seastore objects' metadata
+  default: 16777216
+- name: seastore_cache_lru_size
+  type: size
+  level: advanced
+  desc: Size in bytes of extents to keep in cache.
+  default: 64_M
+- name: seastore_obj_data_write_amplification
+  type: float
+  level: advanced
+  desc: split extent if ratio of total extent size to write size exceeds this value
+  default: 1.25
+- name: seastore_max_concurrent_transactions
+  type: uint
+  level: advanced
+  desc: maximum concurrent transactions that seastore allows
+  default: 8
+- name: seastore_main_device_type
+  type: str
+  level: dev
+  desc: The main device type seastore uses (SSD or RANDOM_BLOCK_SSD)
+  default: SSD
+- name: seastore_cbjournal_size
+  type: size
+  level: dev
+  desc: Total size to use for CircularBoundedJournal if created, it is valid only if seastore_main_device_type is RANDOM_BLOCK_SSD
+  default: 5_G
+- name: seastore_multiple_tiers_stop_evict_ratio
+  type: float
+  level: advanced
+  desc: When the used ratio of main tier is less than this value, then stop evicting cold data to the cold tier.
+  default: 0.5
+- name: seastore_multiple_tiers_default_evict_ratio
+  type: float
+  level: advanced
+  desc: Begin evicting cold data to the cold tier when the used ratio of the main tier reaches this value.
+  default: 0.6
+- name: seastore_multiple_tiers_fast_evict_ratio
+  type: float
+  level: advanced
+  desc: Begin fast eviction when the used ratio of the main tier reaches this value.
+  default: 0.7
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
new file mode 100644
index 000000000..fa426a115
--- /dev/null
+++ b/src/common/options/global.yaml.in
@@ -0,0 +1,6396 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: host
+  type: str
+  level: basic
+  desc: local hostname
+  long_desc: if blank, ceph assumes the short hostname (hostname -s)
+  tags:
+  - network
+  services:
+  - common
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: fsid
+  type: uuid
+  level: basic
+  desc: cluster fsid (uuid)
+  fmt_desc: The cluster ID. One per cluster.
+    May be generated by a deployment tool if not specified.
+  note: Do not set this value if you use a deployment tool that does
+    it for you.
+  tags:
+  - service
+  services:
+  - common
+  flags:
+  - no_mon_update
+  - startup
+- name: public_addr
+  type: addr
+  level: basic
+  desc: public-facing address to bind to
+  fmt_desc: The IP address for the public (front-side) network.
+   Set for each daemon.
+  services:
+  - mon
+  - mds
+  - osd
+  - mgr
+  flags:
+  - startup
+  with_legacy: true
+- name: public_addrv
+  type: addrvec
+  level: basic
+  desc: public-facing address to bind to
+  services:
+  - mon
+  - mds
+  - osd
+  - mgr
+  flags:
+  - startup
+  with_legacy: true
+- name: public_bind_addr
+  type: addr
+  level: advanced
+  services:
+  - mon
+  flags:
+  - startup
+  fmt_desc: In some dynamic deployments the Ceph MON daemon might bind
+   to an IP address locally that is different from the ``public_addr``
+   advertised to other peers in the network. The environment must ensure
+   that routing rules are set correctly. If ``public_bind_addr`` is set
+   the Ceph Monitor daemon will bind to it locally and use ``public_addr``
+   in the monmaps to advertise its address to peers. This behavior is limited
+   to the Monitor daemon.
+  with_legacy: true
+- name: cluster_addr
+  type: addr
+  level: basic
+  desc: cluster-facing address to bind to
+  fmt_desc: The IP address for the cluster (back-side) network.
+   Set for each daemon.
+  tags:
+  - network
+  services:
+  - osd
+  flags:
+  - startup
+  with_legacy: true
+- name: public_network
+  type: str
+  level: advanced
+  desc: Network(s) from which to choose a public address to bind to
+  fmt_desc: The IP address and netmask of the public (front-side) network
+   (e.g., ``192.168.0.0/24``). Set in ``[global]``. You may specify
+   comma-separated subnets. The format of it looks like
+   ``{ip-address}/{netmask} [, {ip-address}/{netmask}]``
+  tags:
+  - network
+  services:
+  - mon
+  - mds
+  - osd
+  - mgr
+  flags:
+  - startup
+  with_legacy: true
+- name: public_network_interface
+  type: str
+  level: advanced
+  desc: Interface name(s) from which to choose an address from a public_network to
+    bind to; public_network must also be specified.
+  tags:
+  - network
+  services:
+  - mon
+  - mds
+  - osd
+  - mgr
+  see_also:
+  - public_network
+  flags:
+  - startup
+- name: cluster_network
+  type: str
+  level: advanced
+  desc: Network(s) from which to choose a cluster address to bind to
+  fmt_desc: The IP address and netmask of the cluster (back-side) network
+   (e.g., ``10.0.0.0/24``).  Set in ``[global]``. You may specify
+   comma-separated subnets. The format of it looks like
+   ``{ip-address}/{netmask} [, {ip-address}/{netmask}]``
+  tags:
+  - network
+  services:
+  - osd
+  flags:
+  - startup
+  with_legacy: true
+- name: cluster_network_interface
+  type: str
+  level: advanced
+  desc: Interface name(s) from which to choose an address from a cluster_network to
+    bind to; cluster_network must also be specified.
+  tags:
+  - network
+  services:
+  - mon
+  - mds
+  - osd
+  - mgr
+  see_also:
+  - cluster_network
+  flags:
+  - startup
+- name: monmap
+  type: str
+  level: advanced
+  desc: path to MonMap file
+  long_desc: This option is normally used during mkfs, but can also be used to identify
+    which monitors to connect to.
+  services:
+  - mon
+  flags:
+  - no_mon_update
+  - create
+- name: mon_host
+  type: str
+  level: basic
+  desc: list of hosts or addresses to search for a monitor
+  long_desc: This is a list of IP addresses or hostnames that are separated by commas, whitespace, or semicolons. Hostnames are resolved via DNS. All A and AAAA records are included in the search list.
+  services:
+  - common
+  flags:
+  - no_mon_update
+  - startup
+- name: mon_host_override
+  type: str
+  level: advanced
+  desc: monitor(s) to use overriding the MonMap
+  fmt_desc: This is the list of monitors that the Ceph process **initially** contacts when first establishing communication with the Ceph cluster. This overrides the known monitor list that is derived from MonMap updates sent to older Ceph instances (like librados cluster handles). This option is expected to be useful primarily for debugging.
+  services:
+  - common
+  flags:
+  - no_mon_update
+  - startup
+- name: mon_dns_srv_name
+  type: str
+  level: advanced
+  desc: name of DNS SRV record to check for monitor addresses
+  fmt_desc: the service name used when querying the DNS for the monitor hosts/addresses
+  default: ceph-mon
+  tags:
+  - network
+  services:
+  - common
+  see_also:
+  - mon_host
+  flags:
+  - startup
+- name: container_image
+  type: str
+  level: basic
+  desc: container image (used by cephadm orchestrator)
+  default: docker.io/ceph/daemon-base:latest-master-devel
+  flags:
+  - startup
+- name: no_config_file
+  type: bool
+  level: advanced
+  desc: signal that we don't require a config file to be present
+  long_desc: When specified, we won't be looking for a configuration file, and will
+    instead expect that whatever options or values are required for us to work will
+    be passed as arguments.
+  default: false
+  tags:
+  - config
+  services:
+  - common
+  flags:
+  - no_mon_update
+  - startup
+- name: lockdep
+  type: bool
+  level: dev
+  desc: enable lockdep lock dependency analyzer
+  default: false
+  services:
+  - common
+  flags:
+  - no_mon_update
+  - startup
+  with_legacy: true
+- name: lockdep_force_backtrace
+  type: bool
+  level: dev
+  desc: always gather current backtrace at every lock
+  default: false
+  services:
+  - common
+  see_also:
+  - lockdep
+  flags:
+  - startup
+  with_legacy: true
+- name: run_dir
+  type: str
+  level: advanced
+  desc: path for the 'run' directory for storing pid and socket files
+  default: /var/run/ceph
+  services:
+  - common
+  see_also:
+  - admin_socket
+  flags:
+  - startup
+  with_legacy: true
+- name: admin_socket
+  type: str
+  level: advanced
+  desc: path for the runtime control socket file, used by the 'ceph daemon' command
+  fmt_desc: The socket for executing administrative commands on a daemon,
+    irrespective of whether Ceph Monitors have established a quorum.
+  daemon_default: $run_dir/$cluster-$name.asok
+  services:
+  - common
+  flags:
+  - startup
+  # default changed by common_preinit()
+  with_legacy: true
+- name: admin_socket_mode
+  type: str
+  level: advanced
+  desc: file mode to set for the admin socket file, e.g., '0755'
+  services:
+  - common
+  see_also:
+  - admin_socket
+  flags:
+  - startup
+  with_legacy: true
+- name: daemonize
+  type: bool
+  level: advanced
+  desc: whether to daemonize (background) after startup
+  default: false
+  daemon_default: true
+  tags:
+  - service
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+  see_also:
+  - pid_file
+  - chdir
+  flags:
+  - no_mon_update
+  - startup
+  # default changed by common_preinit()
+  with_legacy: true
+- name: setuser
+  type: str
+  level: advanced
+  desc: uid or user name to switch to on startup
+  long_desc: This is normally specified by the systemd unit file.
+  tags:
+  - service
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+  see_also:
+  - setgroup
+  flags:
+  - startup
+  with_legacy: true
+- name: setgroup
+  type: str
+  level: advanced
+  desc: gid or group name to switch to on startup
+  long_desc: This is normally specified by the systemd unit file.
+  tags:
+  - service
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+  see_also:
+  - setuser
+  flags:
+  - startup
+  with_legacy: true
+- name: setuser_match_path
+  type: str
+  level: advanced
+  desc: if set, setuser/setgroup is conditional on this path matching ownership
+  long_desc: If setuser or setgroup are specified, and this option is non-empty, then
+    the uid/gid of the daemon will only be changed if the file or directory specified
+    by this option has a matching uid and/or gid.  This exists primarily to allow
+    switching to user ceph for OSDs to be conditional on whether the osd data contents
+    have also been chowned after an upgrade.  This is normally specified by the systemd
+    unit file.
+  tags:
+  - service
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+  see_also:
+  - setuser
+  - setgroup
+  flags:
+  - startup
+  with_legacy: true
+- name: pid_file
+  type: str
+  level: advanced
+  desc: path to write a pid file (if any)
+  fmt_desc: The file in which the mon, osd or mds will write its
+    PID.  For instance, ``/var/run/$cluster/$type.$id.pid``
+    will create /var/run/ceph/mon.a.pid for the ``mon`` with
+    id ``a`` running in the ``ceph`` cluster. The ``pid
+    file`` is removed when the daemon stops gracefully. If
+    the process is not daemonized (i.e. runs with the ``-f``
+    or ``-d`` option), the ``pid file`` is not created.
+  tags:
+  - service
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+  flags:
+  - startup
+  with_legacy: true
+- name: chdir
+  type: str
+  level: advanced
+  desc: path to chdir(2) to after daemonizing
+  fmt_desc: The directory Ceph daemons change to once they are
+    up and running. Default ``/`` directory recommended.
+  tags:
+  - service
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+  see_also:
+  - daemonize
+  flags:
+  - no_mon_update
+  - startup
+  with_legacy: true
+- name: fatal_signal_handlers
+  type: bool
+  level: advanced
+  desc: whether to register signal handlers for SIGABRT etc that dump a stack trace
+  long_desc: This is normally true for daemons and false for libraries.
+  fmt_desc: If set, we will install signal handlers for SEGV, ABRT, BUS, ILL,
+    FPE, XCPU, XFSZ, SYS signals to generate a useful log message
+  default: true
+  tags:
+  - service
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+  flags:
+  - startup
+  with_legacy: true
+- name: crash_dir
+  type: str
+  level: advanced
+  desc: Directory where crash reports are archived
+  default: /var/lib/ceph/crash
+  flags:
+  - startup
+  with_legacy: true
+- name: restapi_log_level
+  type: str
+  level: advanced
+  desc: default set by python code
+  with_legacy: true
+- name: restapi_base_url
+  type: str
+  level: advanced
+  desc: default set by python code
+  with_legacy: true
+- name: erasure_code_dir
+  type: str
+  level: advanced
+  desc: directory where erasure-code plugins can be found
+  default: @CEPH_INSTALL_FULL_PKGLIBDIR@/erasure-code
+  services:
+  - mon
+  - osd
+  flags:
+  - startup
+  with_legacy: true
+- name: log_file
+  type: str
+  level: basic
+  desc: path to log file
+  fmt_desc: The location of the logging file for your cluster.
+  daemon_default: /var/log/ceph/$cluster-$name.log
+  see_also:
+  - log_to_file
+  - log_to_stderr
+  - err_to_stderr
+  - log_to_syslog
+  - err_to_syslog
+  # default changed by common_preinit()
+  with_legacy: true
+- name: log_max_new
+  type: int
+  level: advanced
+  desc: max unwritten log entries to allow before waiting to flush to the log
+  fmt_desc: The maximum number of new log entries.
+  default: 1000
+  see_also:
+  - log_max_recent
+  # default changed by common_preinit()
+  with_legacy: true
+- name: log_max_recent
+  type: int
+  level: advanced
+  desc: recent log entries to keep in memory to dump in the event of a crash
+  long_desc: The purpose of this option is to log at a higher debug level only to
+    the in-memory buffer, and write out the detailed log messages only if there is
+    a crash.  Only log entries below the lower log level will be written unconditionally
+    to the log.  For example, debug_osd=1/5 will write everything <= 1 to the log
+    unconditionally but keep entries at levels 2-5 in memory.  If there is a seg fault
+    or assertion failure, all entries will be dumped to the log.
+  min: 1
+  default: 500
+  daemon_default: 10000
+  # default changed by common_preinit()
+  with_legacy: true
+- name: log_to_file
+  type: bool
+  level: basic
+  desc: send log lines to a file
+  fmt_desc: Determines if logging messages should appear in a file.
+  default: true
+  see_also:
+  - log_file
+  with_legacy: true
+- name: log_to_stderr
+  type: bool
+  level: basic
+  desc: send log lines to stderr
+  fmt_desc: Determines if logging messages should appear in ``stderr``.
+  default: true
+  daemon_default: false
+  with_legacy: true
+- name: err_to_stderr
+  type: bool
+  level: basic
+  desc: send critical error log lines to stderr
+  fmt_desc: Determines if error messages should appear in ``stderr``.
+  default: false
+  daemon_default: true
+  with_legacy: true
+- name: log_stderr_prefix
+  type: str
+  level: advanced
+  desc: String to prefix log messages with when sent to stderr
+  long_desc: This is useful in container environments when combined with mon_cluster_log_to_stderr.  The
+    mon log prefixes each line with the channel name (e.g., 'default', 'audit'), while
+    log_stderr_prefix can be set to 'debug '.
+  see_also:
+  - mon_cluster_log_to_stderr
+- name: log_to_syslog
+  type: bool
+  level: basic
+  desc: send log lines to syslog facility
+  fmt_desc: Determines if logging messages should appear in ``syslog``.
+  default: false
+  with_legacy: true
+- name: err_to_syslog
+  type: bool
+  level: basic
+  desc: send critical error log lines to syslog facility
+  fmt_desc: Determines if error messages should appear in ``syslog``.
+  default: false
+  with_legacy: true
+- name: log_flush_on_exit
+  type: bool
+  level: advanced
+  desc: set a process exit handler to ensure the log is flushed on exit
+  fmt_desc: Determines if Ceph should flush the log files after exit.
+  default: false
+  with_legacy: true
+- name: log_stop_at_utilization
+  type: float
+  level: basic
+  desc: stop writing to the log file when device utilization reaches this ratio
+  default: 0.97
+  see_also:
+  - log_file
+  min: 0
+  max: 1
+  with_legacy: true
+- name: log_to_graylog
+  type: bool
+  level: basic
+  desc: send log lines to remote graylog server
+  default: false
+  see_also:
+  - err_to_graylog
+  - log_graylog_host
+  - log_graylog_port
+  with_legacy: true
+- name: err_to_graylog
+  type: bool
+  level: basic
+  desc: send critical error log lines to remote graylog server
+  default: false
+  see_also:
+  - log_to_graylog
+  - log_graylog_host
+  - log_graylog_port
+  with_legacy: true
+- name: log_graylog_host
+  type: str
+  level: basic
+  desc: address or hostname of graylog server to log to
+  default: 127.0.0.1
+  see_also:
+  - log_to_graylog
+  - err_to_graylog
+  - log_graylog_port
+  with_legacy: true
+- name: log_graylog_port
+  type: int
+  level: basic
+  desc: port number for the remote graylog server
+  default: 12201
+  see_also:
+  - log_graylog_host
+  with_legacy: true
+- name: log_to_journald
+  type: bool
+  level: basic
+  desc: send log lines to journald
+  default: false
+  see_also:
+  - err_to_journald
+- name: err_to_journald
+  type: bool
+  level: basic
+  desc: send critical error log lines to journald
+  default: false
+  see_also:
+  - log_to_journald
+- name: log_coarse_timestamps
+  type: bool
+  level: advanced
+  desc: timestamp log entries from coarse system clock to improve performance
+  default: true
+  tags:
+  - performance
+  - service
+  services:
+  - common
+# options will take k/v pairs, or single-item that will be assumed as general
+# default for all, regardless of channel.
+# e.g., "info" would be taken as the same as "default=info"
+# also, "default=daemon audit=local0" would mean
+#    "default all to 'daemon', override 'audit' with 'local0'"
+- name: clog_to_monitors
+  type: str
+  level: advanced
+  desc: Make daemons send cluster log messages to monitors
+  fmt_desc: Determines if ``clog`` messages should be sent to monitors.
+  default: default=true
+  flags:
+  - runtime
+  with_legacy: true
+  services:
+  - mgr
+  - osd
+  - mds
+- name: clog_to_syslog
+  type: str
+  level: advanced
+  desc: Make daemons send cluster log messages to syslog
+  fmt_desc: Determines if ``clog`` messages should be sent to syslog.
+  default: 'false'
+  flags:
+  - runtime
+  with_legacy: true
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+- name: clog_to_syslog_level
+  type: str
+  level: advanced
+  desc: Syslog level for cluster log messages
+  default: info
+  see_also:
+  - clog_to_syslog
+  flags:
+  - runtime
+  with_legacy: true
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+- name: clog_to_syslog_facility
+  type: str
+  level: advanced
+  desc: Syslog facility for cluster log messages
+  default: default=daemon audit=local0
+  see_also:
+  - clog_to_syslog
+  flags:
+  - runtime
+  with_legacy: true
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+- name: clog_to_graylog
+  type: str
+  level: advanced
+  desc: Make daemons send cluster log to graylog
+  default: 'false'
+  flags:
+  - runtime
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+- name: clog_to_graylog_host
+  type: str
+  level: advanced
+  desc: Graylog host for cluster log messages
+  default: 127.0.0.1
+  see_also:
+  - clog_to_graylog
+  flags:
+  - runtime
+  with_legacy: true
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+- name: clog_to_graylog_port
+  type: str
+  level: advanced
+  desc: Graylog port number for cluster log messages
+  default: '12201'
+  see_also:
+  - clog_to_graylog
+  flags:
+  - runtime
+  with_legacy: true
+  services:
+  - mon
+  - mgr
+  - osd
+  - mds
+- name: enable_experimental_unrecoverable_data_corrupting_features
+  type: str
+  level: advanced
+  desc: Enable named (or all with '*') experimental features that may be untested,
+    dangerous, and/or cause permanent data loss
+  flags:
+  - runtime
+  with_legacy: true
+- name: plugin_dir
+  type: str
+  level: advanced
+  desc: Base directory for dynamically loaded plugins
+  default: @CEPH_INSTALL_FULL_PKGLIBDIR@
+  services:
+  - mon
+  - osd
+  flags:
+  - startup
+- name: compressor_zlib_isal
+  type: bool
+  level: advanced
+  desc: Use Intel ISA-L accelerated zlib implementation if available
+  default: false
+  with_legacy: true
+# regular zlib compression level, not applicable to isa-l optimized version
+- name: compressor_zlib_level
+  type: int
+  level: advanced
+  desc: Zlib compression level to use
+  default: 5
+  with_legacy: true
+# regular zlib compression winsize, not applicable to isa-l optimized version
+- name: compressor_zlib_winsize
+  type: int
+  level: advanced
+  desc: Zlib compression winsize to use
+  default: -15
+  min: -15
+  max: 32
+  with_legacy: true
+# regular zstd compression level
+- name: compressor_zstd_level
+  type: int
+  level: advanced
+  desc: Zstd compression level to use
+  default: 1
+  with_legacy: true
+- name: qat_compressor_enabled
+  type: bool
+  level: advanced
+  desc: Enable Intel QAT acceleration support for compression if available
+  default: false
+  with_legacy: true
+- name: qat_compressor_session_max_number
+  type: uint
+  level: advanced
+  desc: Set the maximum number of sessions within Qatzip when using QAT compressor
+  default: 256
+- name: plugin_crypto_accelerator
+  type: str
+  level: advanced
+  desc: Crypto accelerator library to use
+  default: crypto_isal
+  with_legacy: true
+- name: openssl_engine_opts
+  type: str
+  level: advanced
+  desc: Use engine for specific openssl algorithm
+  long_desc: 'Pass opts in this way: engine_id=engine1,dynamic_path=/some/path/engine1.so,default_algorithms=DIGESTS:engine_id=engine2,dynamic_path=/some/path/engine2.so,default_algorithms=CIPHERS,other_ctrl=other_value'
+  flags:
+  - startup
+  with_legacy: true
+- name: mempool_debug
+  type: bool
+  level: dev
+  default: false
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: thp
+  type: bool
+  level: dev
+  desc: enable transparent huge page (THP) support
+  long_desc: Ceph is known to suffer from memory fragmentation due to THP use. This
+    is indicated by RSS usage above configured memory targets. Enabling THP is currently
+    discouraged until selective use of THP by Ceph is implemented.
+  default: false
+  flags:
+  - startup
+- name: key
+  type: str
+  level: advanced
+  desc: Authentication key
+  long_desc: A CephX authentication key, base64 encoded.  It normally looks something
+    like 'AQAtut9ZdMbNJBAAHz6yBAWyJyz2yYRyeMWDag=='.
+  fmt_desc: The key (i.e., the text string of the key itself). Not recommended.
+  see_also:
+  - keyfile
+  - keyring
+  flags:
+  - no_mon_update
+  - startup
+  with_legacy: true
+- name: keyfile
+  type: str
+  level: advanced
+  desc: Path to a file containing a key
+  long_desc: The file should contain a CephX authentication key and optionally a trailing
+    newline, but nothing else.
+  fmt_desc: The path to a key file (i.e., a file containing only the key).
+  see_also:
+  - key
+  flags:
+  - no_mon_update
+  - startup
+  with_legacy: true
+- name: keyring
+  type: str
+  level: advanced
+  desc: Path to a keyring file.
+  long_desc: A keyring file is an INI-style formatted file where the section names
+    are client or daemon names (e.g., 'osd.0') and each section contains a 'key' property
+    with CephX authentication key as the value.
+  # please note, documents are generated without access to the CMake
+  # variables, so please update the document manually with a representative
+  # default value using the ":default:" option of ".. confval::" directive.
+  default: @keyring_paths@
+  see_also:
+  - key
+  - keyfile
+  flags:
+  - no_mon_update
+  - startup
+  with_legacy: true
+- name: heartbeat_interval
+  type: int
+  level: advanced
+  desc: Frequency of internal heartbeat checks (seconds)
+  default: 5
+  flags:
+  - startup
+  with_legacy: true
+- name: heartbeat_file
+  type: str
+  level: advanced
+  desc: File to touch on successful internal heartbeat
+  long_desc: If set, this file will be touched every time an internal heartbeat check
+    succeeds.
+  see_also:
+  - heartbeat_interval
+  flags:
+  - startup
+  with_legacy: true
+- name: heartbeat_inject_failure
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+- name: perf
+  type: bool
+  level: advanced
+  desc: Enable internal performance metrics
+  long_desc: If enabled, collect and expose internal health metrics
+  default: true
+  with_legacy: true
+- name: ms_type
+  type: str
+  level: advanced
+  desc: Messenger implementation to use for network communication
+  fmt_desc: Transport type used by Async Messenger. Can be ``async+posix``,
+    ``async+dpdk`` or ``async+rdma``. Posix uses standard TCP/IP networking and is
+    default. Other transports may be experimental and support may be limited.
+  default: async+posix
+  flags:
+  - startup
+  with_legacy: true
+- name: ms_public_type
+  type: str
+  level: advanced
+  desc: Messenger implementation to use for the public network
+  long_desc: If not specified, use ms_type
+  see_also:
+  - ms_type
+  flags:
+  - startup
+  with_legacy: true
+- name: ms_cluster_type
+  type: str
+  level: advanced
+  desc: Messenger implementation to use for the internal cluster network
+  long_desc: If not specified, use ms_type
+  see_also:
+  - ms_type
+  flags:
+  - startup
+  with_legacy: true
+- name: ms_mon_cluster_mode
+  type: str
+  level: basic
+  desc: Connection modes (crc, secure) for intra-mon connections in order of preference
+  fmt_desc: the connection mode (or permitted modes) to use between monitors.
+  default: secure crc
+  see_also:
+  - ms_mon_service_mode
+  - ms_mon_client_mode
+  - ms_service_mode
+  - ms_cluster_mode
+  - ms_client_mode
+  flags:
+  - startup
+- name: ms_mon_service_mode
+  type: str
+  level: basic
+  desc: Allowed connection modes (crc, secure) for connections to mons
+  fmt_desc: a list of permitted modes for clients or
+    other Ceph daemons to use when connecting to monitors.
+  default: secure crc
+  see_also:
+  - ms_service_mode
+  - ms_mon_cluster_mode
+  - ms_mon_client_mode
+  - ms_cluster_mode
+  - ms_client_mode
+  flags:
+  - startup
+- name: ms_mon_client_mode
+  type: str
+  level: basic
+  desc: Connection modes (crc, secure) for connections from clients to monitors in
+    order of preference
+  fmt_desc: a list of connection modes, in order of
+    preference, for clients or non-monitor daemons to use when
+    connecting to monitors.
+  default: secure crc
+  see_also:
+  - ms_mon_service_mode
+  - ms_mon_cluster_mode
+  - ms_service_mode
+  - ms_cluster_mode
+  - ms_client_mode
+  flags:
+  - startup
+- name: ms_cluster_mode
+  type: str
+  level: basic
+  desc: Connection modes (crc, secure) for intra-cluster connections in order of preference
+  fmt_desc: connection mode (or permitted modes) used
+    for intra-cluster communication between Ceph daemons.  If multiple
+    modes are listed, the modes listed first are preferred.
+  default: crc secure
+  see_also:
+  - ms_service_mode
+  - ms_client_mode
+  flags:
+  - startup
+- name: ms_service_mode
+  type: str
+  level: basic
+  desc: Allowed connection modes (crc, secure) for connections to daemons
+  fmt_desc: a list of permitted modes for clients to use
+    when connecting to the cluster.
+  default: crc secure
+  see_also:
+  - ms_cluster_mode
+  - ms_client_mode
+  flags:
+  - startup
+- name: ms_client_mode
+  type: str
+  level: basic
+  desc: Connection modes (crc, secure) for connections from clients in order of preference
+  fmt_desc: a list of connection modes, in order of
+    preference, for clients to use (or allow) when talking to a Ceph
+    cluster.
+  default: crc secure
+  see_also:
+  - ms_cluster_mode
+  - ms_service_mode
+  flags:
+  - startup
+- name: ms_osd_compress_mode
+  type: str
+  level: advanced
+  desc: Compression policy to use in Messenger for communicating with OSD
+  default: none
+  services:
+  - osd
+  enum_values:
+  - none
+  - force
+  see_also:
+  - ms_compress_secure
+  flags:
+  - runtime
+- name: ms_osd_compress_min_size
+  type: uint
+  level: advanced
+  desc: Minimal message size eligible for on-wire compression
+  default: 1_K
+  services:
+  - osd
+  see_also:
+  - ms_osd_compress_mode
+  flags:
+  - runtime
+- name: ms_osd_compression_algorithm
+  type: str
+  level: advanced
+  desc: Compression algorithm to use in Messenger when communicating with OSD
+  long_desc: Compression algorithm for connections with OSD in order of preference.
+    Although the default value is set to snappy, a list
+    (like snappy zlib zstd etc.) is acceptable as well. 
+  default: snappy
+  services:
+  - osd
+  see_also:
+  - ms_osd_compress_mode
+  flags:
+  - runtime
+- name: ms_compress_secure
+  type: bool
+  level: advanced
+  desc: Allowing compression when on-wire encryption is enabled
+  long_desc: Combining encryption with compression reduces the level of security of
+    messages between peers. In case both encryption and compression are enabled, 
+    compression setting will be ignored and message will not be compressed. 
+    This behaviour can be overridden using this setting.
+  default: false
+  see_also: 
+  - ms_osd_compress_mode
+  flags:
+  - runtime
+- name: ms_learn_addr_from_peer
+  type: bool
+  level: advanced
+  desc: Learn address from what IP our first peer thinks we connect from
+  long_desc: Use the IP address our first peer (usually a monitor) sees that we are
+    connecting from.  This is useful if a client is behind some sort of NAT and we
+    want to see it identified by its local (not NATed) address.
+  default: true
+  with_legacy: true
+- name: ms_tcp_nodelay
+  type: bool
+  level: advanced
+  desc: Disable Nagle's algorithm and send queued network traffic immediately
+  fmt_desc: Ceph enables ``ms_tcp_nodelay`` so that each request is sent
+   immediately (no buffering). Disabling `Nagle's algorithm`_
+   increases network traffic, which can introduce latency. If you
+   experience large numbers of small packets, you may try
+   disabling ``ms_tcp_nodelay``.
+  default: true
+  with_legacy: true
+- name: ms_tcp_rcvbuf
+  type: size
+  level: advanced
+  desc: Size of TCP socket receive buffer
+  fmt_desc: The size of the socket buffer on the receiving end of a network
+   connection. Disabled by default.
+  default: 0
+  with_legacy: true
+- name: ms_tcp_prefetch_max_size
+  type: size
+  level: advanced
+  desc: Maximum amount of data to prefetch out of the socket receive buffer
+  default: 4_K
+  with_legacy: true
+- name: ms_initial_backoff
+  type: float
+  level: advanced
+  desc: Initial backoff after a network error is detected (seconds)
+  fmt_desc: The initial time to wait before reconnecting on a fault.
+  default: 0.2
+  with_legacy: true
+- name: ms_max_backoff
+  type: float
+  level: advanced
+  desc: Maximum backoff after a network error before retrying (seconds)
+  fmt_desc: The maximum time to wait before reconnecting on a fault.
+  default: 15
+  see_also:
+  - ms_initial_backoff
+  with_legacy: true
+- name: ms_crc_data
+  type: bool
+  level: dev
+  desc: Set and/or verify crc32c checksum on data payload sent over network
+  default: true
+  with_legacy: true
+- name: ms_crc_header
+  type: bool
+  level: dev
+  desc: Set and/or verify crc32c checksum on header payload sent over network
+  default: true
+  with_legacy: true
+- name: ms_die_on_bad_msg
+  type: bool
+  level: dev
+  desc: Induce a daemon crash/exit when a bad network message is received
+  fmt_desc: Debug option; do not configure.
+  default: false
+  with_legacy: true
+- name: ms_die_on_unhandled_msg
+  type: bool
+  level: dev
+  desc: Induce a daemon crash/exit when an unrecognized message is received
+  default: false
+  with_legacy: true
+- name: ms_die_on_old_message
+  type: bool
+  level: dev
+  desc: Induce a daemon crash/exit when an old, undecodable message is received
+  default: false
+  with_legacy: true
+- name: ms_die_on_skipped_message
+  type: bool
+  level: dev
+  desc: Induce a daemon crash/exit if sender skips a message sequence number
+  default: false
+  with_legacy: true
+- name: ms_die_on_bug
+  type: bool
+  level: dev
+  desc: Induce a crash/exit on various bugs (for testing purposes)
+  default: false
+  with_legacy: true
+- name: ms_dispatch_throttle_bytes
+  type: size
+  level: advanced
+  desc: Limit messages that are read off the network but still being processed
+  fmt_desc: Throttles total size of messages waiting to be dispatched.
+  default: 100_M
+  with_legacy: true
+- name: ms_bind_ipv4
+  type: bool
+  level: advanced
+  desc: Bind servers to IPv4 address(es)
+  fmt_desc: Enables Ceph daemons to bind to IPv4 addresses.
+  default: true
+  see_also:
+  - ms_bind_ipv6
+- name: ms_bind_ipv6
+  type: bool
+  level: advanced
+  desc: Bind servers to IPv6 address(es)
+  fmt_desc: Enables Ceph daemons to bind to IPv6 addresses.
+  default: false
+  see_also:
+  - ms_bind_ipv4
+  with_legacy: true
+- name: ms_bind_prefer_ipv4
+  type: bool
+  level: advanced
+  desc: Prefer IPV4 over IPV6 address(es)
+  default: false
+- name: ms_bind_msgr1
+  type: bool
+  level: advanced
+  desc: Bind servers to msgr1 (legacy) protocol address(es)
+  default: true
+  see_also:
+  - ms_bind_msgr2
+- name: ms_bind_msgr2
+  type: bool
+  level: advanced
+  desc: Bind servers to msgr2 (nautilus+) protocol address(es)
+  default: true
+  see_also:
+  - ms_bind_msgr1
+- name: ms_bind_port_min
+  type: int
+  level: advanced
+  desc: Lowest port number to bind daemon(s) to
+  fmt_desc: The minimum port number to which an OSD or MDS daemon will bind.
+  default: 6800
+  with_legacy: true
+- name: ms_bind_port_max
+  type: int
+  level: advanced
+  desc: Highest port number to bind daemon(s) to
+  fmt_desc: The maximum port number to which an OSD or MDS daemon will bind.
+  default: 7568
+  with_legacy: true
+# FreeBSD does not use SO_REUSEADDR so allow for a bit more time per default
+- name: ms_bind_retry_count
+  type: int
+  level: advanced
+  desc: Number of attempts to make while bind(2)ing to a port
+  default: @ms_bind_retry_count@
+  with_legacy: true
+# FreeBSD does not use SO_REUSEADDR so allow for a bit more time per default
+- name: ms_bind_retry_delay
+  type: int
+  level: advanced
+  desc: Delay between bind(2) attempts (seconds)
+  default: @ms_bind_retry_delay@
+  with_legacy: true
+- name: ms_bind_before_connect
+  type: bool
+  level: advanced
+  desc: Call bind(2) on client sockets
+  default: false
+  with_legacy: true
+- name: ms_tcp_listen_backlog
+  type: int
+  level: advanced
+  desc: Size of queue of incoming connections for accept(2)
+  default: 512
+  with_legacy: true
+- name: ms_connection_ready_timeout
+  type: uint
+  level: advanced
+  desc: Time before we declare a not yet ready connection as dead (seconds)
+  default: 10
+  with_legacy: true
+- name: ms_connection_idle_timeout
+  type: uint
+  level: advanced
+  desc: Time before an idle connection is closed (seconds)
+  default: 900
+  with_legacy: true
+- name: ms_pq_max_tokens_per_priority
+  type: uint
+  level: dev
+  default: 16_M
+  with_legacy: true
+- name: ms_pq_min_cost
+  type: size
+  level: dev
+  default: 64_K
+  with_legacy: true
+- name: ms_inject_socket_failures
+  type: uint
+  level: dev
+  desc: Inject a socket failure every Nth socket operation
+  fmt_desc: Debug option; do not configure.
+  default: 0
+  with_legacy: true
+- name: ms_inject_delay_type
+  type: str
+  level: dev
+  desc: Entity type to inject delays for
+  flags:
+  - runtime
+  with_legacy: true
+- name: ms_inject_delay_max
+  type: float
+  level: dev
+  desc: Max delay to inject
+  default: 1
+  with_legacy: true
+- name: ms_inject_delay_probability
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: ms_inject_internal_delays
+  type: float
+  level: dev
+  desc: Inject various internal delays to induce races (seconds)
+  default: 0
+  with_legacy: true
+- name: ms_inject_network_congestion
+  type: uint
+  level: dev
+  desc: Inject a network congestions that stuck with N times operations
+  default: 0
+  with_legacy: true
+- name: ms_blackhole_osd
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: ms_blackhole_mon
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: ms_blackhole_mds
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: ms_blackhole_mgr
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: ms_blackhole_client
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: ms_dump_on_send
+  type: bool
+  level: advanced
+  desc: Hexdump message to debug log on message send
+  default: false
+  with_legacy: true
+- name: ms_dump_corrupt_message_level
+  type: int
+  level: advanced
+  desc: Log level at which to hexdump corrupt messages we receive
+  default: 1
+  with_legacy: true
+# number of worker processing threads for async messenger created on init
+- name: ms_async_op_threads
+  type: uint
+  level: advanced
+  desc: Threadpool size for AsyncMessenger (ms_type=async)
+  fmt_desc: Initial number of worker threads used by each Async Messenger instance.
+    Should be at least equal to highest number of replicas, but you can
+    decrease it if you are low on CPU core count and/or you host a lot of
+    OSDs on single server.
+  default: 3
+  min: 1
+  max: 24
+  with_legacy: true
+- name: ms_async_reap_threshold
+  type: uint
+  level: dev
+  desc: number of deleted connections before we reap
+  default: 5
+  min: 1
+  with_legacy: true
+- name: ms_async_rdma_device_name
+  type: str
+  level: advanced
+  with_legacy: true
+- name: ms_async_rdma_enable_hugepage
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: ms_async_rdma_buffer_size
+  type: size
+  level: advanced
+  default: 128_K
+  with_legacy: true
+- name: ms_async_rdma_send_buffers
+  type: uint
+  level: advanced
+  default: 1_K
+  with_legacy: true
+# size of the receive buffer pool, 0 is unlimited
+- name: ms_async_rdma_receive_buffers
+  type: uint
+  level: advanced
+  default: 32_K
+  with_legacy: true
+# max number of wr in srq
+- name: ms_async_rdma_receive_queue_len
+  type: uint
+  level: advanced
+  default: 4_K
+  with_legacy: true
+# support srq
+- name: ms_async_rdma_support_srq
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: ms_async_rdma_port_num
+  type: uint
+  level: advanced
+  default: 1
+  with_legacy: true
+- name: ms_async_rdma_polling_us
+  type: uint
+  level: advanced
+  default: 1000
+  with_legacy: true
+- name: ms_async_rdma_gid_idx
+  type: int
+  level: advanced
+  desc: use gid_idx to select GID for choosing RoCEv1 or RoCEv2
+  default: 0
+  with_legacy: true
+# GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding
+- name: ms_async_rdma_local_gid
+  type: str
+  level: advanced
+  with_legacy: true
+# 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.5
+- name: ms_async_rdma_roce_ver
+  type: int
+  level: advanced
+  default: 1
+  with_legacy: true
+# in RoCE, this means PCP
+- name: ms_async_rdma_sl
+  type: int
+  level: advanced
+  default: 3
+  with_legacy: true
+# in RoCE, this means DSCP
+- name: ms_async_rdma_dscp
+  type: int
+  level: advanced
+  default: 96
+  with_legacy: true
+# when there are enough accept failures, indicating there are unrecoverable failures,
+# just do ceph_abort() . Here we make it configurable.
+- name: ms_max_accept_failures
+  type: int
+  level: advanced
+  desc: The maximum number of consecutive failed accept() calls before considering
+    the daemon misconfigured and aborting it.
+  default: 4
+  with_legacy: true
+# rdma connection management
+- name: ms_async_rdma_cm
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: ms_async_rdma_type
+  type: str
+  level: advanced
+  default: ib
+  with_legacy: true
+- name: ms_dpdk_port_id
+  type: int
+  level: advanced
+  default: 0
+  with_legacy: true
+# it is modified in unittest so that use SAFE_OPTION to declare
+- name: ms_dpdk_coremask
+  type: str
+  level: advanced
+  default: '0xF'
+  see_also:
+  - ms_async_op_threads
+  with_legacy: true
+- name: ms_dpdk_memory_channel
+  type: str
+  level: advanced
+  default: '4'
+  with_legacy: true
+- name: ms_dpdk_hugepages
+  type: str
+  level: advanced
+  with_legacy: true
+- name: ms_dpdk_pmd
+  type: str
+  level: advanced
+  with_legacy: true
+- name: ms_dpdk_devs_allowlist
+  type: str
+  level: advanced
+  desc: PCIe addresses of the NICs that are allowed to be used
+  long_desc: for a single NIC use ms_dpdk_devs_allowlist=-a 0000:7d:010 or --allow=0000:7d:010;
+    for a bond nics use ms_dpdk_devs_allowlist=--allow=0000:7d:01.0 --allow=0000:7d:02.6
+    --vdev=net_bonding0,mode=2,slave=0000:7d:01.0,slave=0000:7d:02.6.
+- name: ms_dpdk_host_ipv4_addr
+  type: str
+  level: advanced
+  with_legacy: true
+- name: ms_dpdk_gateway_ipv4_addr
+  type: str
+  level: advanced
+  with_legacy: true
+- name: ms_dpdk_netmask_ipv4_addr
+  type: str
+  level: advanced
+  with_legacy: true
+- name: ms_dpdk_lro
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: ms_dpdk_enable_tso
+  type: bool
+  level: advanced
+  default: true
+- name: ms_dpdk_hw_flow_control
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+# Weighting of a hardware network queue relative to a software queue (0=no work, 1=equal share)
+- name: ms_dpdk_hw_queue_weight
+  type: float
+  level: advanced
+  default: 1
+  with_legacy: true
+- name: ms_dpdk_debug_allow_loopback
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: ms_dpdk_rx_buffer_count_per_core
+  type: int
+  level: advanced
+  default: 8192
+  with_legacy: true
+- name: inject_early_sigterm
+  type: bool
+  level: dev
+  desc: send ourselves a SIGTERM early during startup
+  default: false
+  with_legacy: true
+# list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster
+- name: mon_initial_members
+  type: str
+  level: advanced
+  fmt_desc: The IDs of initial monitors in a cluster during startup. If 
+    specified, Ceph requires an odd number of monitors to form an 
+    initial quorum (e.g., 3).
+  note: A *majority* of monitors in your cluster must be able to reach 
+    each other in order to establish a quorum. You can decrease the initial 
+    number of monitors to establish a quorum with this setting.
+  services:
+  - mon
+  flags:
+  - no_mon_update
+  - cluster_create
+  with_legacy: true
+- name: mon_max_pg_per_osd
+  type: uint
+  level: advanced
+  desc: Max number of PGs per OSD the cluster will allow
+  long_desc: If the number of PGs per OSD exceeds this, a health warning will be visible
+    in `ceph status`.  This is also used in automated PG management, as the threshold
+    at which some pools' pg_num may be shrunk in order to enable increasing the pg_num
+    of others.
+  default: 250
+  flags:
+  - runtime
+  services:
+  - mgr
+  - mon
+  min: 1
+- name: mon_osd_full_ratio
+  type: float
+  level: advanced
+  desc: full ratio of OSDs to be set during initial creation of the cluster
+  default: 0.95
+  flags:
+  - no_mon_update
+  - cluster_create
+  with_legacy: true
+- name: mon_osd_backfillfull_ratio
+  type: float
+  level: advanced
+  default: 0.9
+  flags:
+  - no_mon_update
+  - cluster_create
+  with_legacy: true
+- name: mon_osd_nearfull_ratio
+  type: float
+  level: advanced
+  desc: nearfull ratio for OSDs to be set during initial creation of cluster
+  default: 0.85
+  flags:
+  - no_mon_update
+  - cluster_create
+  with_legacy: true
+- name: mon_osd_initial_require_min_compat_client
+  type: str
+  level: advanced
+  default: luminous
+  flags:
+  - no_mon_update
+  - cluster_create
+  with_legacy: true
+- name: mon_allow_pool_delete
+  type: bool
+  level: advanced
+  desc: allow pool deletions
+  fmt_desc: Should monitors allow pools to be removed, regardless of what the pool flags say?
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: mon_fake_pool_delete
+  type: bool
+  level: advanced
+  desc: fake pool deletions by renaming the rados pool
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: mon_globalid_prealloc
+  type: uint
+  level: advanced
+  desc: number of globalid values to preallocate
+  long_desc: This setting caps how many new clients can authenticate with the cluster
+    before the monitors have to perform a write to preallocate more.  Large values
+    burn through the 64-bit ID space more quickly.
+  fmt_desc: The number of global IDs to pre-allocate for clients and daemons in the cluster.
+  default: 10000
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_report_timeout
+  type: int
+  level: advanced
+  desc: time before OSDs who do not report to the mons are marked down (seconds)
+  fmt_desc: The grace period in seconds before declaring
+              unresponsive Ceph OSD Daemons ``down``.
+  default: 15_min
+  services:
+  - mon
+  with_legacy: true
+- name: mon_warn_on_insecure_global_id_reclaim
+  type: bool
+  level: advanced
+  desc: issue AUTH_INSECURE_GLOBAL_ID_RECLAIM health warning if any connected
+    clients are insecurely reclaiming global_id
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_warn_on_insecure_global_id_reclaim_allowed
+  - auth_allow_insecure_global_id_reclaim
+  - auth_expose_insecure_global_id_reclaim
+- name: mon_warn_on_insecure_global_id_reclaim_allowed
+  type: bool
+  level: advanced
+  desc: issue AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED health warning if insecure
+    global_id reclaim is allowed
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_warn_on_insecure_global_id_reclaim
+  - auth_allow_insecure_global_id_reclaim
+  - auth_expose_insecure_global_id_reclaim
+- name: mon_warn_on_msgr2_not_enabled
+  type: bool
+  level: advanced
+  desc: issue MON_MSGR2_NOT_ENABLED health warning if monitors are all running Nautilus
+    but not all binding to a msgr2 port
+  default: true
+  services:
+  - mon
+  see_also:
+  - ms_bind_msgr2
+- name: mon_warn_on_slow_ping_time
+  type: float
+  level: advanced
+  desc: Override mon_warn_on_slow_ping_ratio with specified threshold in milliseconds
+  fmt_desc: Override ``mon_warn_on_slow_ping_ratio`` with a specific value.
+    Raise ``HEALTH_WARN`` if any heartbeat between OSDs exceeds
+    ``mon_warn_on_slow_ping_time`` milliseconds.  The default is 0 (disabled).
+  default: 0
+  services:
+  - mgr
+  - osd
+  see_also:
+  - mon_warn_on_slow_ping_ratio
+- name: mon_warn_on_slow_ping_ratio
+  type: float
+  level: advanced
+  desc: Issue a health warning if a heartbeat ping takes longer than this percentage of osd_heartbeat_grace
+  fmt_desc: Raise ``HEALTH_WARN`` when any heartbeat between OSDs exceeds
+    ``mon_warn_on_slow_ping_ratio`` of ``osd_heartbeat_grace``.
+  default: 0.05
+  services:
+  - mgr
+  - osd
+  see_also:
+  - osd_heartbeat_grace
+  - mon_warn_on_slow_ping_time
+- name: mon_max_snap_prune_per_epoch
+  type: uint
+  level: advanced
+  desc: max number of pruned snaps we will process in a single OSDMap epoch
+  default: 100
+  services:
+  - mon
+- name: mon_min_osdmap_epochs
+  type: int
+  level: advanced
+  desc: min number of OSDMaps to store
+  fmt_desc: Minimum number of OSD map epochs to keep at all times.
+  default: 500
+  services:
+  - mon
+  with_legacy: true
+- name: mon_max_log_epochs
+  type: int
+  level: advanced
+  desc: max number of past cluster log epochs to store
+  fmt_desc: Maximum number of Log epochs the monitor should keep.
+  default: 500
+  services:
+  - mon
+  with_legacy: true
+- name: mon_max_mdsmap_epochs
+  type: int
+  level: advanced
+  desc: max number of FSMaps/MDSMaps to store
+  fmt_desc: The maximum number of mdsmap epochs to trim during a single proposal.
+  default: 500
+  services:
+  - mon
+  with_legacy: true
+- name: mon_max_mgrmap_epochs
+  type: int
+  level: advanced
+  desc: max number of MgrMaps to store
+  default: 500
+  services:
+  - mon
+- name: mon_max_osd
+  type: int
+  level: advanced
+  desc: max number of OSDs in a cluster
+  fmt_desc: The maximum number of OSDs allowed in the cluster.
+  default: 10000
+  services:
+  - mon
+  with_legacy: true
+- name: mon_probe_timeout
+  type: float
+  level: advanced
+  desc: timeout for querying other mons during bootstrap pre-election phase (seconds)
+  fmt_desc: Number of seconds the monitor will wait to find peers before bootstrapping.
+  default: 2
+  services:
+  - mon
+  with_legacy: true
+- name: mon_client_bytes
+  type: size
+  level: advanced
+  desc: max bytes of outstanding client messages mon will read off the network
+  fmt_desc: The amount of client message data allowed in memory (in bytes).
+  default: 100_M
+  services:
+  - mon
+  with_legacy: true
+- name: mon_warn_pg_not_scrubbed_ratio
+  type: float
+  level: advanced
+  desc: Percentage of the scrub max interval past the scrub max interval to warn
+  default: 0.5
+  see_also:
+  - osd_scrub_max_interval
+  min: 0
+  with_legacy: true
+- name: mon_warn_pg_not_deep_scrubbed_ratio
+  type: float
+  level: advanced
+  desc: Percentage of the deep scrub interval past the deep scrub interval to warn
+  default: 0.75
+  see_also:
+  - osd_deep_scrub_interval
+  min: 0
+  with_legacy: true
+- name: mon_scrub_interval
+  type: secs
+  level: advanced
+  desc: frequency for scrubbing mon database
+  fmt_desc: How often the monitor scrubs its store by comparing
+    the stored checksums with the computed ones for all stored
+    keys. (0 disables it. dangerous, use with care)
+  default: 1_day
+  services:
+  - mon
+- name: mon_scrub_timeout
+  type: int
+  level: advanced
+  desc: timeout to restart scrub if a mon quorum participant does not respond for the
+    latest chunk
+  default: 5_min
+  services:
+  - mon
+  with_legacy: true
+- name: mon_scrub_max_keys
+  type: int
+  level: advanced
+  desc: max keys per scrub chunk/step
+  fmt_desc: The maximum number of keys to scrub each time.
+  default: 100
+  services:
+  - mon
+  with_legacy: true
+# probability of injected crc mismatch [0.0, 1.0]
+- name: mon_scrub_inject_crc_mismatch
+  type: float
+  level: dev
+  desc: probability for injecting crc mismatches into mon scrub
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+# probability of injected missing keys [0.0, 1.0]
+- name: mon_scrub_inject_missing_keys
+  type: float
+  level: dev
+  desc: probability for injecting missing keys into mon scrub
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+- name: mon_config_key_max_entry_size
+  type: size
+  level: advanced
+  desc: Defines the number of bytes allowed to be held in a single config-key entry
+  fmt_desc: The maximum size of config-key entry (in bytes)
+  default: 64_K
+  services:
+  - mon
+  with_legacy: true
+- name: mon_sync_timeout
+  type: float
+  level: advanced
+  desc: timeout before canceling sync if syncing mon does not respond
+  fmt_desc: Number of seconds the monitor will wait for the next update
+    message from its sync provider before it gives up and bootstraps
+    again.
+  default: 1_min
+  services:
+  - mon
+  with_legacy: true
+- name: mon_sync_max_payload_size
+  type: size
+  level: advanced
+  desc: target max message payload for mon sync
+  fmt_desc: The maximum size for a sync payload (in bytes).
+  default: 1_M
+  services:
+  - mon
+  with_legacy: true
+- name: mon_sync_max_payload_keys
+  type: int
+  level: advanced
+  desc: target max keys in message payload for mon sync
+  default: 2000
+  services:
+  - mon
+  with_legacy: true
+- name: mon_sync_debug
+  type: bool
+  level: dev
+  desc: enable extra debugging during mon sync
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: mon_inject_sync_get_chunk_delay
+  type: float
+  level: dev
+  desc: inject delay during sync (seconds)
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_min_down_reporters
+  type: uint
+  level: advanced
+  desc: number of OSDs from different subtrees who need to report a down OSD for it
+    to count
+  fmt_desc: The minimum number of Ceph OSD Daemons required to report a
+              ``down`` Ceph OSD Daemon.
+  default: 2
+  services:
+  - mon
+  see_also:
+  - mon_osd_reporter_subtree_level
+- name: mon_osd_reporter_subtree_level
+  type: str
+  level: advanced
+  desc: in which level of parent bucket the reporters are counted
+  fmt_desc: In which level of parent bucket the reporters are counted. The OSDs
+              send failure reports to monitors if they find a peer that is not responsive.
+              Monitors mark the reported ``OSD`` out and then ``down`` after a grace period.
+  default: host
+  services:
+  - mon
+  flags:
+  - runtime
+- name: mon_osd_snap_trim_queue_warn_on
+  type: int
+  level: advanced
+  desc: Warn when snap trim queue is that large (or larger).
+  long_desc: Warn when snap trim queue length for at least one PG crosses this value,
+    as this is an indicator of the snap trimmer not keeping up, wasting disk space
+  default: 32768
+  services:
+  - mon
+  with_legacy: true
+# force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous)
+- name: mon_osd_force_trim_to
+  type: int
+  level: dev
+  desc: force mons to trim osdmaps through this epoch
+  fmt_desc: Force monitor to trim osdmaps to this point, even if there are
+    PGs not clean at the specified epoch (0 disables it. dangerous,
+    use with care)
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+- name: mon_debug_extra_checks
+  type: bool
+  level: dev
+  desc: Enable some additional monitor checks
+  long_desc: Enable some additional monitor checks that would be too expensive to
+    run on production systems, or would only be relevant while testing or debugging.
+  default: false
+  services:
+  - mon
+- name: mon_debug_block_osdmap_trim
+  type: bool
+  level: dev
+  desc: Block OSDMap trimming while the option is enabled.
+  long_desc: Blocking OSDMap trimming may be quite helpful to easily reproduce states
+    in which the monitor keeps (hundreds of) thousands of osdmaps.
+  default: false
+  services:
+  - mon
+- name: mon_debug_deprecated_as_obsolete
+  type: bool
+  level: dev
+  desc: treat deprecated mon commands as obsolete
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: mon_debug_dump_transactions
+  type: bool
+  level: dev
+  desc: dump paxos transactions to log
+  default: false
+  services:
+  - mon
+  see_also:
+  - mon_debug_dump_location
+  with_legacy: true
+- name: mon_debug_dump_json
+  type: bool
+  level: dev
+  desc: dump paxos transactions to log as json
+  default: false
+  services:
+  - mon
+  see_also:
+  - mon_debug_dump_transactions
+  with_legacy: true
+- name: mon_debug_dump_location
+  type: str
+  level: dev
+  desc: file to dump paxos transactions to
+  default: /var/log/ceph/$cluster-$name.tdump
+  services:
+  - mon
+  see_also:
+  - mon_debug_dump_transactions
+  with_legacy: true
+- name: mon_debug_no_require_quincy
+  type: bool
+  level: dev
+  desc: do not set quincy feature for new mon clusters
+  default: false
+  services:
+  - mon
+  flags:
+  - cluster_create
+- name: mon_debug_no_require_reef
+  type: bool
+  level: dev
+  desc: do not set reef feature for new mon clusters
+  default: false
+  services:
+  - mon
+  flags:
+  - cluster_create
+- name: mon_debug_no_require_bluestore_for_ec_overwrites
+  type: bool
+  level: dev
+  desc: do not require bluestore OSDs to enable EC overwrites on a rados pool
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: mon_debug_no_initial_persistent_features
+  type: bool
+  level: dev
+  desc: do not set any monmap features for new mon clusters
+  default: false
+  services:
+  - mon
+  flags:
+  - cluster_create
+  with_legacy: true
+- name: mon_inject_transaction_delay_max
+  type: float
+  level: dev
+  desc: max duration of injected delay in paxos
+  default: 10
+  services:
+  - mon
+  with_legacy: true
+# range [0, 1]
+- name: mon_inject_transaction_delay_probability
+  type: float
+  level: dev
+  desc: probability of injecting a delay in paxos
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+- name: mon_inject_pg_merge_bounce_probability
+  type: float
+  level: dev
+  desc: probability of failing and reverting a pg_num decrement
+  default: 0
+  services:
+  - mon
+# kill the sync provider at a specific point in the work flow
+- name: mon_sync_provider_kill_at
+  type: int
+  level: dev
+  desc: kill mon sync provider at specific point
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+# kill the sync requester at a specific point in the work flow
+- name: mon_sync_requester_kill_at
+  type: int
+  level: dev
+  desc: kill mon sync requester at specific point
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+# force monitor to join quorum even if it has been previously removed from the map
+- name: mon_force_quorum_join
+  type: bool
+  level: advanced
+  desc: force mon to rejoin quorum even though it was just removed
+  fmt_desc: Force monitor to join quorum even if it has been previously removed from the map
+  default: false
+  services:
+  - mon
+  with_legacy: true
+# type of keyvaluedb backend
+- name: mon_keyvaluedb
+  type: str
+  level: advanced
+  desc: database backend to use for the mon database
+  default: rocksdb
+  services:
+  - mon
+  enum_values:
+  - leveldb
+  - rocksdb
+  flags:
+  - create
+  with_legacy: true
+# UNSAFE -- TESTING ONLY! Allows addition of a cache tier with preexisting snaps
+- name: mon_debug_unsafe_allow_tier_with_nonempty_snaps
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mon
+  with_legacy: true
+# required of mon, mds, osd daemons
+- name: auth_cluster_required
+  type: str
+  level: advanced
+  desc: authentication methods required by the cluster
+  fmt_desc: If enabled, the Ceph Storage Cluster daemons (i.e., ``ceph-mon``,
+   ``ceph-osd``, ``ceph-mds`` and ``ceph-mgr``) must authenticate with
+   each other. Valid settings are ``cephx`` or ``none``.
+  default: cephx
+  with_legacy: true
+# required by daemons of clients
+- name: auth_service_required
+  type: str
+  level: advanced
+  desc: authentication methods required by service daemons
+  fmt_desc: If enabled, the Ceph Storage Cluster daemons require Ceph Clients
+   to authenticate with the Ceph Storage Cluster in order to access
+   Ceph services. Valid settings are ``cephx`` or ``none``.
+  default: cephx
+  with_legacy: true
+# what clients require of daemons
+- name: auth_client_required
+  type: str
+  level: advanced
+  desc: authentication methods allowed by clients
+  fmt_desc: If enabled, the Ceph Client requires the Ceph Storage Cluster to
+   authenticate with the Ceph Client. Valid settings are ``cephx``
+   or ``none``.
+  default: cephx, none
+  with_legacy: true
+# deprecated; default value for above if they are not defined.
+- name: auth_supported
+  type: str
+  level: advanced
+  desc: authentication methods required (deprecated)
+  with_legacy: true
+- name: max_rotating_auth_attempts
+  type: int
+  level: advanced
+  desc: number of attempts to initialize rotating keys before giving up
+  default: 10
+  with_legacy: true
+- name: rotating_keys_bootstrap_timeout
+  type: int
+  level: advanced
+  desc: timeout for obtaining rotating keys during bootstrap phase (seconds)
+  default: 30
+- name: rotating_keys_renewal_timeout
+  type: int
+  level: advanced
+  desc: timeout for updating rotating keys (seconds)
+  default: 10
+- name: cephx_require_signatures
+  type: bool
+  level: advanced
+  default: false
+  fmt_desc: If set to ``true``, Ceph requires signatures on all message
+   traffic between the Ceph Client and the Ceph Storage Cluster, and
+   between daemons comprising the Ceph Storage Cluster.
+
+   Ceph Argonaut and Linux kernel versions prior to 3.19 do
+   not support signatures; if such clients are in use this
+   option can be turned off to allow them to connect.
+  with_legacy: true
+- name: cephx_require_version
+  type: int
+  level: advanced
+  desc: Cephx version required (1 = pre-mimic, 2 = mimic+)
+  default: 2
+  with_legacy: true
+- name: cephx_cluster_require_signatures
+  type: bool
+  level: advanced
+  default: false
+  fmt_desc:    If set to ``true``, Ceph requires signatures on all message
+   traffic between Ceph daemons comprising the Ceph Storage Cluster.
+  with_legacy: true
+- name: cephx_cluster_require_version
+  type: int
+  level: advanced
+  desc: Cephx version required by the cluster from clients (1 = pre-mimic, 2 = mimic+)
+  default: 2
+  with_legacy: true
+- name: cephx_service_require_signatures
+  type: bool
+  level: advanced
+  default: false
+  fmt_desc: If set to ``true``, Ceph requires signatures on all message
+   traffic between Ceph Clients and the Ceph Storage Cluster.
+  with_legacy: true
+- name: cephx_service_require_version
+  type: int
+  level: advanced
+  desc: Cephx version required from ceph services (1 = pre-mimic, 2 = mimic+)
+  default: 2
+  with_legacy: true
+# Default to signing session messages if supported
+- name: cephx_sign_messages
+  type: bool
+  level: advanced
+  default: true
+  fmt_desc: If the Ceph version supports message signing, Ceph will sign
+   all messages so they are more difficult to spoof.
+  with_legacy: true
+- name: auth_mon_ticket_ttl
+  type: float
+  level: advanced
+  default: 72_hr
+  with_legacy: true
+- name: auth_service_ticket_ttl
+  type: float
+  level: advanced
+  default: 1_hr
+  fmt_desc: When the Ceph Storage Cluster sends a Ceph Client a ticket for
+   authentication, the Ceph Storage Cluster assigns the ticket a
+   time to live.
+  with_legacy: true
+- name: auth_allow_insecure_global_id_reclaim
+  type: bool
+  level: advanced
+  desc: Allow reclaiming global_id without presenting a valid ticket proving
+    previous possession of that global_id
+  long_desc: Allowing unauthorized global_id (re)use poses a security risk.
+    Unfortunately, older clients may omit their ticket on reconnects and
+    therefore rely on this being allowed for preserving their global_id for
+    the lifetime of the client instance. Setting this value to false would
+    immediately prevent new connections from those clients (assuming
+    auth_expose_insecure_global_id_reclaim set to true) and eventually break
+    existing sessions as well (regardless of auth_expose_insecure_global_id_reclaim
+    setting).
+  default: true
+  see_also:
+  - mon_warn_on_insecure_global_id_reclaim
+  - mon_warn_on_insecure_global_id_reclaim_allowed
+  - auth_expose_insecure_global_id_reclaim
+  with_legacy: true
+- name: auth_expose_insecure_global_id_reclaim
+  type: bool
+  level: advanced
+  desc: Force older clients that may omit their ticket on reconnects to
+    reconnect as part of establishing a session
+  long_desc: 'In permissive mode (auth_allow_insecure_global_id_reclaim set
+    to true), this helps with identifying clients that are not patched. In
+    enforcing mode (auth_allow_insecure_global_id_reclaim set to false), this
+    is a fail-fast mechanism: don''t establish a session that will almost
+    inevitably be broken later.'
+  default: true
+  see_also:
+  - mon_warn_on_insecure_global_id_reclaim
+  - mon_warn_on_insecure_global_id_reclaim_allowed
+  - auth_allow_insecure_global_id_reclaim
+  with_legacy: true
+# if true, assert when weird things happen
+- name: auth_debug
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# how many mons to try to connect to in parallel during hunt
+- name: mon_client_hunt_parallel
+  type: uint
+  level: advanced
+  default: 3
+  with_legacy: true
+# try new mon every N seconds until we connect
+- name: mon_client_hunt_interval
+  type: float
+  level: advanced
+  default: 3
+  fmt_desc: The client will try a new monitor every ``N`` seconds until it
+    establishes a connection.
+  with_legacy: true
+# send logs every N seconds
+- name: mon_client_log_interval
+  type: float
+  level: advanced
+  desc: How frequently we send queued cluster log messages to mon
+  default: 1
+  with_legacy: true
+# ping every N seconds
+- name: mon_client_ping_interval
+  type: float
+  level: advanced
+  default: 10
+  fmt_desc: The client will ping the monitor every ``N`` seconds.
+  with_legacy: true
+# fail if we don't hear back
+- name: mon_client_ping_timeout
+  type: float
+  level: advanced
+  default: 30
+  with_legacy: true
+- name: mon_client_hunt_interval_backoff
+  type: float
+  level: advanced
+  default: 1.5
+  with_legacy: true
+- name: mon_client_hunt_interval_min_multiple
+  type: float
+  level: advanced
+  default: 1
+  with_legacy: true
+- name: mon_client_hunt_interval_max_multiple
+  type: float
+  level: advanced
+  default: 10
+  with_legacy: true
+- name: mon_client_max_log_entries_per_message
+  type: int
+  level: advanced
+  default: 1000
+  fmt_desc: The maximum number of log entries a monitor will generate
+    per client message.
+  with_legacy: true
+- name: mon_client_directed_command_retry
+  type: int
+  level: dev
+  desc: Number of times to try sending a command directed at a specific monitor
+  default: 2
+  with_legacy: true
+# whitespace-separated list of key=value pairs describing crush location
+- name: crush_location
+  type: str
+  level: advanced
+  with_legacy: true
+- name: crush_location_hook
+  type: str
+  level: advanced
+  with_legacy: true
+- name: crush_location_hook_timeout
+  type: int
+  level: advanced
+  default: 10
+  with_legacy: true
+- name: objecter_tick_interval
+  type: float
+  level: dev
+  default: 5
+  with_legacy: true
+# before we ask for a map
+- name: objecter_timeout
+  type: float
+  level: advanced
+  desc: Seconds before in-flight op is considered 'laggy' and we query mon for the
+    latest OSDMap
+  default: 10
+  with_legacy: true
+- name: objecter_inflight_op_bytes
+  type: size
+  level: advanced
+  desc: Max in-flight data in bytes (both directions)
+  default: 100_M
+  with_legacy: true
+- name: objecter_inflight_ops
+  type: uint
+  level: advanced
+  desc: Max in-flight operations
+  default: 1_K
+  with_legacy: true
+# num of completion locks per each session, for serializing same object responses
+- name: objecter_completion_locks_per_session
+  type: uint
+  level: dev
+  default: 32
+  with_legacy: true
+# suppress watch pings
+- name: objecter_inject_no_watch_ping
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# ignore the first reply for each write, and resend the osd op instead
+- name: objecter_retry_writes_after_first_reply
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: objecter_debug_inject_relock_delay
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: filer_max_purge_ops
+  type: uint
+  level: advanced
+  desc: Max in-flight operations for purging a striped range (e.g., MDS journal)
+  default: 10
+  with_legacy: true
+- name: filer_max_truncate_ops
+  type: uint
+  level: advanced
+  desc: Max in-flight operations for truncating/deleting a striped sequence (e.g.,
+    MDS journal)
+  default: 128
+  with_legacy: true
+- name: journaler_write_head_interval
+  type: int
+  level: advanced
+  desc: Interval in seconds between journal header updates (to help bound replay time)
+  default: 15
+# * journal object size
+- name: journaler_prefetch_periods
+  type: uint
+  level: advanced
+  desc: Number of striping periods to prefetch while reading MDS journal
+  default: 10
+  # we need at least 2 periods to make progress.
+  min: 2
+# * journal object size
+- name: journaler_prezero_periods
+  type: uint
+  level: advanced
+  desc: Number of striping periods to zero head of MDS journal write position
+  default: 5
+  # we need to zero at least two periods, minimum, to ensure that we
+  # have a full empty object/period in front of us.
+  min: 2
+- name: osd_calc_pg_upmaps_aggressively
+  type: bool
+  level: advanced
+  desc: try to calculate PG upmaps more aggressively, e.g., by doing a fairly exhaustive
+    search of existing PGs that can be unmapped or upmapped
+  default: true
+  flags:
+  - runtime
+- name: osd_calc_pg_upmaps_aggressively_fast
+  type: bool
+  level: advanced
+  desc: Prevent very long (>10 minutes) calculations in some extreme cases (applicable
+    only to aggressive mode)
+  default: true
+  flags:
+  - runtime
+- name: osd_calc_pg_upmaps_local_fallback_retries
+  type: uint
+  level: advanced
+  desc: Maximum number of PGs we can attempt to unmap or upmap for a specific overfull
+    or underfull osd per iteration
+  default: 100
+  flags:
+  - runtime
+# 1 = host
+- name: osd_crush_chooseleaf_type
+  type: int
+  level: dev
+  desc: default chooseleaf type for osdmaptool --create
+  fmt_desc: The bucket type to use for ``chooseleaf`` in a CRUSH rule. Uses
+    ordinal rank rather than name.
+  default: 1
+  flags:
+  - cluster_create
+  with_legacy: true
+# try to use gmt for hitset archive names if all osds in cluster support it
+- name: osd_pool_use_gmt_hitset
+  type: bool
+  level: dev
+  desc: use UTC for hitset timestamps
+  long_desc: This setting only exists for compatibility with hammer (and older) clusters.
+  default: true
+  with_legacy: true
+# whether turn on fast read on the pool or not
+- name: osd_pool_default_ec_fast_read
+  type: bool
+  level: advanced
+  desc: set ec_fast_read for new erasure-coded pools
+  fmt_desc: Whether to turn on fast read on the pool or not. It will be used as
+    the default setting of newly created erasure coded pools if ``fast_read``
+    is not specified at create time.
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: osd_pool_default_crush_rule
+  type: int
+  level: advanced
+  desc: CRUSH rule for newly created pools
+  fmt_desc: The default CRUSH rule to use when creating a replicated pool. The
+    default value of ``-1`` means "pick the rule with the lowest numerical ID and
+    use that".  This is to make pool creation work in the absence of rule 0.
+  default: -1
+  services:
+  - mon
+- name: osd_pool_default_size
+  type: uint
+  level: advanced
+  desc: the number of copies of an object for new replicated pools
+  fmt_desc: Sets the number of replicas for objects in the pool. The default
+    value is the same as
+    ``ceph osd pool set {pool-name} size {size}``.
+  default: 3
+  services:
+  - mon
+  min: 0
+  max: 10
+  flags:
+  - runtime
+- name: osd_pool_default_min_size
+  type: uint
+  level: advanced
+  desc: the minimal number of copies allowed to write to a degraded pool for new replicated
+    pools
+  long_desc: 0 means no specific default; ceph will use size-size/2
+  fmt_desc: Sets the minimum number of written replicas for objects in the
+    pool in order to acknowledge an I/O operation to the client.  If
+    minimum is not met, Ceph will not acknowledge the I/O to the
+    client, **which may result in data loss**. This setting ensures
+    a minimum number of replicas when operating in ``degraded`` mode.
+    The default value is ``0`` which means no particular minimum. If ``0``,
+    minimum is ``size - (size / 2)``.
+  default: 0
+  services:
+  - mon
+  see_also:
+  - osd_pool_default_size
+  min: 0
+  max: 255
+  flags:
+  - runtime
+- name: osd_pool_default_pg_num
+  type: uint
+  level: advanced
+  desc: number of PGs for new pools
+  fmt_desc: The default number of placement groups for a pool. The default
+    value is the same as ``pg_num`` with ``mkpool``.
+  long_desc: With default value of `osd_pool_default_pg_autoscale_mode` being 
+    `on` the number of PGs for new pools will start out with 1 pg, unless the 
+    user specifies the pg_num.
+  default: 32
+  services:
+  - mon
+  see_also: 
+  - osd_pool_default_pg_autoscale_mode
+  flags:
+  - runtime
+- name: osd_pool_default_pgp_num
+  type: uint
+  level: advanced
+  desc: number of PGs for placement purposes (0 to match pg_num)
+  fmt_desc: |
+    The default number of placement groups for placement for a pool.
+    The default value is the same as ``pgp_num`` with ``mkpool``.
+    PG and PGP should be equal (for now). Note: should not be set unless
+    autoscaling is disabled.
+  default: 0
+  services:
+  - mon
+  see_also:
+  - osd_pool_default_pg_num
+  - osd_pool_default_pg_autoscale_mode
+  flags:
+  - runtime
+- name: osd_pool_default_type
+  type: str
+  level: advanced
+  desc: default type of pool to create
+  default: replicated
+  services:
+  - mon
+  enum_values:
+  - replicated
+  - erasure
+  flags:
+  - runtime
+- name: osd_pool_default_erasure_code_profile
+  type: str
+  level: advanced
+  desc: default erasure code profile for new erasure-coded pools
+  default: plugin=jerasure technique=reed_sol_van k=2 m=2
+  services:
+  - mon
+  flags:
+  - runtime
+- name: osd_erasure_code_plugins
+  type: str
+  level: advanced
+  desc: erasure code plugins to load
+  default: @osd_erasure_code_plugins@
+  services:
+  - mon
+  - osd
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_pool_default_flags
+  type: int
+  level: dev
+  desc: (integer) flags to set on new pools
+  fmt_desc: The default flags for new pools.
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+# use new pg hashing to prevent pool/pg overlap
+- name: osd_pool_default_flag_hashpspool
+  type: bool
+  level: advanced
+  desc: set hashpspool (better hashing scheme) flag on new pools
+  default: true
+  services:
+  - mon
+  with_legacy: true
+# pool can't be deleted
+- name: osd_pool_default_flag_nodelete
+  type: bool
+  level: advanced
+  desc: set nodelete flag on new pools
+  fmt_desc: Set the ``nodelete`` flag on new pools, which prevents pool removal.
+  default: false
+  services:
+  - mon
+  with_legacy: true
+# pool's pg and pgp num can't be changed
+- name: osd_pool_default_flag_nopgchange
+  type: bool
+  level: advanced
+  desc: set nopgchange flag on new pools
+  fmt_desc: Set the ``nopgchange`` flag on new pools. Does not allow the number of PGs to be changed.
+  default: false
+  services:
+  - mon
+  with_legacy: true
+# pool's size and min size can't be changed
+- name: osd_pool_default_flag_nosizechange
+  type: bool
+  level: advanced
+  desc: set nosizechange flag on new pools
+  fmt_desc: Set the ``nosizechange`` flag on new pools. Does not allow the ``size`` to be changed.
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: osd_pool_default_flag_bulk
+  type: bool
+  level: advanced
+  desc: set bulk flag on new pools
+  fmt_desc: Set the ``bulk`` flag on new pools. Allows the autoscaler to use scale-down mode.
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: osd_pool_default_hit_set_bloom_fpp
+  type: float
+  level: advanced
+  default: 0.05
+  services:
+  - mon
+  see_also:
+  - osd_tier_default_cache_hit_set_type
+  with_legacy: true
+- name: osd_pool_default_cache_target_dirty_ratio
+  type: float
+  level: advanced
+  default: 0.4
+  with_legacy: true
+- name: osd_pool_default_cache_target_dirty_high_ratio
+  type: float
+  level: advanced
+  default: 0.6
+  with_legacy: true
+- name: osd_pool_default_cache_target_full_ratio
+  type: float
+  level: advanced
+  default: 0.8
+  with_legacy: true
+# seconds
+- name: osd_pool_default_cache_min_flush_age
+  type: int
+  level: advanced
+  default: 0
+  with_legacy: true
+# seconds
+- name: osd_pool_default_cache_min_evict_age
+  type: int
+  level: advanced
+  default: 0
+  with_legacy: true
+# max size to check for eviction
+- name: osd_pool_default_cache_max_evict_check_size
+  type: int
+  level: advanced
+  default: 10
+  with_legacy: true
+- name: osd_pool_default_pg_autoscale_mode
+  type: str
+  level: advanced
+  desc: Default PG autoscaling behavior for new pools
+  long_desc: With default value `on`, the autoscaler starts a new pool with 1
+    pg, unless the user specifies the pg_num.
+  default: 'on'
+  enum_values:
+  - 'off'
+  - 'warn'
+  - 'on'
+  flags:
+  - runtime
+- name: osd_pool_default_read_lease_ratio
+  type: float
+  level: dev
+  desc: Default read_lease_ratio for a pool, as a multiple of osd_heartbeat_grace
+  long_desc: This should be <= 1.0 so that the read lease will have expired by the
+    time we decide to mark a peer OSD down.
+  default: 0.8
+  see_also:
+  - osd_heartbeat_grace
+  flags:
+  - runtime
+  with_legacy: true
+# min target size for a HitSet
+- name: osd_hit_set_min_size
+  type: int
+  level: advanced
+  default: 1000
+  with_legacy: true
+# max target size for a HitSet
+- name: osd_hit_set_max_size
+  type: int
+  level: advanced
+  default: 100000
+  with_legacy: true
+# rados namespace for hit_set tracking
+- name: osd_hit_set_namespace
+  type: str
+  level: advanced
+  default: .ceph-internal
+  with_legacy: true
+# conservative default throttling values
+- name: osd_tier_promote_max_objects_sec
+  type: uint
+  level: advanced
+  default: 25
+  with_legacy: true
+- name: osd_tier_promote_max_bytes_sec
+  type: size
+  level: advanced
+  default: 5_M
+  with_legacy: true
+- name: osd_tier_default_cache_mode
+  type: str
+  level: advanced
+  default: writeback
+  enum_values:
+  - none
+  - writeback
+  - forward
+  - readonly
+  - readforward
+  - readproxy
+  - proxy
+  flags:
+  - runtime
+- name: osd_tier_default_cache_hit_set_count
+  type: uint
+  level: advanced
+  default: 4
+- name: osd_tier_default_cache_hit_set_period
+  type: uint
+  level: advanced
+  default: 1200
+- name: osd_tier_default_cache_hit_set_type
+  type: str
+  level: advanced
+  default: bloom
+  enum_values:
+  - bloom
+  - explicit_hash
+  - explicit_object
+  flags:
+  - runtime
+- name: osd_tier_default_cache_min_read_recency_for_promote
+  type: uint
+  level: advanced
+  desc: number of recent HitSets the object must appear in to be promoted (on read)
+  default: 1
+- name: osd_tier_default_cache_min_write_recency_for_promote
+  type: uint
+  level: advanced
+  desc: number of recent HitSets the object must appear in to be promoted (on write)
+  default: 1
+- name: osd_tier_default_cache_hit_set_grade_decay_rate
+  type: uint
+  level: advanced
+  default: 20
+- name: osd_tier_default_cache_hit_set_search_last_n
+  type: uint
+  level: advanced
+  default: 1
+- name: osd_objecter_finishers
+  type: int
+  level: advanced
+  default: 1
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_map_dedup
+  type: bool
+  level: advanced
+  default: true
+  fmt_desc: Enable removing duplicates in the OSD map.
+  with_legacy: true
+- name: osd_map_message_max
+  type: int
+  level: advanced
+  desc: maximum number of OSDMaps to include in a single message
+  fmt_desc: The maximum map entries allowed per MOSDMap message.
+  default: 40
+  services:
+  - osd
+  - mon
+  with_legacy: true
+- name: osd_map_message_max_bytes
+  type: size
+  level: advanced
+  desc: maximum number of bytes worth of OSDMaps to include in a single message
+  default: 10_M
+  services:
+  - osd
+  - mon
+  with_legacy: true
+# do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
+- name: osd_ignore_stale_divergent_priors
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_heartbeat_interval
+  type: int
+  level: dev
+  desc: Interval (in seconds) between peer pings
+  fmt_desc: How often a Ceph OSD Daemon pings its peers (in seconds).
+  default: 6
+  min: 1
+  max: 1_min
+  with_legacy: true
+# (seconds) how long before we decide a peer has failed
+# This setting is read by the MONs and OSDs and has to be set to an equal value in both sections of the configuration
+- name: osd_heartbeat_grace
+  type: int
+  level: advanced
+  default: 20
+  fmt_desc: The elapsed time without a heartbeat from a Ceph OSD Daemon after
+              which the Ceph Storage Cluster considers it ``down``.
+              This setting must be set in both the [mon] and [osd] or [global]
+              sections so that it is read by both monitor and OSD daemons.
+  with_legacy: true
+- name: osd_heartbeat_stale
+  type: int
+  level: advanced
+  desc: Interval (in seconds) we mark an unresponsive heartbeat peer as stale.
+  long_desc: Automatically mark unresponsive heartbeat sessions as stale and tear
+    them down. The primary benefit is that OSD doesn't need to keep a flood of blocked
+    heartbeat messages around in memory.
+  default: 10_min
+# prio the heartbeat tcp socket and set dscp as CS6 on it if true
+- name: osd_heartbeat_use_min_delay_socket
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# the minimum size of OSD heartbeat messages to send
+- name: osd_heartbeat_min_size
+  type: size
+  level: advanced
+  desc: Minimum heartbeat packet size in bytes. Will add dummy payload if heartbeat
+    packet is smaller than this.
+  default: 2000
+  with_legacy: true
+# max number of parallel snap trims/pg
+- name: osd_pg_max_concurrent_snap_trims
+  type: uint
+  level: advanced
+  default: 2
+  min: 1
+  with_legacy: true
+# max number of trimming pgs
+- name: osd_max_trimming_pgs
+  type: uint
+  level: advanced
+  default: 2
+  with_legacy: true
+# minimum number of peers that must be reachable to mark ourselves
+# back up after being wrongly marked down.
+- name: osd_heartbeat_min_healthy_ratio
+  type: float
+  level: advanced
+  default: 0.33
+  with_legacy: true
+# (seconds) how often to ping monitor if no peers
+- name: osd_mon_heartbeat_interval
+  type: int
+  level: advanced
+  default: 30
+  fmt_desc: How often the Ceph OSD Daemon pings a Ceph Monitor if it has no
+              Ceph OSD Daemon peers.
+  with_legacy: true
+- name: osd_mon_heartbeat_stat_stale
+  type: int
+  level: advanced
+  desc: Stop reporting on heartbeat ping times not updated for this many seconds.
+  long_desc: Stop reporting on old heartbeat information unless this is set to zero
+  fmt_desc: Stop reporting on heartbeat ping times which haven't been updated for
+              this many seconds.  Set to zero to disable this action.
+  default: 1_hr
+# failures, up_thru, boot.
+- name: osd_mon_report_interval
+  type: int
+  level: advanced
+  desc: Frequency of OSD reports to mon for peer failures, fullness status changes
+  fmt_desc: The number of seconds a Ceph OSD Daemon may wait
+              from startup or another reportable event before reporting
+              to a Ceph Monitor.
+  default: 5
+  with_legacy: true
+# max updates in flight
+- name: osd_mon_report_max_in_flight
+  type: int
+  level: advanced
+  default: 2
+  with_legacy: true
+# (seconds) how often to send beacon message to monitor
+- name: osd_beacon_report_interval
+  type: int
+  level: advanced
+  default: 5_min
+  with_legacy: true
+# report pg stats for any given pg at least this often
+- name: osd_pg_stat_report_interval_max
+  type: int
+  level: advanced
+  default: 500
+  with_legacy: true
+# Max number of snap intervals to report to mgr in pg_stat_t
+- name: osd_max_snap_prune_intervals_per_epoch
+  type: uint
+  level: dev
+  desc: Max number of snap intervals to report to mgr in pg_stat_t
+  default: 512
+  with_legacy: true
+- name: osd_default_data_pool_replay_window
+  type: int
+  level: advanced
+  default: 45
+  fmt_desc: The time (in seconds) for an OSD to wait for a client to replay
+    a request.
+- name: osd_auto_mark_unfound_lost
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_check_for_log_corruption
+  type: bool
+  level: advanced
+  default: false
+  fmt_desc: Check log files for corruption. Can be computationally expensive.
+  with_legacy: true
+- name: osd_use_stale_snap
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_rollback_to_cluster_snap
+  type: str
+  level: advanced
+  with_legacy: true
+- name: osd_default_notify_timeout
+  type: uint
+  level: advanced
+  desc: default number of seconds after which notify propagation times out. used if
+    a client has not specified other value
+  fmt_desc: The OSD default notification timeout (in seconds).
+  default: 30
+  with_legacy: true
+- name: osd_kill_backfill_at
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+# Bounds how infrequently a new map epoch will be persisted for a pg
+# make this < map_cache_size!
+- name: osd_pg_epoch_persisted_max_stale
+  type: uint
+  level: advanced
+  default: 40
+  with_legacy: true
+- name: osd_target_pg_log_entries_per_osd
+  type: uint
+  level: dev
+  desc: target number of PG entries total on an OSD - limited per pg by the min and
+    max options below
+  default: 300000
+  see_also:
+  - osd_max_pg_log_entries
+  - osd_min_pg_log_entries
+  with_legacy: true
+- name: osd_min_pg_log_entries
+  type: uint
+  level: dev
+  desc: minimum number of entries to maintain in the PG log
+  fmt_desc: The minimum number of placement group logs to maintain
+    when trimming log files.
+  default: 250
+  services:
+  - osd
+  see_also:
+  - osd_max_pg_log_entries
+  - osd_pg_log_dups_tracked
+  - osd_target_pg_log_entries_per_osd
+  with_legacy: true
+- name: osd_max_pg_log_entries
+  type: uint
+  level: dev
+  desc: maximum number of entries to maintain in the PG log
+  fmt_desc: The maximum number of placement group logs to maintain
+    when trimming log files.
+  default: 10000
+  services:
+  - osd
+  see_also:
+  - osd_min_pg_log_entries
+  - osd_pg_log_dups_tracked
+  - osd_target_pg_log_entries_per_osd
+  with_legacy: true
+- name: osd_pg_log_dups_tracked
+  type: uint
+  level: dev
+  desc: how many versions back to track in order to detect duplicate ops; this is
+    combined with both the regular pg log entries and additional minimal dup detection
+    entries
+  default: 3000
+  services:
+  - osd
+  see_also:
+  - osd_min_pg_log_entries
+  - osd_max_pg_log_entries
+  with_legacy: true
+- name: osd_object_clean_region_max_num_intervals
+  type: int
+  level: dev
+  desc: number of intervals in clean_offsets
+  long_desc: partial recovery uses multiple intervals to record the clean part of
+    the object. When the number of intervals is greater than osd_object_clean_region_max_num_intervals,
+    the minimum interval will be trimmed (0 will recover the entire object data interval)
+  default: 10
+  services:
+  - osd
+  with_legacy: true
+# max entries factor before force recovery
+- name: osd_force_recovery_pg_log_entries_factor
+  type: float
+  level: dev
+  default: 1.3
+  with_legacy: true
+- name: osd_pg_log_trim_min
+  type: uint
+  level: dev
+  desc: Minimum number of log entries to trim at once. This lets us trim in larger
+    batches rather than with each write.
+  default: 100
+  see_also:
+  - osd_max_pg_log_entries
+  - osd_min_pg_log_entries
+  with_legacy: true
+- name: osd_force_auth_primary_missing_objects
+  type: uint
+  level: advanced
+  desc: Approximate missing objects above which to force auth_log_shard to be primary
+    temporarily
+  default: 100
+- name: osd_async_recovery_min_cost
+  type: uint
+  level: advanced
+  desc: A combined measure of the difference in number of current log entries and
+    the number of historical missing objects, above which we switch to asynchronous recovery when appropriate
+  default: 100
+  flags:
+  - runtime
+- name: osd_max_pg_per_osd_hard_ratio
+  type: float
+  level: advanced
+  desc: Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'
+  long_desc: OSD will refuse to instantiate PG if the number of PG it serves exceeds
+    this number.
+  fmt_desc: The ratio of number of PGs per OSD allowed by the cluster before the
+    OSD refuses to create new PGs. An OSD stops creating new PGs if the number
+    of PGs it serves exceeds
+    ``osd_max_pg_per_osd_hard_ratio`` \* ``mon_max_pg_per_osd``.
+  default: 3
+  see_also:
+  - mon_max_pg_per_osd
+  min: 1
+- name: osd_pg_log_trim_max
+  type: uint
+  level: advanced
+  desc: maximum number of entries to remove at once from the PG log
+  default: 10000
+  services:
+  - osd
+  see_also:
+  - osd_min_pg_log_entries
+  - osd_max_pg_log_entries
+  with_legacy: true
+# how many seconds old makes an op complaint-worthy
+- name: osd_op_complaint_time
+  type: float
+  level: advanced
+  default: 30
+  fmt_desc: An operation becomes complaint worthy after the specified number
+    of seconds have elapsed.
+  with_legacy: true
+- name: osd_command_max_records
+  type: int
+  level: advanced
+  default: 256
+  fmt_desc: Limits the number of lost objects to return.
+  with_legacy: true
+# max peer osds to report that are blocking our progress
+- name: osd_max_pg_blocked_by
+  type: uint
+  level: advanced
+  default: 16
+  with_legacy: true
+- name: osd_op_log_threshold
+  type: int
+  level: advanced
+  default: 5
+  fmt_desc: How many operation logs to display at once.
+  with_legacy: true
+- name: osd_backoff_on_unfound
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+# [mainly for debug?] object unreadable/writeable
+- name: osd_backoff_on_degraded
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# [debug] pg peering
+- name: osd_backoff_on_peering
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_debug_shutdown
+  type: bool
+  level: dev
+  desc: Turn up debug levels during shutdown
+  default: false
+  with_legacy: true
+# crash osd if client ignores a backoff; useful for debugging
+- name: osd_debug_crash_on_ignored_backoff
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_inject_dispatch_delay_probability
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: osd_debug_inject_dispatch_delay_duration
+  type: float
+  level: dev
+  default: 0.1
+  with_legacy: true
+- name: osd_debug_drop_ping_probability
+  desc: N/A
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: osd_debug_drop_ping_duration
+  desc: N/A
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+- name: osd_debug_op_order
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_verify_missing_on_start
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_verify_snaps
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_verify_stray_on_activate
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_skip_full_check_in_backfill_reservation
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_reject_backfill_probability
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# inject failure during copyfrom completion
+- name: osd_debug_inject_copyfrom_error
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_misdirected_ops
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_skip_full_check_in_recovery
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_random_push_read_error
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: osd_debug_verify_cached_snaps
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_deep_scrub_sleep
+  type: float
+  level: dev
+  desc: Inject an expensive sleep during deep scrub IO to make it easier to induce
+    preemption
+  default: 0
+  with_legacy: true
+- name: osd_debug_no_acting_change
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_no_purge_strays
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: osd_debug_pretend_recovery_active
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# enable/disable OSD op tracking
+- name: osd_enable_op_tracker
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+# The number of shards for holding the ops
+- name: osd_num_op_tracker_shard
+  type: uint
+  level: advanced
+  default: 32
+  with_legacy: true
+# Max number of completed ops to track
+- name: osd_op_history_size
+  type: uint
+  level: advanced
+  default: 20
+  fmt_desc: The maximum number of completed operations to track.
+  with_legacy: true
+# Oldest completed op to track
+- name: osd_op_history_duration
+  type: uint
+  level: advanced
+  default: 600
+  fmt_desc: The oldest completed operation to track.
+  with_legacy: true
+# Max number of slow ops to track
+- name: osd_op_history_slow_op_size
+  type: uint
+  level: advanced
+  default: 20
+  with_legacy: true
+# track the op if over this threshold
+- name: osd_op_history_slow_op_threshold
+  type: float
+  level: advanced
+  default: 10
+  with_legacy: true
+# to adjust various transactions that batch smaller items
+- name: osd_target_transaction_size
+  type: int
+  level: advanced
+  default: 30
+  with_legacy: true
+# what % full makes an OSD "full" (failsafe)
+- name: osd_failsafe_full_ratio
+  type: float
+  level: advanced
+  default: 0.97
+  with_legacy: true
+- name: osd_fast_shutdown
+  type: bool
+  level: advanced
+  desc: Fast, immediate shutdown
+  long_desc: Setting this to false makes the OSD do a slower teardown of all state
+    when it receives a SIGINT or SIGTERM or when shutting down for any other reason.  That
+    slow shutdown is primarily useful for doing memory leak checking with valgrind.
+  default: true
+  with_legacy: true
+- name: osd_fast_shutdown_timeout
+  type: int
+  level: advanced
+  desc: timeout in seconds for osd fast-shutdown (0 is unlimited)
+  default: 15
+  with_legacy: true
+  min: 0
+- name: osd_fast_shutdown_notify_mon
+  type: bool
+  level: advanced
+  desc: Tell mon about OSD shutdown on immediate shutdown
+  long_desc: Tell the monitor the OSD is shutting down on immediate shutdown. This
+    helps with cluster log messages from other OSDs reporting it immediately failed.
+  default: true
+  see_also:
+  - osd_fast_shutdown
+  - osd_mon_shutdown_timeout
+  with_legacy: true
+# immediately mark OSDs as down once they refuse to accept connections
+- name: osd_fast_fail_on_connection_refused
+  type: bool
+  level: advanced
+  default: true
+  fmt_desc: If this option is enabled, crashed OSDs are marked down
+    immediately by connected peers and MONs (assuming that the
+    crashed OSD host survives). Disable it to restore old
+    behavior, at the expense of possible long I/O stalls when
+    OSDs crash in the middle of I/O operations.
+  with_legacy: true
+- name: osd_pg_object_context_cache_count
+  type: int
+  level: advanced
+  default: 64
+  with_legacy: true
+# true if LTTng-UST tracepoints should be enabled
+- name: osd_tracing
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# true if function instrumentation should use LTTng
+- name: osd_function_tracing
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# use fast info attr, if we can
+- name: osd_fast_info
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+# determines whether PGLog::check() compares written out log to stored log
+- name: osd_debug_pg_log_writeout
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# Max number of loops before we reset thread-pool's handle
+- name: osd_loop_before_reset_tphandle
+  type: uint
+  level: advanced
+  default: 64
+  with_legacy: true
+# default timeout while calling WaitInterval on an empty queue
+- name: threadpool_default_timeout
+  type: int
+  level: advanced
+  default: 1_min
+  with_legacy: true
+# default wait time for an empty queue before pinging the hb timeout
+- name: threadpool_empty_queue_max_wait
+  type: int
+  level: advanced
+  default: 2
+  with_legacy: true
+- name: leveldb_log_to_ceph_log
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: leveldb_write_buffer_size
+  type: size
+  level: advanced
+  default: 8_M
+  with_legacy: true
+- name: leveldb_cache_size
+  type: size
+  level: advanced
+  default: 128_M
+  with_legacy: true
+- name: leveldb_block_size
+  type: size
+  level: advanced
+  default: 0
+  with_legacy: true
+- name: leveldb_bloom_size
+  type: int
+  level: advanced
+  default: 0
+  with_legacy: true
+- name: leveldb_max_open_files
+  type: int
+  level: advanced
+  default: 0
+  with_legacy: true
+- name: leveldb_compression
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: leveldb_paranoid
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: leveldb_log
+  type: str
+  level: advanced
+  default: /dev/null
+  with_legacy: true
+- name: leveldb_compact_on_mount
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: rocksdb_log_to_ceph_log
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: rocksdb_cache_size
+  type: size
+  level: advanced
+  default: 512_M
+  flags:
+  - runtime
+  with_legacy: true
+# ratio of cache for row (vs block)
+- name: rocksdb_cache_row_ratio
+  type: float
+  level: advanced
+  default: 0
+  with_legacy: true
+# rocksdb block cache shard bits, 4 bit -> 16 shards
+- name: rocksdb_cache_shard_bits
+  type: int
+  level: advanced
+  default: 4
+  with_legacy: true
+# 'lru' or 'clock'
+- name: rocksdb_cache_type
+  type: str
+  level: advanced
+  default: binned_lru
+  with_legacy: true
+- name: rocksdb_block_size
+  type: size
+  level: advanced
+  default: 4_K
+  with_legacy: true
+# Enabling this will have 5-10% impact on performance for the stats collection
+- name: rocksdb_perf
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
+- name: rocksdb_collect_compaction_stats
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
+- name: rocksdb_collect_extended_stats
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# For rocksdb, this behavior will be an overhead of 5%~10%, collected only if rocksdb_perf is enabled.
+- name: rocksdb_collect_memory_stats
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: rocksdb_delete_range_threshold
+  type: uint
+  level: advanced
+  desc: The number of keys required to invoke DeleteRange when deleting multiple keys.
+  default: 1_M
+- name: rocksdb_bloom_bits_per_key
+  type: uint
+  level: advanced
+  desc: Number of bits per key to use for RocksDB's bloom filters.
+  long_desc: 'RocksDB bloom filters can be used to quickly answer the question of
+    whether or not a key may exist or definitely does not exist in a given RocksDB
+    SST file without having to read all keys into memory.  Using a higher bit value
+    decreases the likelihood of false positives at the expense of additional disk
+    space and memory consumption when the filter is loaded into RAM.  The current
+    default value of 20 was found to provide significant performance gains when getattr
+    calls are made (such as during new object creation in bluestore) without significant
+    memory overhead or cache pollution when combined with rocksdb partitioned index
+    filters.  See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters
+    for more information.'
+  default: 20
+- name: rocksdb_cache_index_and_filter_blocks
+  type: bool
+  level: dev
+  desc: Whether to cache indices and filters in block cache
+  long_desc: By default RocksDB will load an SST file's index and bloom filters into
+    memory when it is opened and remove them from memory when an SST file is closed.  Thus,
+    memory consumption by indices and bloom filters is directly tied to the number
+    of concurrent SST files allowed to be kept open.  This option instead stores cached
+    indices and filters in the block cache where they directly compete with other
+    cached data.  By default we set this option to true to better account for and
+    bound rocksdb memory usage and keep filters in memory even when an SST file is
+    closed.
+  default: true
+- name: rocksdb_cache_index_and_filter_blocks_with_high_priority
+  type: bool
+  level: dev
+  desc: Whether to cache indices and filters in the block cache with high priority
+  long_desc: A downside of setting rocksdb_cache_index_and_filter_blocks to true is
+    that regular data can push indices and filters out of memory.  Setting this option
+    to true means they are cached with higher priority than other data and should
+    typically stay in the block cache.
+  default: false
+- name: rocksdb_pin_l0_filter_and_index_blocks_in_cache
+  type: bool
+  level: dev
+  desc: Whether to pin Level 0 indices and bloom filters in the block cache
+  long_desc: A downside of setting rocksdb_cache_index_and_filter_blocks to true is
+    that regular data can push indices and filters out of memory.  Setting this option
+    to true means that level 0 SST files will always have their indices and filters
+    pinned in the block cache.
+  default: false
+- name: rocksdb_index_type
+  type: str
+  level: dev
+  desc: 'Type of index for SST files: binary_search, hash_search, two_level'
+  long_desc: 'This option controls the table index type.  binary_search is a space
+    efficient index block that is optimized for block-search-based index. hash_search
+    may improve prefix lookup performance at the expense of higher disk and memory
+    usage and potentially slower compactions.  two_level is an experimental index
+    type that uses two binary search indexes and works in conjunction with partition
+    filters.  See: http://rocksdb.org/blog/2017/05/12/partitioned-index-filter.html'
+  default: binary_search
+- name: rocksdb_partition_filters
+  type: bool
+  level: dev
+  desc: (experimental) partition SST index/filters into smaller blocks
+  long_desc: 'This is an experimental option for rocksdb that works in conjunction
+    with two_level indices to avoid having to keep the entire filter/index in cache
+    when cache_index_and_filter_blocks is true.  The idea is to keep a much smaller
+    top-level index in heap/cache and then opportunistically cache the lower level
+    indices.  See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters'
+  default: false
+- name: rocksdb_metadata_block_size
+  type: size
+  level: dev
+  desc: The block size for index partitions. (0 = rocksdb default)
+  default: 4_K
+# osd_*_priority adjust the relative priority of client io, recovery io,
+# snaptrim io, etc
+#
+# osd_*_priority determines the ratio of available io between client and
+# recovery.  Each option may be set between
+# 1..63.
+- name: rocksdb_cf_compact_on_deletion
+  type: bool
+  level: dev
+  desc: Compact the column family when a certain number of tombstones are observed within a given window.
+  long_desc: 'This setting instructs RocksDB to compact a column family when a certain
+    number of tombstones are observed during iteration within a certain sliding window.
+    For instance if rocksdb_cf_compact_on_deletion_sliding_window is 8192 and
+    rocksdb_cf_compact_on_deletion_trigger is 4096,  then once 4096 tombstones are
+    observed after iteration over 8192 entries, the column family will be compacted.'
+  default: true
+  with_legacy: true
+  see_also:
+  - rocksdb_cf_compact_on_deletion_sliding_window
+  - rocksdb_cf_compact_on_deletion_trigger
+- name: rocksdb_cf_compact_on_deletion_sliding_window
+  type: int
+  level: dev
+  desc: The sliding window to use when rocksdb_cf_compact_on_deletion is enabled.
+  default: 32768
+  with_legacy: true
+  see_also:
+  - rocksdb_cf_compact_on_deletion
+- name: rocksdb_cf_compact_on_deletion_trigger
+  type: int
+  level: dev
+  desc: The trigger to use when rocksdb_cf_compact_on_deletion is enabled.
+  default: 16384
+  with_legacy: true
+  see_also:
+  - rocksdb_cf_compact_on_deletion
+- name: osd_client_op_priority
+  type: uint
+  level: advanced
+  default: 63
+  fmt_desc: The priority set for client operations.  This value is relative
+    to that of ``osd_recovery_op_priority`` below.  The default
+    strongly favors client ops over recovery.
+  with_legacy: true
+- name: osd_recovery_op_priority
+  type: uint
+  level: advanced
+  desc: Priority to use for recovery operations if not specified for the pool
+  fmt_desc: The priority of recovery operations vs client operations, if not specified by the
+    pool's ``recovery_op_priority``.  The default value prioritizes client
+    ops (see above) over recovery ops.  You may adjust the tradeoff of client
+    impact against the time to restore cluster health by lowering this value
+    for increased prioritization of client ops, or by increasing it to favor
+    recovery.
+  default: 3
+  with_legacy: true
+- name: osd_peering_op_priority
+  type: uint
+  level: dev
+  default: 255
+  with_legacy: true
+- name: osd_snap_trim_priority
+  type: uint
+  level: advanced
+  default: 5
+  fmt_desc: The priority set for the snap trim work queue.
+  with_legacy: true
+- name: osd_snap_trim_cost
+  type: size
+  level: advanced
+  default: 1_M
+  with_legacy: true
+- name: osd_pg_delete_priority
+  type: uint
+  level: advanced
+  default: 5
+  with_legacy: true
+- name: osd_pg_delete_cost
+  type: size
+  level: advanced
+  default: 1_M
+  with_legacy: true
+- name: osd_scrub_priority
+  type: uint
+  level: advanced
+  desc: Priority for scrub operations in work queue
+  fmt_desc: The default work queue priority for scheduled scrubs when the
+    pool doesn't specify a value of ``scrub_priority``.  This can be
+    boosted to the value of ``osd_client_op_priority`` when scrubs are
+    blocking client operations.
+  default: 5
+  with_legacy: true
+- name: osd_scrub_cost
+  type: size
+  level: advanced
+  desc: Cost for scrub operations in work queue
+  default: 50_M
+  with_legacy: true
+- name: osd_scrub_event_cost
+  type: size
+  level: advanced
+  desc: Cost for each scrub operation, used when osd_op_queue=mclock_scheduler
+  default: 4_K
+  with_legacy: true
+# set requested scrub priority higher than scrub priority to make the
+# requested scrubs jump the queue of scheduled scrubs
+- name: osd_requested_scrub_priority
+  type: uint
+  level: advanced
+  default: 120
+  fmt_desc: The priority set for user requested scrub on the work queue.  If
+    this value were to be smaller than ``osd_client_op_priority`` it
+    can be boosted to the value of ``osd_client_op_priority`` when
+    scrub is blocking client operations.
+  with_legacy: true
+- name: osd_recovery_priority
+  type: uint
+  level: advanced
+  desc: Priority of recovery in the work queue
+  long_desc: Not related to a pool's recovery_priority
+  fmt_desc: The default priority set for recovery work queue.  Not
+    related to a pool's ``recovery_priority``.
+  default: 5
+  with_legacy: true
+# set default cost equal to 20MB io
+- name: osd_recovery_cost
+  type: size
+  level: advanced
+  default: 20_M
+  with_legacy: true
+# osd_recovery_op_warn_multiple scales the normal warning threshold,
+# osd_op_complaint_time, so that slow recovery ops won't cause noise
+- name: osd_recovery_op_warn_multiple
+  type: uint
+  level: advanced
+  default: 16
+  with_legacy: true
+# Max time to wait between notifying mon of shutdown and shutting down
+- name: osd_mon_shutdown_timeout
+  type: float
+  level: advanced
+  default: 5
+  with_legacy: true
+# crash if the OSD has stray PG refs on shutdown
+- name: osd_shutdown_pgref_assert
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# OSD's maximum object size
+- name: osd_max_object_size
+  type: size
+  level: advanced
+  default: 128_M
+  fmt_desc: The maximum size of a RADOS object in bytes.
+  with_legacy: true
+# max rados object name len
+- name: osd_max_object_name_len
+  type: uint
+  level: advanced
+  default: 2_K
+  with_legacy: true
+# max rados object namespace len
+- name: osd_max_object_namespace_len
+  type: uint
+  level: advanced
+  default: 256
+  with_legacy: true
+# max rados attr name len; cannot go higher than 100 chars for file system backends
+- name: osd_max_attr_name_len
+  type: uint
+  level: advanced
+  default: 100
+  with_legacy: true
+- name: osd_max_attr_size
+  type: uint
+  level: advanced
+  default: 0
+  with_legacy: true
+- name: osd_max_omap_entries_per_request
+  type: uint
+  level: advanced
+  default: 1_K
+  with_legacy: true
+- name: osd_max_omap_bytes_per_request
+  type: size
+  level: advanced
+  default: 1_G
+  with_legacy: true
+# osd_recovery_op_warn_multiple scales the normal warning threshold,
+# osd_op_complaint_time, so that slow recovery ops won't cause noise
+- name: osd_max_write_op_reply_len
+  type: size
+  level: advanced
+  desc: Max size of the per-op payload for requests with the RETURNVEC flag set
+  long_desc: This value caps the amount of data (per op; a request may have many ops)
+    that will be sent back to the client and recorded in the PG log.
+  default: 64
+  with_legacy: true
+- name: osd_objectstore
+  type: str
+  level: advanced
+  desc: backend type for an OSD (like filestore or bluestore)
+  default: bluestore
+  enum_values:
+  - bluestore
+  - filestore
+  - memstore
+  - kstore
+  - seastore
+  - cyanstore
+  flags:
+  - create
+  with_legacy: true
+# true if LTTng-UST tracepoints should be enabled
+- name: osd_objectstore_tracing
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_objectstore_fuse
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_bench_small_size_max_iops
+  type: uint
+  level: advanced
+  default: 100
+  with_legacy: true
+- name: osd_bench_large_size_max_throughput
+  type: size
+  level: advanced
+  default: 100_M
+  with_legacy: true
+- name: osd_bench_max_block_size
+  type: size
+  level: advanced
+  default: 64_M
+  with_legacy: true
+# duration of 'osd bench', capped at 30s to avoid triggering timeouts
+- name: osd_bench_duration
+  type: uint
+  level: advanced
+  default: 30
+  with_legacy: true
+# create a blkin trace for all osd requests
+- name: osd_blkin_trace_all
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+# create a blkin trace for all objecter requests
+- name: osdc_blkin_trace_all
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_discard_disconnected_ops
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: osd_memory_target
+  type: size
+  level: basic
+  desc: When tcmalloc and cache autotuning is enabled, try to keep this many bytes
+    mapped in memory.
+  long_desc: The minimum value must be at least equal to osd_memory_base + osd_memory_cache_min.
+  fmt_desc: |
+    When TCMalloc is available and cache autotuning is enabled, try to
+    keep this many bytes mapped in memory. Note: This may not exactly
+    match the RSS memory usage of the process.  While the total amount
+    of heap memory mapped by the process should usually be close
+    to this target, there is no guarantee that the kernel will actually
+    reclaim memory that has been unmapped.  During initial development,
+    it was found that some kernels result in the OSD's RSS memory
+    exceeding the mapped memory by up to 20%.  It is hypothesised
+    however, that the kernel generally may be more aggressive about
+    reclaiming unmapped memory when there is a high amount of memory
+    pressure.  Your mileage may vary.
+  default: 4_G
+  see_also:
+  - bluestore_cache_autotune
+  - osd_memory_cache_min
+  - osd_memory_base
+  - osd_memory_target_autotune
+  min: 896_M
+  flags:
+  - runtime
+- name: osd_memory_target_autotune
+  type: bool
+  default: false
+  level: advanced
+  desc: If enabled, allow orchestrator to automatically tune osd_memory_target
+  see_also:
+  - osd_memory_target
+- name: osd_memory_target_cgroup_limit_ratio
+  type: float
+  level: advanced
+  desc: Set the default value for osd_memory_target to the cgroup memory limit (if
+    set) times this value
+  long_desc: A value of 0 disables this feature.
+  default: 0.8
+  see_also:
+  - osd_memory_target
+  min: 0
+  max: 1
+- name: osd_memory_base
+  type: size
+  level: dev
+  desc: When tcmalloc and cache autotuning is enabled, estimate the minimum amount
+    of memory in bytes the OSD will need.
+  fmt_desc: When TCMalloc and cache autotuning are enabled, estimate the minimum
+    amount of memory in bytes the OSD will need.  This is used to help
+    the autotuner estimate the expected aggregate memory consumption of
+    the caches.
+  default: 768_M
+  see_also:
+  - bluestore_cache_autotune
+  flags:
+  - runtime
+- name: osd_memory_expected_fragmentation
+  type: float
+  level: dev
+  desc: When tcmalloc and cache autotuning is enabled, estimate the percent of memory
+    fragmentation.
+  fmt_desc: When TCMalloc and cache autotuning is enabled, estimate the
+    percentage of memory fragmentation.  This is used to help the
+    autotuner estimate the expected aggregate memory consumption
+    of the caches.
+  default: 0.15
+  see_also:
+  - bluestore_cache_autotune
+  min: 0
+  max: 1
+  flags:
+  - runtime
+- name: osd_memory_cache_min
+  type: size
+  level: dev
+  desc: When tcmalloc and cache autotuning is enabled, set the minimum amount of memory
+    used for caches.
+  fmt_desc: |
+    When TCMalloc and cache autotuning are enabled, set the minimum
+    amount of memory used for caches. Note: Setting this value too
+    low can result in significant cache thrashing.
+  default: 128_M
+  see_also:
+  - bluestore_cache_autotune
+  min: 128_M
+  flags:
+  - runtime
+- name: osd_memory_cache_resize_interval
+  type: float
+  level: dev
+  desc: When tcmalloc and cache autotuning is enabled, wait this many seconds between
+    resizing caches.
+  fmt_desc: When TCMalloc and cache autotuning are enabled, wait this many
+    seconds between resizing caches.  This setting changes the total
+    amount of memory available for BlueStore to use for caching.  Note
+    that setting this interval too small can result in memory allocator
+    thrashing and lower performance.
+  default: 1
+  see_also:
+  - bluestore_cache_autotune
+- name: memstore_device_bytes
+  type: size
+  level: advanced
+  default: 1_G
+  with_legacy: true
+- name: memstore_page_set
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: memstore_page_size
+  type: size
+  level: advanced
+  default: 64_K
+  with_legacy: true
+- name: memstore_debug_omit_block_device_write
+  type: bool
+  level: dev
+  desc: write metadata only
+  default: false
+  see_also:
+  - bluestore_debug_omit_block_device_write
+  with_legacy: true
+- name: objectstore_blackhole
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: bdev_debug_inflight_ios
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# if N>0, then ~ 1/N IOs will complete before we crash on flush
+- name: bdev_inject_crash
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+# wait N more seconds on flush
+- name: bdev_inject_crash_flush_delay
+  type: int
+  level: dev
+  default: 2
+  with_legacy: true
+- name: bdev_aio
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+# milliseconds
+- name: bdev_aio_poll_ms
+  type: int
+  level: advanced
+  default: 250
+  with_legacy: true
+- name: bdev_aio_max_queue_depth
+  type: int
+  level: advanced
+  default: 1024
+  with_legacy: true
+- name: bdev_aio_reap_max
+  type: int
+  level: advanced
+  default: 16
+  with_legacy: true
+- name: bdev_block_size
+  type: size
+  level: advanced
+  default: 4_K
+  with_legacy: true
+- name: bdev_read_buffer_alignment
+  type: size
+  level: advanced
+  default: 4_K
+  with_legacy: true
+- name: bdev_read_preallocated_huge_buffers
+  type: str
+  level: advanced
+  desc: description of pools arrangement for huge page-based read buffers
+  long_desc: Arrangement of preallocated, huge pages-based pools for reading
+    from a KernelDevice. Applied to minimize size of scatter-gather lists
+    sent to NICs. Targets really big buffers (>= 2 or 4 MBs).
+    Keep in mind the system must be configured accordingly (see /proc/sys/vm/nr_hugepages).
+    Otherwise the OSD will fail early.
+    Beware BlueStore, by default, stores large chunks across many smaller blobs.
+    Increasing bluestore_max_blob_size changes that, and thus allows the data to
+    be read back into small number of huge page-backed buffers.
+  fmt_desc: List of key=value pairs delimited by comma, semicolon or tab.
+    key specifies the targeted read size and must be expressed in bytes.
+    value specifies the number of preallocated buffers.
+    For instance, to preallocate 64 buffers that will be used to serve
+    2 MB-sized read requests and 128 for 4 MB, someone needs to set
+    "2097152=64,4194304=128".
+  see_also:
+  - bluestore_max_blob_size
+- name: bdev_debug_aio
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bdev_debug_aio_suicide_timeout
+  type: float
+  level: dev
+  default: 1_min
+  with_legacy: true
+- name: bdev_debug_aio_log_age
+  type: float
+  level: dev
+  default: 5
+  with_legacy: true
+# if yes, osd will unbind all NVMe devices from kernel driver and bind them
+# to the uio_pci_generic driver. The purpose is to prevent the case where
+# NVMe driver is loaded while osd is running.
+- name: bdev_nvme_unbind_from_kernel
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: bdev_enable_discard
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: bdev_async_discard
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: bdev_flock_retry_interval
+  type: float
+  level: advanced
+  desc: interval to retry the flock
+  default: 0.1
+- name: bdev_flock_retry
+  type: uint
+  level: advanced
+  desc: times to retry the flock
+  long_desc: The number of times to retry on getting the block device lock. Programs
+    such as systemd-udevd may compete with Ceph for this lock. 0 means 'unlimited'.
+  default: 3
+- name: bluefs_alloc_size
+  type: size
+  level: advanced
+  desc: Allocation unit size for DB and WAL devices
+  default: 1_M
+  with_legacy: true
+- name: bluefs_shared_alloc_size
+  type: size
+  level: advanced
+  desc: Allocation unit size for primary/shared device
+  default: 64_K
+  with_legacy: true
+- name: bluefs_failed_shared_alloc_cooldown
+  type: float
+  level: advanced
+  desc: duration (in seconds) until the next attempt to use
+   'bluefs_shared_alloc_size' after facing ENOSPC failure.
+  long_desc: Cooldown period(in seconds) when BlueFS uses shared/slow device
+   allocation size instead of 'bluefs_shared_alloc_size' one after facing
+   recoverable (via fallback to smaller chunk size) ENOSPC failure. Intended
+   primarily to avoid repetitive unsuccessful allocations which might be
+   expensive.
+  default: 600
+  with_legacy: true
+- name: bluefs_max_prefetch
+  type: size
+  level: advanced
+  default: 1_M
+  with_legacy: true
+# alloc when we get this low
+- name: bluefs_min_log_runway
+  type: size
+  level: advanced
+  default: 1_M
+  with_legacy: true
+# alloc this much at a time
+- name: bluefs_max_log_runway
+  type: size
+  level: advanced
+  default: 4_M
+  with_legacy: true
+# before we consider
+- name: bluefs_log_compact_min_ratio
+  type: float
+  level: advanced
+  default: 5
+  with_legacy: true
+# before we consider
+- name: bluefs_log_compact_min_size
+  type: size
+  level: advanced
+  default: 16_M
+  with_legacy: true
+# ignore flush until it's this big
+- name: bluefs_min_flush_size
+  type: size
+  level: advanced
+  default: 512_K
+  with_legacy: true
+# sync or async log compaction
+- name: bluefs_compact_log_sync
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: bluefs_buffered_io
+  type: bool
+  level: advanced
+  desc: Enable buffered IO for bluefs reads.
+  long_desc: When this option is enabled, bluefs will in some cases perform buffered
+    reads.  This allows the kernel page cache to act as a secondary cache for things
+    like RocksDB block reads.  For example, if the rocksdb block cache isn't large
+    enough to hold all blocks during OMAP iteration, it may be possible to read them
+    from page cache instead of from the disk.  This can dramatically improve
+    performance when the osd_memory_target is too small to hold all entries in block
+    cache but it does come with downsides.  It has been reported to occasionally
+    cause excessive kernel swapping (and associated stalls) under certain workloads.
+    Currently the best and most consistent performing combination appears to be
+    enabling bluefs_buffered_io and disabling system level swap.  It is possible
+    that this recommendation may change in the future however.
+  default: true
+  with_legacy: true
+- name: bluefs_sync_write
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: bluefs_allocator
+  type: str
+  level: dev
+  default: hybrid
+  enum_values:
+  - bitmap
+  - stupid
+  - avl
+  - hybrid
+  with_legacy: true
+- name: bluefs_log_replay_check_allocations
+  type: bool
+  level: advanced
+  desc: Enables checks for allocations consistency during log replay
+  default: true
+  with_legacy: true
+- name: bluefs_replay_recovery
+  type: bool
+  level: dev
+  desc: Attempt to read bluefs log so large that it became unreadable.
+  long_desc: If BlueFS log grows to extreme sizes (200GB+) it is likely that it becomes
+    unreadable. This option enables heuristics that scan devices for missing data.
+    DO NOT ENABLE BY DEFAULT
+  default: false
+  with_legacy: true
+- name: bluefs_replay_recovery_disable_compact
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: bluefs_check_for_zeros
+  type: bool
+  level: dev
+  desc: Check data read for suspicious pages
+  long_desc: Looks into data read to check if there is a 4K block entirely filled
+    with zeros. If this happens, we re-read data. If there is a difference, we print
+    an error to the log.
+  default: false
+  see_also:
+  - bluestore_retry_disk_reads
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluefs_check_volume_selector_on_umount
+  type: bool
+  level: dev
+  desc: Check validity of volume selector on umount
+  long_desc: Checks if volume selector did not diverge from the state it should be in.
+    Reference is constructed from bluefs inode table. Asserts on inconsistency.
+  default: false
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluefs_check_volume_selector_often
+  type: bool
+  level: dev
+  desc: Periodically check validity of volume selector
+  long_desc: Periodically checks if current volume selector does not diverge from the valid state.
+    Reference is constructed from bluefs inode table. Asserts on inconsistency. This is a debug feature.
+  default: false
+  see_also:
+  - bluefs_check_volume_selector_on_umount
+  flags:
+  - startup
+  with_legacy: true
+- name: bluestore_bluefs
+  type: bool
+  level: dev
+  desc: Use BlueFS to back rocksdb
+  long_desc: BlueFS allows rocksdb to share the same physical device(s) as the rest
+    of BlueStore.  It should be used in all cases unless testing/developing an alternative
+    metadata database for BlueStore.
+  default: true
+  flags:
+  - create
+  with_legacy: true
+# mirror to normal Env for debug
+- name: bluestore_bluefs_env_mirror
+  type: bool
+  level: dev
+  desc: Mirror bluefs data to file system for testing/validation
+  default: false
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_bluefs_max_free
+  type: size
+  level: advanced
+  default: 10_G
+  desc: Maximum free space allocated to BlueFS
+- name: bluestore_bluefs_alloc_failure_dump_interval
+  type: float
+  level: advanced
+  desc: How frequently (in seconds) to dump allocator on BlueFS space allocation failure
+  default: 0
+  with_legacy: true
+- name: bluestore_spdk_mem
+  type: size
+  level: dev
+  desc: Amount of dpdk memory size in MB
+  long_desc: If running multiple SPDK instances per node, you must specify the amount
+    of dpdk memory size in MB each instance will use, to make sure each instance uses
+    its own dpdk memory
+  default: 512
+- name: bluestore_spdk_coremask
+  type: str
+  level: dev
+  desc: A hexadecimal bit mask of the cores to run on. Note the core numbering can
+    change between platforms and should be determined beforehand
+  default: '0x1'
+- name: bluestore_spdk_max_io_completion
+  type: uint
+  level: dev
+  desc: Maximum number of I/Os to complete in one batch while checking queue pair
+    completions, 0 means let spdk library determine it
+  default: 0
+- name: bluestore_spdk_io_sleep
+  type: uint
+  level: dev
+  desc: Time period to wait if there is no completed I/O from polling
+  default: 5
+# If you want to use spdk driver, you need to specify NVMe serial number here
+# with "spdk:" prefix.
+# Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
+# get the serial number of Intel(R) Fultondale NVMe controllers.
+# Example:
+# bluestore_block_path = spdk:55cd2e404bd73932
+- name: bluestore_block_path
+  type: str
+  level: dev
+  desc: Path to block device/file
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_block_size
+  type: size
+  level: dev
+  desc: Size of file to create for backing bluestore
+  default: 100_G
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_block_create
+  type: bool
+  level: dev
+  desc: Create bluestore_block_path if it doesn't exist
+  default: true
+  see_also:
+  - bluestore_block_path
+  - bluestore_block_size
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_block_db_path
+  type: str
+  level: dev
+  desc: Path for db block device
+  flags:
+  - create
+  with_legacy: true
+# rocksdb ssts (hot/warm)
+- name: bluestore_block_db_size
+  type: size
+  level: dev
+  desc: Size of file to create for bluestore_block_db_path
+  default: 0
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_block_db_create
+  type: bool
+  level: dev
+  desc: Create bluestore_block_db_path if it doesn't exist
+  default: false
+  see_also:
+  - bluestore_block_db_path
+  - bluestore_block_db_size
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_block_wal_path
+  type: str
+  level: dev
+  desc: Path to block device/file backing bluefs wal
+  flags:
+  - create
+  with_legacy: true
+# rocksdb wal
+- name: bluestore_block_wal_size
+  type: size
+  level: dev
+  desc: Size of file to create for bluestore_block_wal_path
+  default: 96_M
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_block_wal_create
+  type: bool
+  level: dev
+  desc: Create bluestore_block_wal_path if it doesn't exist
+  default: false
+  see_also:
+  - bluestore_block_wal_path
+  - bluestore_block_wal_size
+  flags:
+  - create
+  with_legacy: true
+# whether to preallocate space if block/db_path/wal_path is a file rather than a block device.
+- name: bluestore_block_preallocate_file
+  type: bool
+  level: dev
+  desc: Preallocate file created via bluestore_block*_create
+  default: false
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_ignore_data_csum
+  type: bool
+  level: dev
+  desc: Ignore checksum errors on read and do not generate an EIO error
+  default: false
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_csum_type
+  type: str
+  level: advanced
+  desc: Default checksum algorithm to use
+  long_desc: crc32c, xxhash32, and xxhash64 are available.  The _16 and _8 variants
+    use only a subset of the bits for more compact (but less reliable) checksumming.
+  fmt_desc: The default checksum algorithm to use.
+  default: crc32c
+  enum_values:
+  - none
+  - crc32c
+  - crc32c_16
+  - crc32c_8
+  - xxhash32
+  - xxhash64
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_retry_disk_reads
+  type: uint
+  level: advanced
+  desc: Number of read retries on checksum validation error
+  long_desc: Retries to read data from the disk this many times when checksum validation
+    fails to handle spurious read errors gracefully.
+  default: 3
+  min: 0
+  max: 255
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_min_alloc_size
+  type: uint
+  level: advanced
+  desc: Minimum allocation size to allocate for an object
+  long_desc: A smaller allocation size generally means less data is read and then
+    rewritten when a copy-on-write operation is triggered (e.g., when writing to something
+    that was recently snapshotted).  Similarly, less data is journaled before performing
+    an overwrite (writes smaller than min_alloc_size must first pass through the BlueStore
+    journal).  Larger values of min_alloc_size reduce the amount of metadata required
+    to describe the on-disk layout and reduce overall fragmentation.
+  default: 0
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_min_alloc_size_hdd
+  type: size
+  level: advanced
+  desc: Default min_alloc_size value for rotational media
+  default: 4_K
+  see_also:
+  - bluestore_min_alloc_size
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_min_alloc_size_ssd
+  type: size
+  level: advanced
+  desc: Default min_alloc_size value for non-rotational (solid state)  media
+  default: 4_K
+  see_also:
+  - bluestore_min_alloc_size
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_use_optimal_io_size_for_min_alloc_size 
+  type: bool
+  level: advanced
+  desc: Discover media optimal IO Size and use for min_alloc_size
+  default: false
+  see_also:
+  - bluestore_min_alloc_size
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_max_alloc_size
+  type: size
+  level: advanced
+  desc: Maximum size of a single allocation (0 for no max)
+  default: 0
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_prefer_deferred_size
+  type: size
+  level: advanced
+  desc: Writes smaller than this size will be written to the journal and then asynchronously
+    written to the device.  This can be beneficial when using rotational media where
+    seeks are expensive, and is helpful both with and without solid state journal/wal
+    devices.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_prefer_deferred_size_hdd
+  type: size
+  level: advanced
+  desc: Default bluestore_prefer_deferred_size for rotational media
+  default: 64_K
+  see_also:
+  - bluestore_prefer_deferred_size
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_prefer_deferred_size_ssd
+  type: size
+  level: advanced
+  desc: Default bluestore_prefer_deferred_size for non-rotational (solid state) media
+  default: 0
+  see_also:
+  - bluestore_prefer_deferred_size
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_compression_mode
+  type: str
+  level: advanced
+  desc: Default policy for using compression when pool does not specify
+  long_desc: '''none'' means never use compression.  ''passive'' means use compression
+    when clients hint that data is compressible.  ''aggressive'' means use compression
+    unless clients hint that data is not compressible.  This option is used when the
+    per-pool property for the compression mode is not present.'
+  fmt_desc: The default policy for using compression if the per-pool property
+    ``compression_mode`` is not set. ``none`` means never use
+    compression. ``passive`` means use compression when
+    :c:func:`clients hint <rados_set_alloc_hint>` that data is
+    compressible.  ``aggressive`` means use compression unless
+    clients hint that data is not compressible.  ``force`` means use
+    compression under all circumstances even if the clients hint that
+    the data is not compressible.
+  default: none
+  enum_values:
+  - none
+  - passive
+  - aggressive
+  - force
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_compression_algorithm
+  type: str
+  level: advanced
+  desc: Default compression algorithm to use when writing object data
+  long_desc: This controls the default compressor to use (if any) if the per-pool
+    property is not set.  Note that zstd is *not* recommended for bluestore due to
+    high CPU overhead when compressing small amounts of data.
+  fmt_desc: The default compressor to use (if any) if the per-pool property
+    ``compression_algorithm`` is not set. Note that ``zstd`` is *not*
+    recommended for BlueStore due to high CPU overhead when
+    compressing small amounts of data.
+  default: snappy
+  enum_values:
+  - ''
+  - snappy
+  - zlib
+  - zstd
+  - lz4
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_compression_min_blob_size
+  type: size
+  level: advanced
+  desc: Maximum chunk size to apply compression to when random access is expected
+    for an object.
+  long_desc: Chunks larger than this are broken into smaller chunks before being compressed
+  fmt_desc: Chunks smaller than this are never compressed.
+    The per-pool property ``compression_min_blob_size`` overrides
+    this setting.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_compression_min_blob_size_hdd
+  type: size
+  level: advanced
+  desc: Default value of bluestore_compression_min_blob_size for rotational media
+  fmt_desc: Default value of ``bluestore compression min blob size``
+    for rotational media.
+  default: 8_K
+  see_also:
+  - bluestore_compression_min_blob_size
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_compression_min_blob_size_ssd
+  type: size
+  level: advanced
+  desc: Default value of bluestore_compression_min_blob_size for non-rotational (solid
+    state) media
+  fmt_desc: Default value of ``bluestore compression min blob size``
+    for non-rotational (solid state) media.
+  default: 64_K
+  see_also:
+  - bluestore_compression_min_blob_size
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_compression_max_blob_size
+  type: size
+  level: advanced
+  desc: Maximum chunk size to apply compression to when non-random access is expected
+    for an object.
+  long_desc: Chunks larger than this are broken into smaller chunks before being compressed
+  fmt_desc: Chunks larger than this value are broken into smaller blobs of at most
+    ``bluestore_compression_max_blob_size`` bytes before being compressed.
+    The per-pool property ``compression_max_blob_size`` overrides
+    this setting.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_compression_max_blob_size_hdd
+  type: size
+  level: advanced
+  desc: Default value of bluestore_compression_max_blob_size for rotational media
+  fmt_desc: Default value of ``bluestore compression max blob size``
+    for rotational media.
+  default: 64_K
+  see_also:
+  - bluestore_compression_max_blob_size
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_compression_max_blob_size_ssd
+  type: size
+  level: advanced
+  desc: Default value of bluestore_compression_max_blob_size for non-rotational (solid
+    state) media
+  fmt_desc: Default value of ``bluestore compression max blob size``
+    for non-rotational (SSD, NVMe) media.
+  default: 64_K
+  see_also:
+  - bluestore_compression_max_blob_size
+  flags:
+  - runtime
+  with_legacy: true
+# Specifies minimum expected amount of saved allocation units
+# per single blob to enable compressed blobs garbage collection
+- name: bluestore_gc_enable_blob_threshold
+  type: int
+  level: dev
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+# Specifies minimum expected amount of saved allocation units
+# per all blobs to enable compressed blobs garbage collection
+- name: bluestore_gc_enable_total_threshold
+  type: int
+  level: dev
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_max_blob_size
+  type: size
+  level: dev
+  long_desc: Bluestore blobs are collections of extents (ie on-disk data) originating
+    from one or more objects.  Blobs can be compressed, typically have checksum data,
+    may be overwritten, may be shared (with an extent ref map), or split.  This setting
+    controls the maximum size a blob is allowed to be.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_max_blob_size_hdd
+  type: size
+  level: dev
+  default: 64_K
+  see_also:
+  - bluestore_max_blob_size
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_max_blob_size_ssd
+  type: size
+  level: dev
+  default: 64_K
+  see_also:
+  - bluestore_max_blob_size
+  flags:
+  - runtime
+  with_legacy: true
+# Require the net gain of compression at least to be at this ratio,
+# otherwise we don't compress.
+# And ask for compressing at least 12.5%(1/8) off, by default.
+- name: bluestore_compression_required_ratio
+  type: float
+  level: advanced
+  desc: Compression ratio required to store compressed data
+  long_desc: If we compress data and get less than this we discard the result and
+    store the original uncompressed data.
+  fmt_desc: The ratio of the size of the data chunk after
+    compression relative to the original size must be at
+    least this small in order to store the compressed
+    version.
+  default: 0.875
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_extent_map_shard_max_size
+  type: size
+  level: dev
+  desc: Max size (bytes) for a single extent map shard before splitting
+  default: 1200
+  with_legacy: true
+- name: bluestore_extent_map_shard_target_size
+  type: size
+  level: dev
+  desc: Target size (bytes) for a single extent map shard
+  default: 500
+  with_legacy: true
+- name: bluestore_extent_map_shard_min_size
+  type: size
+  level: dev
+  desc: Min size (bytes) for a single extent map shard before merging
+  default: 150
+  with_legacy: true
+- name: bluestore_extent_map_shard_target_size_slop
+  type: float
+  level: dev
+  desc: Ratio above/below target for a shard when trying to align to an existing extent
+    or blob boundary
+  default: 0.2
+  with_legacy: true
+- name: bluestore_extent_map_inline_shard_prealloc_size
+  type: size
+  level: dev
+  desc: Preallocated buffer for inline shards
+  default: 256
+  with_legacy: true
+- name: bluestore_cache_trim_interval
+  type: float
+  level: advanced
+  desc: How frequently we trim the bluestore cache
+  default: 0.05
+  with_legacy: true
+- name: bluestore_cache_trim_max_skip_pinned
+  type: uint
+  level: dev
+  desc: Max pinned cache entries we consider before giving up
+  default: 1000
+  with_legacy: true
+- name: bluestore_cache_type
+  type: str
+  level: dev
+  desc: Cache replacement algorithm
+  default: 2q
+  enum_values:
+  - 2q
+  - lru
+  with_legacy: true
+- name: bluestore_2q_cache_kin_ratio
+  type: float
+  level: dev
+  desc: 2Q paper suggests .5
+  default: 0.5
+  with_legacy: true
+- name: bluestore_2q_cache_kout_ratio
+  type: float
+  level: dev
+  desc: 2Q paper suggests .5
+  default: 0.5
+  with_legacy: true
+- name: bluestore_cache_size
+  type: size
+  level: dev
+  desc: Cache size (in bytes) for BlueStore
+  long_desc: This includes data and metadata cached by BlueStore as well as memory
+    devoted to rocksdb's cache(s).
+  fmt_desc: The amount of memory BlueStore will use for its cache.  If zero,
+    ``bluestore_cache_size_hdd`` or ``bluestore_cache_size_ssd`` will
+    be used instead.
+  default: 0
+  with_legacy: true
+- name: bluestore_cache_size_hdd
+  type: size
+  level: dev
+  desc: Default bluestore_cache_size for rotational media
+  fmt_desc: The default amount of memory BlueStore will use for its cache when
+    backed by an HDD.
+  default: 1_G
+  see_also:
+  - bluestore_cache_size
+  with_legacy: true
+- name: bluestore_cache_size_ssd
+  type: size
+  level: dev
+  desc: Default bluestore_cache_size for non-rotational (solid state) media
+  fmt_desc: The default amount of memory BlueStore will use for its cache when
+    backed by an SSD.
+  default: 3_G
+  see_also:
+  - bluestore_cache_size
+  with_legacy: true
+- name: bluestore_cache_meta_ratio
+  type: float
+  level: dev
+  desc: Ratio of bluestore cache to devote to metadata
+  default: 0.45
+  see_also:
+  - bluestore_cache_size
+  with_legacy: true
+- name: bluestore_cache_kv_ratio
+  type: float
+  level: dev
+  desc: Ratio of bluestore cache to devote to key/value database (RocksDB)
+  default: 0.45
+  see_also:
+  - bluestore_cache_size
+  with_legacy: true
+- name: bluestore_cache_kv_onode_ratio
+  type: float
+  level: dev
+  desc: Ratio of bluestore cache to devote to kv onode column family (rocksdb)
+  default: 0.04
+  see_also:
+  - bluestore_cache_size
+- name: bluestore_cache_autotune
+  type: bool
+  level: dev
+  desc: Automatically tune the ratio of caches while respecting min values.
+  fmt_desc: Automatically tune the space ratios assigned to various BlueStore
+    caches while respecting minimum values.
+  default: true
+  see_also:
+  - bluestore_cache_size
+  - bluestore_cache_meta_ratio
+- name: bluestore_cache_autotune_interval
+  type: float
+  level: dev
+  desc: The number of seconds to wait between rebalances when cache autotune is enabled.
+  fmt_desc: |
+    The number of seconds to wait between rebalances when cache autotune is
+    enabled.  `bluestore_cache_autotune_interval` sets the speed at which Ceph
+    recomputes the allocation ratios of various caches. Note: Setting this
+    interval too small can result in high CPU usage and lower performance.
+  default: 5
+  see_also:
+  - bluestore_cache_autotune
+- name: bluestore_cache_age_bin_interval
+  type: float
+  level: dev
+  desc: The duration (in seconds) represented by a single cache age bin.
+  fmt_desc: |
+    The caches used by bluestore will assign cache entries to an 'age bin'
+    that represents a period of time during which that cache entry was most
+    recently updated.  By binning the caches in this way, Ceph's priority
+    cache balancing code can make better decisions about which caches should
+    receive priority based on the relative ages of items in the caches.  By
+    default, a single cache age bin represents 1 second of time.  Note:
+    Setting this interval too small can result in high CPU usage and lower
+    performance.
+  default: 1
+  see_also:
+  - bluestore_cache_age_bins_kv
+  - bluestore_cache_age_bins_kv_onode
+  - bluestore_cache_age_bins_meta
+  - bluestore_cache_age_bins_data
+- name: bluestore_cache_age_bins_kv
+  type: str
+  level: dev
+  desc: A 10 element, space separated list of age bins for kv cache
+  fmt_desc: |
+    A 10 element, space separated list of cache age bins grouped by
+    priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
+    PRI10=[n+8,n+9).  Values represent the starting and ending bin for each
+    priority level.  A 0 in the 2nd term will prevent any items from being
+    associated with that priority.  bin duration is based on the
+    bluestore_cache_age_bin_interval value.  For example,
+    "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+    contains 1 age bin.  Assuming the default age bin interval of 1 second,
+    PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+    bins representing cache items that are 1 to less than 5 seconds old. All
+    other cache items in this example are associated with the lowest priority
+    level as PRI3-PRI10 all have 0s in their second term.
+  default: "1 2 6 24 120 720 0 0 0 0"
+  see_also:
+  - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_kv_onode
+  type: str
+  level: dev
+  desc: A 10 element, space separated list of age bins for kv onode cache
+  fmt_desc: |
+    A 10 element, space separated list of cache age bins grouped by
+    priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
+    PRI10=[n+8,n+9).  Values represent the starting and ending bin for each
+    priority level.  A 0 in the 2nd term will prevent any items from being
+    associated with that priority.  bin duration is based on the
+    bluestore_cache_age_bin_interval value.  For example,
+    "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+    contains 1 age bin.  Assuming the default age bin interval of 1 second,
+    PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+    bins representing cache items that are 1 to less than 5 seconds old. All
+    other cache items in this example are associated with the lowest priority
+    level as PRI3-PRI10 all have 0s in their second term.
+  default: "0 0 0 0 0 0 0 0 0 720"
+  see_also:
+  - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_meta
+  type: str
+  level: dev
+  desc: A 10 element, space separated list of age bins for onode cache
+  fmt_desc: |
+    A 10 element, space separated list of cache age bins grouped by
+    priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
+    PRI10=[n+8,n+9).  Values represent the starting and ending bin for each
+    priority level.  A 0 in the 2nd term will prevent any items from being
+    associated with that priority.  bin duration is based on the
+    bluestore_cache_age_bin_interval value.  For example,
+    "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+    contains 1 age bin.  Assuming the default age bin interval of 1 second,
+    PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+    bins representing cache items that are 1 to less than 5 seconds old. All
+    other cache items in this example are associated with the lowest priority
+    level as PRI3-PRI10 all have 0s in their second term.
+  default: "1 2 6 24 120 720 0 0 0 0"
+  see_also:
+  - bluestore_cache_age_bin_interval
+- name: bluestore_cache_age_bins_data
+  type: str
+  level: dev
+  desc: A 10 element, space separated list of age bins for data cache
+  fmt_desc: |
+    A 10 element, space separated list of cache age bins grouped by
+    priority such that PRI1=[0,n), PRI2=[n,n+1), PRI3=[n+1,n+2) ...
+    PRI10=[n+8,n+9).  Values represent the starting and ending bin for each
+    priority level.  A 0 in the 2nd term will prevent any items from being
+    associated with that priority.  bin duration is based on the
+    bluestore_cache_age_bin_interval value.  For example,
+    "1 5 0 0 0 0 0 0 0 0" defines bin ranges for two priority levels. PRI1
+    contains 1 age bin.  Assuming the default age bin interval of 1 second,
+    PRI1 represents cache items that are less than 1 second old. PRI2 has 4
+    bins representing cache items that are 1 to less than 5 seconds old. All
+    other cache items in this example are associated with the lowest priority
+    level as PRI3-PRI10 all have 0s in their second term.
+  default: "1 2 6 24 120 720 0 0 0 0"
+  see_also:
+  - bluestore_cache_age_bin_interval
+- name: bluestore_alloc_stats_dump_interval
+  type: float
+  level: dev
+  desc: The period (in seconds) for logging allocation statistics.
+  default: 1_day
+  with_legacy: true
+- name: bluestore_kvbackend
+  type: str
+  level: dev
+  desc: Key value database to use for bluestore
+  default: rocksdb
+  flags:
+  - create
+  with_legacy: true
+- name: bluestore_allocator
+  type: str
+  level: advanced
+  desc: Allocator policy
+  long_desc: Allocator to use for bluestore.  Stupid should only be used for testing.
+  default: hybrid
+  enum_values:
+  - bitmap
+  - stupid
+  - avl
+  - hybrid
+  - zoned
+  with_legacy: true
+- name: bluestore_freelist_blocks_per_key
+  type: size
+  level: dev
+  desc: Block (and bits) per database key
+  default: 128
+  with_legacy: true
+- name: bluestore_bitmapallocator_blocks_per_zone
+  type: size
+  level: dev
+  default: 1_K
+  with_legacy: true
+- name: bluestore_bitmapallocator_span_size
+  type: size
+  level: dev
+  default: 1_K
+  with_legacy: true
+- name: bluestore_max_deferred_txc
+  type: uint
+  level: advanced
+  desc: Max transactions with deferred writes that can accumulate before we force
+    flush deferred writes
+  default: 32
+  with_legacy: true
+- name: bluestore_max_defer_interval
+  type: float
+  level: advanced
+  desc: max duration to force deferred submit
+  default: 3
+  with_legacy: true
+- name: bluestore_rocksdb_options
+  type: str
+  level: advanced
+  desc: Full set of rocksdb settings to override
+  default: compression=kNoCompression,max_write_buffer_number=64,min_write_buffer_number_to_merge=6,compaction_style=kCompactionStyleLevel,write_buffer_size=16777216,max_background_jobs=4,level0_file_num_compaction_trigger=8,max_bytes_for_level_base=1073741824,max_bytes_for_level_multiplier=8,compaction_readahead_size=2MB,max_total_wal_size=1073741824,writable_file_max_buffer_size=0
+  with_legacy: true
+- name: bluestore_rocksdb_options_annex
+  type: str
+  level: advanced
+  desc: An addition to bluestore_rocksdb_options. Allows setting rocksdb options without
+    repeating the existing defaults.
+  with_legacy: true
+- name: bluestore_rocksdb_cf
+  type: bool
+  level: advanced
+  desc: Enable use of rocksdb column families for bluestore metadata
+  fmt_desc: Enables sharding of BlueStore's RocksDB.
+    When ``true``, ``bluestore_rocksdb_cfs`` is used.
+    Only applied when OSD is doing ``--mkfs``.
+  default: true
+  verbatim: |
+    #ifdef WITH_SEASTAR
+    // This is necessary as the Seastar's allocator imposes restrictions
+    // on the number of threads that entered malloc/free/*. Unfortunately,
+    // RocksDB sharding in BlueStore dramatically lifted the number of
+    // threads spawned during RocksDB's init.
+    .set_validator([](std::string *value, std::string *error_message) {
+      if (const bool parsed_value = strict_strtob(value->c_str(), error_message);
+        error_message->empty() && parsed_value) {
+        *error_message = "invalid BlueStore sharding configuration."
+                         " Be aware any change takes effect only on mkfs!";
+        return -EINVAL;
+      } else {
+        return 0;
+      }
+    })
+    #endif
+- name: bluestore_rocksdb_cfs
+  type: str
+  level: dev
+  desc: Definition of column families and their sharding
+  long_desc: 'Space separated list of elements: column_def [ ''='' rocksdb_options
+    ]. column_def := column_name [ ''('' shard_count [ '','' hash_begin ''-'' [ hash_end
+    ] ] '')'' ]. Example: ''I=write_buffer_size=1048576 O(6) m(7,10-)''. Interval
+    [hash_begin..hash_end) defines characters to use for hash calculation. Recommended
+    hash ranges: O(0-13) P(0-8) m(0-16). Sharding of S,T,C,M,B prefixes is inadvisable'
+  fmt_desc: Definition of BlueStore's RocksDB sharding.
+    The optimal value depends on multiple factors, and modification is inadvisable.
+    This setting is used only when OSD is doing ``--mkfs``.
+    Next runs of OSD retrieve sharding from disk.
+  default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L=min_write_buffer_number_to_merge=32 P=min_write_buffer_number_to_merge=32
+- name: bluestore_qfsck_on_mount
+  type: bool
+  level: dev
+  desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state
+  default: true
+  with_legacy: true
+- name: bluestore_fsck_on_mount
+  type: bool
+  level: dev
+  desc: Run fsck at mount
+  default: false
+  with_legacy: true
+- name: bluestore_fsck_on_mount_deep
+  type: bool
+  level: dev
+  desc: Run deep fsck at mount when bluestore_fsck_on_mount is set to true
+  default: false
+  with_legacy: true
+- name: bluestore_fsck_quick_fix_on_mount
+  type: bool
+  level: dev
+  desc: Do quick-fix for the store at mount
+  default: false
+  with_legacy: true
+- name: bluestore_fsck_on_umount
+  type: bool
+  level: dev
+  desc: Run fsck at umount
+  default: false
+  with_legacy: true
+- name: bluestore_allocation_from_file
+  type: bool
+  level: dev
+  desc: Remove allocation info from RocksDB and store the info in a new allocation file
+  default: true
+  with_legacy: true
+- name: bluestore_debug_inject_allocation_from_file_failure
+  type: float
+  level: dev
+  desc: Enables random error injections when restoring allocation map from file.
+  long_desc: Specifies error injection probability for restoring allocation map from file
+    hence causing full recovery. Intended primarily for testing.
+  default: 0
+  with_legacy: true
+- name: bluestore_fsck_on_umount_deep
+  type: bool
+  level: dev
+  desc: Run deep fsck at umount when bluestore_fsck_on_umount is set to true
+  default: false
+  with_legacy: true
+- name: bluestore_fsck_on_mkfs
+  type: bool
+  level: dev
+  desc: Run fsck after mkfs
+  default: true
+  with_legacy: true
+- name: bluestore_fsck_on_mkfs_deep
+  type: bool
+  level: dev
+  desc: Run deep fsck after mkfs
+  default: false
+  with_legacy: true
+- name: bluestore_sync_submit_transaction
+  type: bool
+  level: dev
+  desc: Try to submit metadata transaction to rocksdb in queuing thread context
+  default: false
+  with_legacy: true
+- name: bluestore_fsck_read_bytes_cap
+  type: size
+  level: advanced
+  desc: Maximum bytes read at once by deep fsck
+  default: 64_M
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_fsck_quick_fix_threads
+  type: int
+  level: advanced
+  desc: Number of additional threads to perform quick-fix (shallow fsck) command
+  default: 2
+  with_legacy: true
+- name: bluestore_fsck_shared_blob_tracker_size
+  type: float
+  level: dev
+  desc: Size (a fraction of osd_memory_target, defaults to 128MB) of a hash table to track shared blobs ref counts. The larger the size, the more precise the tracker -> less overhead during the repair.
+  default: 0.03125
+  see_also:
+  - osd_memory_target
+  flags:
+  - runtime
+- name: bluestore_throttle_bytes
+  type: size
+  level: advanced
+  desc: Maximum bytes in flight before we throttle IO submission
+  default: 64_M
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_throttle_deferred_bytes
+  type: size
+  level: advanced
+  desc: Maximum bytes for deferred writes before we throttle IO submission
+  default: 128_M
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_throttle_cost_per_io
+  type: size
+  level: advanced
+  desc: Overhead added to transaction cost (in bytes) for each IO
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_throttle_cost_per_io_hdd
+  type: uint
+  level: advanced
+  desc: Default bluestore_throttle_cost_per_io for rotational media
+  default: 670000
+  see_also:
+  - bluestore_throttle_cost_per_io
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_throttle_cost_per_io_ssd
+  type: uint
+  level: advanced
+  desc: Default bluestore_throttle_cost_per_io for non-rotational (solid state) media
+  default: 4000
+  see_also:
+  - bluestore_throttle_cost_per_io
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_deferred_batch_ops
+  type: uint
+  level: advanced
+  desc: Max number of deferred writes before we flush the deferred write queue
+  default: 0
+  min: 0
+  max: 65535
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_deferred_batch_ops_hdd
+  type: uint
+  level: advanced
+  desc: Default bluestore_deferred_batch_ops for rotational media
+  default: 64
+  see_also:
+  - bluestore_deferred_batch_ops
+  min: 0
+  max: 65535
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_deferred_batch_ops_ssd
+  type: uint
+  level: advanced
+  desc: Default bluestore_deferred_batch_ops for non-rotational (solid state) media
+  default: 16
+  see_also:
+  - bluestore_deferred_batch_ops
+  min: 0
+  max: 65535
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_nid_prealloc
+  type: int
+  level: dev
+  desc: Number of unique object ids to preallocate at a time
+  default: 1024
+  with_legacy: true
+- name: bluestore_blobid_prealloc
+  type: uint
+  level: dev
+  desc: Number of unique blob ids to preallocate at a time
+  default: 10_K
+  with_legacy: true
+- name: bluestore_clone_cow
+  type: bool
+  level: advanced
+  desc: Use copy-on-write when cloning objects (versus reading and rewriting them
+    at clone time)
+  default: true
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_default_buffered_read
+  type: bool
+  level: advanced
+  desc: Cache read results by default (unless hinted NOCACHE or WONTNEED)
+  default: true
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_default_buffered_write
+  type: bool
+  level: advanced
+  desc: Cache writes by default (unless hinted NOCACHE or WONTNEED)
+  default: false
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_debug_no_reuse_blocks
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bluestore_debug_small_allocations
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+- name: bluestore_debug_too_many_blobs_threshold
+  type: int
+  level: dev
+  default: 24576
+  with_legacy: true
+- name: bluestore_debug_freelist
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bluestore_debug_prefill
+  type: float
+  level: dev
+  desc: simulate fragmentation
+  default: 0
+  with_legacy: true
+- name: bluestore_debug_prefragment_max
+  type: size
+  level: dev
+  default: 1_M
+  with_legacy: true
+- name: bluestore_debug_inject_read_err
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bluestore_debug_randomize_serial_transaction
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+- name: bluestore_debug_omit_block_device_write
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bluestore_debug_fsck_abort
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bluestore_debug_omit_kv_commit
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bluestore_debug_permit_any_bdev_label
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bluestore_debug_random_read_err
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: bluestore_debug_inject_bug21040
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: bluestore_debug_inject_csum_err_probability
+  type: float
+  level: dev
+  desc: inject crc verification errors into bluestore device reads
+  default: 0
+  with_legacy: true
+- name: bluestore_debug_legacy_omap
+  type: bool
+  level: dev
+  desc: Allows mkfs to create OSD in legacy OMAP naming mode (neither per-pool nor per-pg).
+    This is intended primarily for developers' purposes. The resulting OSD might/would
+    be transformed to the currently default 'per-pg' format when BlueStore's quick-fix or
+    repair are applied.
+  default: false
+  with_legacy: true
+- name: bluestore_fsck_error_on_no_per_pool_stats
+  type: bool
+  level: advanced
+  desc: Make fsck error (instead of warn) when bluestore lacks per-pool stats, e.g.,
+    after an upgrade
+  default: false
+  with_legacy: true
+- name: bluestore_warn_on_bluefs_spillover
+  type: bool
+  level: advanced
+  desc: Enable health indication on bluefs slow device usage
+  default: true
+  with_legacy: true
+- name: bluestore_warn_on_legacy_statfs
+  type: bool
+  level: advanced
+  desc: Enable health indication on lack of per-pool statfs reporting from bluestore
+  default: true
+  with_legacy: true
+- name: bluestore_warn_on_spurious_read_errors
+  type: bool
+  level: advanced
+  desc: Enable health indication when spurious read errors are observed by OSD
+  default: true
+  with_legacy: true
+- name: bluestore_fsck_error_on_no_per_pool_omap
+  type: bool
+  level: advanced
+  desc: Make fsck error (instead of warn) when objects without per-pool omap are found
+  default: false
+  with_legacy: true
+- name: bluestore_fsck_error_on_no_per_pg_omap
+  type: bool
+  level: advanced
+  desc: Make fsck error (instead of warn) when objects without per-pg omap are found
+  default: false
+  with_legacy: true
+- name: bluestore_warn_on_no_per_pool_omap
+  type: bool
+  level: advanced
+  desc: Enable health indication on lack of per-pool omap
+  default: true
+  with_legacy: true
+- name: bluestore_warn_on_no_per_pg_omap
+  type: bool
+  level: advanced
+  desc: Enable health indication on lack of per-pg omap
+  default: false
+  with_legacy: true
+- name: bluestore_log_op_age
+  type: float
+  level: advanced
+  desc: log operation if it's slower than this age (seconds)
+  default: 5
+  with_legacy: true
+- name: bluestore_log_omap_iterator_age
+  type: float
+  level: advanced
+  desc: log omap iteration operation if it's slower than this age (seconds)
+  default: 5
+  with_legacy: true
+- name: bluestore_log_collection_list_age
+  type: float
+  level: advanced
+  desc: log collection list operation if it's slower than this age (seconds)
+  default: 1_min
+  with_legacy: true
+- name: bluestore_debug_enforce_settings
+  type: str
+  level: dev
+  desc: Enforces specific hw profile settings
+  long_desc: '''hdd'' enforces settings intended for BlueStore above a rotational
+    drive. ''ssd'' enforces settings intended for BlueStore above a solid state drive. ''default''
+    - using settings for the actual hardware.'
+  default: default
+  enum_values:
+  - default
+  - hdd
+  - ssd
+  with_legacy: true
+- name: bluestore_avl_alloc_ff_max_search_count
+  type: uint
+  level: dev
+  desc: Search for this many ranges in first-fit mode before switching over to
+    best-fit mode. 0 to iterate through all ranges for required chunk.
+  default: 100
+- name: bluestore_avl_alloc_ff_max_search_bytes
+  type: size
+  level: dev
+  desc: Maximum distance to search in first-fit mode before switching over to
+    best-fit mode. 0 to iterate through all ranges for required chunk.
+  default: 16_M
+- name: bluestore_avl_alloc_bf_threshold
+  type: uint
+  level: dev
+  desc: Sets threshold at which shrinking max free chunk size triggers enabling best-fit
+    mode.
+  long_desc: 'AVL allocator works in two modes: near-fit and best-fit. By default,
+    it uses very fast near-fit mode, in which it tries to fit a new block near the
+    last allocated block of similar size. The second mode is much slower best-fit
+    mode, in which it tries to find an exact match for the requested allocation. This
+    mode is used when either the device gets fragmented or when it is low on free
+    space. When the largest free block is smaller than ''bluestore_avl_alloc_bf_threshold'',
+    best-fit mode is used.'
+  default: 128_K
+  see_also:
+  - bluestore_avl_alloc_bf_free_pct
+- name: bluestore_avl_alloc_bf_free_pct
+  type: uint
+  level: dev
+  desc: Sets threshold at which shrinking free space (in %, integer) triggers enabling
+    best-fit mode.
+  long_desc: 'AVL allocator works in two modes: near-fit and best-fit. By default,
+    it uses very fast near-fit mode, in which it tries to fit a new block near the
+    last allocated block of similar size. The second mode is much slower best-fit
+    mode, in which it tries to find an exact match for the requested allocation. This
+    mode is used when either the device gets fragmented or when it is low on free
+    space. When free space is smaller than ''bluestore_avl_alloc_bf_free_pct'', best-fit
+    mode is used.'
+  default: 4
+  see_also:
+  - bluestore_avl_alloc_bf_threshold
+- name: bluestore_hybrid_alloc_mem_cap
+  type: uint
+  level: dev
+  desc: Maximum RAM hybrid allocator should use before enabling bitmap supplement
+  default: 64_M
+- name: bluestore_volume_selection_policy
+  type: str
+  level: dev
+  desc: Determines bluefs volume selection policy
+  long_desc: Determines bluefs volume selection policy. 'use_some_extra*' policy allows
+    to override RocksDB level granularity and put high level's data to faster device
+    even when the level doesn't completely fit there. 'fit_to_fast' policy enables
+    using 100% of faster disk capacity and allows the user to turn on 'level_compaction_dynamic_level_bytes'
+    option in RocksDB options.
+  default: use_some_extra
+  enum_values:
+  - rocksdb_original
+  - use_some_extra
+  - use_some_extra_enforced
+  - fit_to_fast
+  with_legacy: true
+- name: bluestore_volume_selection_reserved_factor
+  type: float
+  level: advanced
+  desc: DB level size multiplier. Determines amount of space at DB device to bar from
+    the usage when 'use some extra' policy is in action. Reserved size is determined
+    as sum(L_max_size[0], L_max_size[L-1]) + L_max_size[L] * this_factor
+  default: 2
+  flags:
+  - startup
+  with_legacy: true
+- name: bluestore_volume_selection_reserved
+  type: int
+  level: advanced
+  desc: Space reserved at DB device and not allowed for 'use some extra' policy usage.
+    Overrides 'bluestore_volume_selection_reserved_factor' setting and introduces
+    straightforward limit.
+  default: 0
+  flags:
+  - startup
+  with_legacy: true
+- name: bdev_ioring
+  type: bool
+  level: advanced
+  desc: Enables Linux io_uring API instead of libaio
+  default: false
+- name: bdev_ioring_hipri
+  type: bool
+  level: advanced
+  desc: Enables use of polled IO completions with the Linux io_uring API
+  default: false
+- name: bdev_ioring_sqthread_poll
+  type: bool
+  level: advanced
+  desc: Enables offloading submission/completion to a kernel thread with the Linux io_uring API
+  default: false
+- name: bluestore_kv_sync_util_logging_s
+  type: float
+  level: advanced
+  desc: KV sync thread utilization logging period
+  long_desc: How often (in seconds) to print KV sync thread utilization, not logged
+    when set to 0 or when utilization is 0%
+  default: 10
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_fail_eio
+  type: bool
+  level: dev
+  desc: fail/crash on EIO
+  long_desc: whether bluestore osd fails on eio
+  default: false
+  flags:
+  - runtime
+  with_legacy: true
+- name: bluestore_zero_block_detection
+  type: bool
+  level: dev
+  desc: punch holes instead of writing zeros
+  long_desc: Intended for large-scale synthetic testing. Currently this is implemented
+    with punch hole semantics, affecting the logical extent map of the object. This does
+    not interact well with some RBD and CephFS features.
+  default: false
+  flags:
+  - runtime
+  with_legacy: true
+- name: kstore_max_ops
+  type: uint
+  level: advanced
+  default: 512
+  with_legacy: true
+- name: kstore_max_bytes
+  type: size
+  level: advanced
+  default: 64_M
+  with_legacy: true
+- name: kstore_backend
+  type: str
+  level: advanced
+  default: rocksdb
+  with_legacy: true
+- name: kstore_rocksdb_options
+  type: str
+  level: advanced
+  desc: Options to pass through when RocksDB is used as the KeyValueDB for kstore.
+  default: compression=kNoCompression
+  with_legacy: true
+- name: kstore_fsck_on_mount
+  type: bool
+  level: advanced
+  desc: Whether or not to run fsck on mount for kstore.
+  default: false
+  with_legacy: true
+- name: kstore_fsck_on_mount_deep
+  type: bool
+  level: advanced
+  desc: Whether or not to run deep fsck on mount for kstore
+  default: true
+  with_legacy: true
+- name: kstore_nid_prealloc
+  type: uint
+  level: advanced
+  default: 1_K
+  with_legacy: true
+- name: kstore_sync_transaction
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: kstore_sync_submit_transaction
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: kstore_onode_map_size
+  type: uint
+  level: advanced
+  default: 1_K
+  with_legacy: true
+- name: kstore_default_stripe_size
+  type: size
+  level: advanced
+  default: 64_K
+  with_legacy: true
+# rocksdb options that will be used for omap(if omap_backend is rocksdb)
+- name: filestore_rocksdb_options
+  type: str
+  level: dev
+  desc: Options to pass through when RocksDB is used as the KeyValueDB for filestore.
+  default: max_background_jobs=10,compaction_readahead_size=2097152,compression=kNoCompression
+  with_legacy: true
+- name: filestore_omap_backend
+  type: str
+  level: dev
+  desc: The KeyValueDB to use for filestore metadata (ie omap).
+  default: rocksdb
+  enum_values:
+  - leveldb
+  - rocksdb
+  with_legacy: true
+- name: filestore_omap_backend_path
+  type: str
+  level: dev
+  desc: The path where the filestore KeyValueDB should store its database(s).
+  with_legacy: true
+# filestore wb throttle limits
+- name: filestore_wbthrottle_enable
+  type: bool
+  level: advanced
+  desc: Enabling throttling of operations to backing file system
+  default: true
+  with_legacy: true
+- name: filestore_wbthrottle_btrfs_bytes_start_flusher
+  type: size
+  level: advanced
+  desc: Start flushing (fsyncing) when this many bytes are written(btrfs)
+  default: 40_M
+  with_legacy: true
+- name: filestore_wbthrottle_btrfs_bytes_hard_limit
+  type: size
+  level: advanced
+  desc: Block writes when this many bytes haven't been flushed (fsynced) (btrfs)
+  default: 400_M
+  with_legacy: true
+- name: filestore_wbthrottle_btrfs_ios_start_flusher
+  type: uint
+  level: advanced
+  desc: Start flushing (fsyncing) when this many IOs are written (btrfs)
+  default: 500
+  with_legacy: true
+- name: filestore_wbthrottle_btrfs_ios_hard_limit
+  type: uint
+  level: advanced
+  desc: Block writes when this many IOs haven't been flushed (fsynced) (btrfs)
+  default: 5000
+  with_legacy: true
+- name: filestore_wbthrottle_btrfs_inodes_start_flusher
+  type: uint
+  level: advanced
+  desc: Start flushing (fsyncing) when this many distinct inodes have been modified
+    (btrfs)
+  default: 500
+  with_legacy: true
+- name: filestore_wbthrottle_xfs_bytes_start_flusher
+  type: size
+  level: advanced
+  desc: Start flushing (fsyncing) when this many bytes are written(xfs)
+  default: 40_M
+  with_legacy: true
+- name: filestore_wbthrottle_xfs_bytes_hard_limit
+  type: size
+  level: advanced
+  desc: Block writes when this many bytes haven't been flushed (fsynced) (xfs)
+  default: 400_M
+  with_legacy: true
+- name: filestore_wbthrottle_xfs_ios_start_flusher
+  type: uint
+  level: advanced
+  desc: Start flushing (fsyncing) when this many IOs are written (xfs)
+  default: 500
+  with_legacy: true
+- name: filestore_wbthrottle_xfs_ios_hard_limit
+  type: uint
+  level: advanced
+  desc: Block writes when this many IOs haven't been flushed (fsynced) (xfs)
+  default: 5000
+  with_legacy: true
+- name: filestore_wbthrottle_xfs_inodes_start_flusher
+  type: uint
+  level: advanced
+  desc: Start flushing (fsyncing) when this many distinct inodes have been modified
+    (xfs)
+  default: 500
+  with_legacy: true
+# These must be less than the fd limit
+- name: filestore_wbthrottle_btrfs_inodes_hard_limit
+  type: uint
+  level: advanced
+  desc: Block writing when this many inodes have outstanding writes (btrfs)
+  default: 5000
+  with_legacy: true
+- name: filestore_wbthrottle_xfs_inodes_hard_limit
+  type: uint
+  level: advanced
+  desc: Block writing when this many inodes have outstanding writes (xfs)
+  default: 5000
+  with_legacy: true
+# Introduce a O_DSYNC write in the filestore
+- name: filestore_odsync_write
+  type: bool
+  level: dev
+  desc: Write with O_DSYNC
+  default: false
+  with_legacy: true
+# Tests index failure paths
+- name: filestore_index_retry_probability
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# Allow object read error injection
+- name: filestore_debug_inject_read_err
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: filestore_debug_random_read_err
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# Expensive debugging check on sync
+- name: filestore_debug_omap_check
+  type: bool
+  level: dev
+  default: false
+  fmt_desc: Debugging check on synchronization. This is an expensive operation.
+
+  with_legacy: true
+- name: filestore_omap_header_cache_size
+  type: size
+  level: dev
+  default: 1_K
+  with_legacy: true
+# Use omap for xattrs for attrs over
+# filestore_max_inline_xattr_size or
+- name: filestore_max_inline_xattr_size
+  type: size
+  level: dev
+  default: 0
+  with_legacy: true
+- name: filestore_max_inline_xattr_size_xfs
+  type: size
+  level: dev
+  default: 64_K
+  with_legacy: true
+- name: filestore_max_inline_xattr_size_btrfs
+  type: size
+  level: dev
+  default: 2_K
+  with_legacy: true
+- name: filestore_max_inline_xattr_size_other
+  type: size
+  level: dev
+  default: 512
+  with_legacy: true
+# for more than filestore_max_inline_xattrs attrs
+- name: filestore_max_inline_xattrs
+  type: uint
+  level: dev
+  default: 0
+  with_legacy: true
+- name: filestore_max_inline_xattrs_xfs
+  type: uint
+  level: dev
+  default: 10
+  with_legacy: true
+- name: filestore_max_inline_xattrs_btrfs
+  type: uint
+  level: dev
+  default: 10
+  with_legacy: true
+- name: filestore_max_inline_xattrs_other
+  type: uint
+  level: dev
+  default: 2
+  with_legacy: true
+- name: filestore_max_xattr_value_size
+  type: size
+  level: dev
+  default: 0
+  with_legacy: true
+- name: filestore_max_xattr_value_size_xfs
+  type: size
+  level: dev
+  default: 64_K
+  with_legacy: true
+- name: filestore_max_xattr_value_size_btrfs
+  type: size
+  level: dev
+  default: 64_K
+  with_legacy: true
+# ext4 allows 4k xattrs total including some smallish extra fields and the
+# keys.  We're allowing 2 512 inline attrs in addition to some filestore
+# replay attrs.  After accounting for those, we still need to fit up to
+# two attrs of this value.  That means we need this value to be around 1k
+# to be safe.  This is hacky, but it's not worth complicating the code
+# to work around ext4's total xattr limit.
+- name: filestore_max_xattr_value_size_other
+  type: size
+  level: dev
+  default: 1_K
+  with_legacy: true
+# track sloppy crcs
+- name: filestore_sloppy_crc
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: filestore_sloppy_crc_block_size
+  type: size
+  level: dev
+  default: 64_K
+  with_legacy: true
+- name: filestore_max_alloc_hint_size
+  type: size
+  level: dev
+  default: 1_M
+  with_legacy: true
+# seconds
+- name: filestore_max_sync_interval
+  type: float
+  level: advanced
+  desc: Period between calls to syncfs(2) and journal trims (seconds)
+  default: 5
+  with_legacy: true
+# seconds
+- name: filestore_min_sync_interval
+  type: float
+  level: dev
+  desc: Minimum period between calls to syncfs(2)
+  default: 0.01
+  with_legacy: true
+- name: filestore_btrfs_snap
+  type: bool
+  level: dev
+  default: true
+  with_legacy: true
+- name: filestore_btrfs_clone_range
+  type: bool
+  level: advanced
+  desc: Use btrfs clone_range ioctl to efficiently duplicate objects
+  default: true
+  with_legacy: true
+# zfsonlinux is still unstable
+- name: filestore_zfs_snap
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: filestore_fsync_flushes_journal_data
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# (try to) use fiemap
+- name: filestore_fiemap
+  type: bool
+  level: advanced
+  desc: Use fiemap ioctl(2) to determine which parts of objects are sparse
+  default: false
+  with_legacy: true
+- name: filestore_punch_hole
+  type: bool
+  level: advanced
+  desc: Use fallocate(2) FALLOC_FL_PUNCH_HOLE to efficiently zero ranges of objects
+  default: false
+  with_legacy: true
+# (try to) use seek_data/hole
+- name: filestore_seek_data_hole
+  type: bool
+  level: advanced
+  desc: Use lseek(2) SEEK_HOLE and SEEK_DATA to determine which parts of objects are
+    sparse
+  default: false
+  with_legacy: true
+- name: filestore_splice
+  type: bool
+  level: advanced
+  desc: Use splice(2) to more efficiently copy data between files
+  default: false
+  with_legacy: true
+- name: filestore_fadvise
+  type: bool
+  level: advanced
+  desc: Use posix_fadvise(2) to pass hints to file system
+  default: true
+  with_legacy: true
+# collect device partition information for management application to use
+- name: filestore_collect_device_partition_information
+  type: bool
+  level: advanced
+  desc: Collect metadata about the backing file system on OSD startup
+  default: true
+  with_legacy: true
+# (try to) use extsize for alloc hint NOTE: extsize seems to trigger
+# data corruption in xfs prior to kernel 3.5.  filestore will
+# implicitly disable this if it cannot confirm the kernel is newer
+# than that.
+# NOTE: This option involves a tradeoff: When disabled, fragmentation is
+# worse, but large sequential writes are faster. When enabled, large
+# sequential writes are slower, but fragmentation is reduced.
+- name: filestore_xfs_extsize
+  type: bool
+  level: advanced
+  desc: Use XFS extsize ioctl(2) to hint allocator about expected write sizes
+  default: false
+  with_legacy: true
+- name: filestore_journal_parallel
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: filestore_journal_writeahead
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: filestore_journal_trailing
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: filestore_queue_max_ops
+  type: uint
+  level: advanced
+  desc: Max IO operations in flight
+  default: 50
+  with_legacy: true
+- name: filestore_queue_max_bytes
+  type: size
+  level: advanced
+  desc: Max (written) bytes in flight
+  default: 100_M
+  with_legacy: true
+- name: filestore_caller_concurrency
+  type: int
+  level: dev
+  default: 10
+  with_legacy: true
+# Expected filestore throughput in B/s
+- name: filestore_expected_throughput_bytes
+  type: float
+  level: advanced
+  desc: Expected throughput of backend device (aids throttling calculations)
+  default: 209715200
+  with_legacy: true
+# Expected filestore throughput in ops/s
+- name: filestore_expected_throughput_ops
+  type: float
+  level: advanced
+  desc: Expected throughput of backend device in IOPS (aids throttling calculations)
+  default: 200
+  with_legacy: true
+# Filestore max delay multiple.  Defaults to 0 (disabled)
+- name: filestore_queue_max_delay_multiple
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# Filestore high delay multiple.  Defaults to 0 (disabled)
+- name: filestore_queue_high_delay_multiple
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# Filestore max delay multiple bytes.  Defaults to 0 (disabled)
+- name: filestore_queue_max_delay_multiple_bytes
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# Filestore high delay multiple bytes.  Defaults to 0 (disabled)
+- name: filestore_queue_high_delay_multiple_bytes
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# Filestore max delay multiple ops.  Defaults to 0 (disabled)
+- name: filestore_queue_max_delay_multiple_ops
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# Filestore high delay multiple ops.  Defaults to 0 (disabled)
+- name: filestore_queue_high_delay_multiple_ops
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: filestore_queue_low_threshhold
+  type: float
+  level: dev
+  default: 0.3
+  with_legacy: true
+- name: filestore_queue_high_threshhold
+  type: float
+  level: dev
+  with_legacy: true
+  default: 0.9
+- name: filestore_op_threads
+  type: int
+  level: advanced
+  desc: Threads used to apply changes to backing file system
+  default: 2
+  with_legacy: true
+- name: filestore_op_thread_timeout
+  type: int
+  level: advanced
+  desc: Seconds before a worker thread is considered stalled
+  default: 1_min
+  with_legacy: true
+- name: filestore_op_thread_suicide_timeout
+  type: int
+  level: advanced
+  desc: Seconds before a worker thread is considered dead
+  default: 3_min
+  with_legacy: true
+- name: filestore_commit_timeout
+  type: float
+  level: advanced
+  desc: Seconds before backing file system is considered hung
+  default: 10_min
+  with_legacy: true
+- name: filestore_fiemap_threshold
+  type: size
+  level: dev
+  default: 4_K
+  with_legacy: true
+- name: filestore_merge_threshold
+  type: int
+  level: dev
+  default: -10
+  with_legacy: true
+- name: filestore_split_multiple
+  type: int
+  level: dev
+  default: 2
+  with_legacy: true
+- name: filestore_split_rand_factor
+  type: uint
+  level: dev
+  default: 20
+  with_legacy: true
+- name: filestore_update_to
+  type: int
+  level: dev
+  default: 1000
+  with_legacy: true
+- name: filestore_blackhole
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: filestore_fd_cache_size
+  type: int
+  level: dev
+  default: 128
+  with_legacy: true
+- name: filestore_fd_cache_shards
+  type: int
+  level: dev
+  default: 16
+  with_legacy: true
+- name: filestore_ondisk_finisher_threads
+  type: int
+  level: dev
+  default: 1
+  with_legacy: true
+- name: filestore_apply_finisher_threads
+  type: int
+  level: dev
+  default: 1
+  with_legacy: true
+# file onto which store transaction dumps
+- name: filestore_dump_file
+  type: str
+  level: dev
+  with_legacy: true
+# inject a failure at the n'th opportunity
+- name: filestore_kill_at
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+# artificially stall for N seconds in op queue thread
+- name: filestore_inject_stall
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+# fail/crash on EIO
+- name: filestore_fail_eio
+  type: bool
+  level: dev
+  default: true
+  with_legacy: true
+- name: filestore_debug_verify_split
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: journal_dio
+  type: bool
+  level: dev
+  default: true
+  fmt_desc: Enables direct i/o to the journal. Requires ``journal block
+   align`` set to ``true``.
+  with_legacy: true
+- name: journal_aio
+  type: bool
+  level: dev
+  default: true
+  fmt_desc: Enables using ``libaio`` for asynchronous writes to the journal.
+   Requires ``journal dio`` set to ``true``. Version 0.61 and later, ``true``.
+   Version 0.60 and earlier, ``false``.
+  with_legacy: true
+- name: journal_force_aio
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: journal_block_size
+  type: size
+  level: dev
+  default: 4_K
+  with_legacy: true
+- name: journal_block_align
+  type: bool
+  level: dev
+  default: true
+  fmt_desc: Block aligns write operations. Required for ``dio`` and ``aio``.
+  with_legacy: true
+- name: journal_write_header_frequency
+  type: uint
+  level: dev
+  default: 0
+  with_legacy: true
+- name: journal_max_write_bytes
+  type: size
+  level: advanced
+  desc: Max bytes in flight to journal
+  fmt_desc: The maximum number of bytes the journal will write at
+   any one time.
+  default: 10_M
+  with_legacy: true
+- name: journal_max_write_entries
+  type: int
+  level: advanced
+  desc: Max IOs in flight to journal
+  fmt_desc: The maximum number of entries the journal will write at
+   any one time.
+  default: 100
+  with_legacy: true
+# Target range for journal fullness
+- name: journal_throttle_low_threshhold
+  type: float
+  level: dev
+  default: 0.6
+  with_legacy: true
+- name: journal_throttle_high_threshhold
+  type: float
+  level: dev
+  default: 0.9
+  with_legacy: true
+# Multiple over expected at high_threshhold. Defaults to 0 (disabled).
+- name: journal_throttle_high_multiple
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# Multiple over expected at max.  Defaults to 0 (disabled).
+- name: journal_throttle_max_multiple
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+# align data payloads >= this.
+- name: journal_align_min_size
+  type: size
+  level: dev
+  default: 64_K
+  fmt_desc: Align data payloads greater than the specified minimum.
+  with_legacy: true
+- name: journal_replay_from
+  type: int
+  level: dev
+  default: 0
+  with_legacy: true
+- name: journal_zero_on_create
+  type: bool
+  level: dev
+  default: false
+  fmt_desc: |
+    Causes the file store to overwrite the entire journal with
+    ``0``'s during ``mkfs``.
+  with_legacy: true
+# assume journal is not corrupt
+- name: journal_ignore_corruption
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# when using an SSD as the journal, whether to discard unused journal data.
+- name: journal_discard
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# fio data directory for fio-objectstore
+- name: fio_dir
+  type: str
+  level: advanced
+  default: /tmp/fio
+  with_legacy: true
+- name: rados_mon_op_timeout
+  type: secs
+  level: advanced
+  desc: timeout for operations handled by monitors such as statfs (0 is unlimited)
+  default: 0
+  min: 0
+  flags:
+  - runtime
+- name: rados_osd_op_timeout
+  type: secs
+  level: advanced
+  desc: timeout for operations handled by osds such as write (0 is unlimited)
+  default: 0
+  min: 0
+  flags:
+  - runtime
+# true if LTTng-UST tracepoints should be enabled
+- name: rados_tracing
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: mgr_connect_retry_interval
+  type: float
+  level: dev
+  default: 1
+  services:
+  - common
+- name: mgr_client_service_daemon_unregister_timeout
+  type: float
+  level: dev
+  desc: Time to wait during shutdown to deregister service with mgr
+  default: 1
+- name: throttler_perf_counter
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: event_tracing
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: bluestore_tracing
+  type: bool
+  level: advanced
+  desc: Enable bluestore event tracing.
+  default: false
+- name: bluestore_throttle_trace_rate
+  type: float
+  level: advanced
+  desc: Rate at which to sample bluestore transactions (per second)
+  default: 0
+- name: debug_deliberately_leak_memory
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+- name: debug_asserts_on_shutdown
+  type: bool
+  level: dev
+  desc: Enable certain asserts to check for refcounting bugs on shutdown; see http://tracker.ceph.com/issues/21738
+  default: false
+- name: debug_asok_assert_abort
+  type: bool
+  level: dev
+  desc: allow commands 'assert' and 'abort' via asok for testing crash dumps etc
+  default: false
+  with_legacy: true
+- name: target_max_misplaced_ratio
+  type: float
+  level: basic
+  desc: Max ratio of misplaced objects to target when throttling data rebalancing
+    activity
+  default: 0.05
+- name: device_failure_prediction_mode
+  type: str
+  level: basic
+  desc: Method used to predict device failures
+  long_desc: To disable prediction, use 'none'. 'local' uses a prediction model that
+    runs inside the mgr daemon.  'cloud' will share metrics with a cloud service and
+    query the service for device life expectancy.
+  default: none
+  enum_values:
+  - none
+  - local
+  - cloud
+  flags:
+  - runtime
+- name: gss_ktab_client_file
+  type: str
+  level: advanced
+  desc: GSS/KRB5 Keytab file for client authentication
+  long_desc: This sets the full path for the GSS/Kerberos client keytab file location.
+  default: /var/lib/ceph/$name/gss_client_$name.ktab
+  services:
+  - mon
+  - osd
+- name: gss_target_name
+  type: str
+  level: advanced
+  long_desc: This sets the gss target service name.
+  default: ceph
+  services:
+  - mon
+  - osd
+- name: debug_disable_randomized_ping
+  type: bool
+  level: dev
+  desc: Disable heartbeat ping randomization for testing purposes
+  default: false
+- name: debug_heartbeat_testing_span
+  type: int
+  level: dev
+  desc: Override 60 second periods for testing only
+  default: 0
+- name: librados_thread_count
+  type: uint
+  level: advanced
+  desc: Size of thread pool for Objecter
+  default: 2
+  tags:
+  - client
+  min: 1
+- name: osd_asio_thread_count
+  type: uint
+  level: advanced
+  desc: Size of thread pool for ASIO completions
+  default: 2
+  tags:
+  - osd
+  min: 1
+- name: cephsqlite_lock_renewal_interval
+  type: millisecs
+  level: advanced
+  desc: number of milliseconds before lock is renewed
+  default: 2000
+  tags:
+  - client
+  see_also:
+  - cephsqlite_lock_renewal_timeout
+  min: 100
+- name: cephsqlite_lock_renewal_timeout
+  type: millisecs
+  level: advanced
+  desc: number of milliseconds before transaction lock times out
+  long_desc: The amount of time before a running libcephsqlite VFS connection has
+    to renew a lock on the database before the lock is automatically lost. If the
+    lock is lost, the VFS will abort the process to prevent database corruption.
+  default: 30000
+  tags:
+  - client
+  see_also:
+  - cephsqlite_lock_renewal_interval
+  min: 100
+- name: cephsqlite_blocklist_dead_locker
+  type: bool
+  level: advanced
+  desc: blocklist the last dead owner of the database lock
+  long_desc: Require that the Ceph SQLite VFS blocklist the last dead owner of the
+    database when cleanup was incomplete. DO NOT CHANGE THIS UNLESS YOU UNDERSTAND
+    THE RAMIFICATIONS. CORRUPTION MAY RESULT.
+  default: true
+  tags:
+  - client
+- name: bdev_type
+  type: str
+  level: advanced
+  desc: Explicitly set the device type to select the driver if it's needed
+  enum_values:
+  - aio
+  - spdk
+  - pmem
+  - hm_smr
+- name: bluestore_cleaner_sleep_interval
+  type: float
+  level: advanced
+  desc: How long cleaner should sleep before re-checking utilization
+  default: 5
+  with_legacy: true
+- name: jaeger_tracing_enable
+  type: bool
+  level: advanced
+  desc: Ceph should use jaeger tracing system
+  default: false
+  services:
+  - rgw
+  - osd
+  with_legacy: true
+- name: jaeger_agent_port
+  type: int
+  level: advanced
+  desc: port number of the jaeger agent
+  default: 6799
+  services:
+  - rgw
+  - osd
+- name: mgr_ttl_cache_expire_seconds
+  type: uint
+  level: dev
+  desc: Set the time to live in seconds - set to 0 to disable the cache.
+  default: 0
+  services:
+  - mgr
diff --git a/src/common/options/immutable-object-cache.yaml.in b/src/common/options/immutable-object-cache.yaml.in
new file mode 100644
index 000000000..90b13d60d
--- /dev/null
+++ b/src/common/options/immutable-object-cache.yaml.in
@@ -0,0 +1,98 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: immutable_object_cache_path
+  type: str
+  level: advanced
+  desc: immutable object cache data dir
+  default: /tmp/ceph_immutable_object_cache
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_sock
+  type: str
+  level: advanced
+  desc: immutable object cache domain socket
+  default: /var/run/ceph/immutable_object_cache_sock
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_max_size
+  type: size
+  level: advanced
+  desc: max immutable object cache data size
+  default: 1_G
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_max_inflight_ops
+  type: uint
+  level: advanced
+  desc: max inflight promoting requests for immutable object cache daemon
+  default: 128
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_client_dedicated_thread_num
+  type: uint
+  level: advanced
+  desc: immutable object cache client dedicated thread number
+  default: 2
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_watermark
+  type: float
+  level: advanced
+  desc: immutable object cache water mark
+  default: 0.9
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_qos_schedule_tick_min
+  type: millisecs
+  level: advanced
+  desc: minimum schedule tick for immutable object cache
+  default: 50
+  services:
+  - immutable-object-cache
+  min: 1
+- name: immutable_object_cache_qos_iops_limit
+  type: uint
+  level: advanced
+  desc: the desired immutable object cache IO operations limit per second
+  default: 0
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_qos_iops_burst
+  type: uint
+  level: advanced
+  desc: the desired burst limit of immutable object cache IO operations
+  default: 0
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_qos_iops_burst_seconds
+  type: secs
+  level: advanced
+  desc: the desired burst duration in seconds of immutable object cache IO operations
+  default: 1
+  services:
+  - immutable-object-cache
+  min: 1
+- name: immutable_object_cache_qos_bps_limit
+  type: uint
+  level: advanced
+  desc: the desired immutable object cache IO bytes limit per second
+  default: 0
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_qos_bps_burst
+  type: uint
+  level: advanced
+  desc: the desired burst limit of immutable object cache IO bytes
+  default: 0
+  services:
+  - immutable-object-cache
+- name: immutable_object_cache_qos_bps_burst_seconds
+  type: secs
+  level: advanced
+  desc: the desired burst duration in seconds of immutable object cache IO bytes
+  default: 1
+  services:
+  - immutable-object-cache
+  min: 1
diff --git a/src/common/options/legacy_config_opts.h b/src/common/options/legacy_config_opts.h
new file mode 100644
index 000000000..3f8c8244c
--- /dev/null
+++ b/src/common/options/legacy_config_opts.h
@@ -0,0 +1,11 @@
+#include "global_legacy_options.h"
+#include "cephfs-mirror_legacy_options.h"
+#include "mds_legacy_options.h"
+#include "mds-client_legacy_options.h"
+#include "mgr_legacy_options.h"
+#include "mon_legacy_options.h"
+#include "osd_legacy_options.h"
+#include "rbd_legacy_options.h"
+#include "rbd-mirror_legacy_options.h"
+#include "immutable-object-cache_legacy_options.h"
+#include "rgw_legacy_options.h"
diff --git a/src/common/options/mds-client.yaml.in b/src/common/options/mds-client.yaml.in
new file mode 100644
index 000000000..4e599d4cf
--- /dev/null
+++ b/src/common/options/mds-client.yaml.in
@@ -0,0 +1,580 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: client_cache_size
+  type: size
+  level: basic
+  desc: soft maximum number of directory entries in client cache
+  fmt_desc: Set the number of inodes that the client keeps in the metadata cache.
+  default: 16_K
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_cache_mid
+  type: float
+  level: advanced
+  desc: mid-point of client cache LRU
+  fmt_desc: Set client cache midpoint. The midpoint splits the least recently used
+    lists into a hot and warm list.
+  default: 0.75
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_use_random_mds
+  type: bool
+  level: dev
+  desc: issue new requests to a random active MDS
+  fmt_desc: Choose random MDS for each request.
+  default: false
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_mount_timeout
+  type: secs
+  level: advanced
+  desc: timeout for mounting CephFS (seconds)
+  fmt_desc: Set the timeout for CephFS mount in seconds.
+  default: 5_min
+  services:
+  - mds_client
+- name: client_tick_interval
+  type: secs
+  level: dev
+  desc: seconds between client upkeep ticks
+  fmt_desc: Set the interval in seconds between capability renewal and other upkeep.
+  default: 1
+  services:
+  - mds_client
+- name: client_trace
+  type: str
+  level: dev
+  desc: file containing trace of client operations
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_readahead_min
+  type: size
+  level: advanced
+  desc: minimum bytes to readahead in a file
+  fmt_desc: Set the minimum number of bytes that the client reads ahead.
+  default: 128_K
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_readahead_max_bytes
+  type: size
+  level: advanced
+  desc: maximum bytes to readahead in a file (zero is unlimited)
+  fmt_desc: Set the maximum number of bytes that the client reads ahead for
+    future read operations. Overridden by the ``client_readahead_max_periods``
+    setting.
+  default: 0
+  services:
+  - mds_client
+  with_legacy: true
+# as multiple of file layout period (object size * num stripes)
+- name: client_readahead_max_periods
+  type: int
+  level: advanced
+  desc: maximum stripe periods to readahead in a file
+  fmt_desc: Set the number of file layout periods (object size * number of
+    stripes) that the client reads ahead. Overrides the
+    ``client_readahead_max_bytes`` setting.
+  default: 4
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_reconnect_stale
+  type: bool
+  level: advanced
+  desc: reconnect when the session becomes stale
+  default: false
+  services:
+  - mds_client
+- name: client_snapdir
+  type: str
+  level: advanced
+  desc: pseudo directory for snapshot access to a directory
+  fmt_desc: Set the snapshot directory name.
+  default: .snap
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_mountpoint
+  type: str
+  level: advanced
+  desc: default mount-point
+  fmt_desc: Directory to mount on the CephFS file system. An alternative to the
+    ``-r`` option of the ``ceph-fuse`` command.
+  default: /
+  services:
+  - mds_client
+- name: client_mount_uid
+  type: int
+  level: advanced
+  desc: uid to mount as
+  default: -1
+  services:
+  - mds_client
+  fmt_desc: Set the user ID of CephFS mount.
+  with_legacy: true
+- name: client_mount_gid
+  type: int
+  level: advanced
+  desc: gid to mount as
+  fmt_desc: Set the group ID of CephFS mount.
+  default: -1
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_notify_timeout
+  type: int
+  level: dev
+  default: 10
+  services:
+  - mds_client
+  with_legacy: true
+- name: osd_client_watch_timeout
+  type: int
+  level: dev
+  default: 30
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_caps_release_delay
+  type: secs
+  level: dev
+  default: 5
+  services:
+  - mds_client
+  fmt_desc: Set the delay between capability releases in seconds. The delay
+    sets how many seconds a client waits to release capabilities that it no
+    longer needs in case the capabilities are needed for another user space
+    operation.
+- name: client_quota_df
+  type: bool
+  level: advanced
+  desc: show quota usage for statfs (df)
+  fmt_desc: Report root directory quota for the ``statfs`` operation.
+  default: true
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_oc
+  type: bool
+  level: advanced
+  desc: enable object caching
+  default: true
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_oc_size
+  type: size
+  level: advanced
+  desc: maximum size of object cache
+  fmt_desc: Set how many bytes of data the client will cache.
+  default: 200_M
+  services:
+  - mds_client
+  flags:
+  - runtime
+  with_legacy: true
+# MB * n  (dirty OR tx.. bigish)
+- name: client_oc_max_dirty
+  type: size
+  level: advanced
+  desc: maximum size of dirty pages in object cache
+  fmt_desc: Set the maximum number of dirty bytes in the object cache.
+  default: 100_M
+  services:
+  - mds_client
+  flags:
+  - runtime
+  with_legacy: true
+# target dirty (keep this smallish)
+- name: client_oc_target_dirty
+  type: size
+  level: advanced
+  desc: target size of dirty pages in object cache
+  fmt_desc: Set the target size of dirty data. We recommend keeping this number low.
+  default: 8_M
+  services:
+  - mds_client
+  flags:
+  - runtime
+  with_legacy: true
+- name: client_oc_max_dirty_age
+  type: float
+  level: advanced
+  desc: maximum age of dirty pages in object cache (seconds)
+  fmt_desc: Set the maximum age in seconds of dirty data in the object cache
+    before writeback.
+  default: 5
+  services:
+  - mds_client
+  flags:
+  - runtime
+  with_legacy: true
+- name: client_oc_max_objects
+  type: int
+  level: advanced
+  desc: maximum number of objects in cache
+  fmt_desc: Set the maximum number of objects in the object cache.
+  default: 1000
+  services:
+  - mds_client
+  flags:
+  - runtime
+  with_legacy: true
+# check if MDS reply contains wanted caps
+- name: client_debug_getattr_caps
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds_client
+  with_legacy: true
+# always read synchronously (go to osds)
+- name: client_debug_force_sync_read
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds_client
+  fmt_desc: If set to ``true``, clients read data directly from OSDs instead
+    of using a local page cache.
+  with_legacy: true
+- name: client_debug_inject_tick_delay
+  type: secs
+  level: dev
+  default: 0
+  services:
+  - mds_client
+- name: client_max_inline_size
+  type: size
+  level: dev
+  default: 4_K
+  services:
+  - mds_client
+  fmt_desc: Set the maximum size of inlined data stored in a file inode rather
+    than in a separate data object in RADOS. This setting only applies if the
+    ``inline_data`` flag is set on the MDS map.
+  with_legacy: true
+# synthetic client bug for testing
+- name: client_inject_release_failure
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds_client
+  with_legacy: true
+# synthetic client bug for testing
+- name: client_inject_fixed_oldest_tid
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_metadata
+  type: str
+  level: advanced
+  desc: metadata key=value comma-delimited pairs appended to session metadata
+  fmt_desc: Comma-delimited strings for client metadata sent to each MDS, in addition
+    to the automatically generated version, host name, and other metadata.
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_acl_type
+  type: str
+  level: advanced
+  desc: ACL type to enforce (none or "posix_acl")
+  fmt_desc: Set the ACL type. Currently, the only possible values are ``"posix_acl"`` to
+    enable POSIX ACL, or an empty string. This option only takes effect when the
+    ``fuse_default_permissions`` is set to ``false``.
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_permissions
+  type: bool
+  level: advanced
+  desc: client-enforced permission checking
+  fmt_desc: Check client permissions on all I/O operations.
+  default: true
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_dirsize_rbytes
+  type: bool
+  level: advanced
+  desc: set the directory size as the number of file bytes recursively used
+  long_desc: This option enables a CephFS feature that stores the recursive directory
+    size (the bytes used by files in the directory and its descendents) in the st_size
+    field of the stat structure.
+  default: true
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_force_lazyio
+  type: bool
+  level: advanced
+  default: false
+  services:
+  - mds_client
+- name: fuse_use_invalidate_cb
+  type: bool
+  level: advanced
+  desc: use fuse 2.8+ invalidate callback to keep page cache consistent
+  default: true
+  services:
+  - mds_client
+- name: fuse_disable_pagecache
+  type: bool
+  level: advanced
+  desc: disable page caching in the kernel for this FUSE mount
+  fmt_desc: If set to ``true``, kernel page cache is disabled for ``ceph-fuse``
+    mounts. When multiple clients read/write to a file at the same
+    time, readers may get stale data from page cache. Due to
+    limitations of FUSE, ``ceph-fuse`` can't disable page cache dynamically.
+  default: false
+  services:
+  - mds_client
+- name: fuse_allow_other
+  type: bool
+  level: advanced
+  desc: pass allow_other to FUSE on mount
+  default: true
+  services:
+  - mds_client
+- name: fuse_default_permissions
+  type: bool
+  level: advanced
+  desc: pass default_permissions to FUSE on mount
+  fmt_desc: When set to ``false``, the ``ceph-fuse`` utility does its own
+    permissions checking, instead of relying on the permissions enforcement in
+    FUSE. Set to ``false`` together with the ``client acl type=posix_acl``
+    option to enable POSIX ACL.
+  default: false
+  services:
+  - mds_client
+  flags:
+  - startup
+- name: fuse_splice_read
+  type: bool
+  level: advanced
+  desc: enable splice read to reduce the memory copies
+  default: true
+  services:
+  - mds_client
+- name: fuse_splice_write
+  type: bool
+  level: advanced
+  desc: enable splice write to reduce the memory copies
+  default: true
+  services:
+  - mds_client
+- name: fuse_splice_move
+  type: bool
+  level: advanced
+  desc: enable splice move to reduce the memory copies
+  default: true
+  services:
+  - mds_client
+- name: fuse_big_writes
+  type: bool
+  level: advanced
+  desc: big_writes is deprecated in libfuse 3.0.0
+  default: true
+  services:
+  - mds_client
+- name: fuse_max_write
+  type: size
+  level: advanced
+  desc: set the maximum number of bytes in a single write operation
+  long_desc: Set the maximum number of bytes in a single write operation that may
+    pass atomically through FUSE. The FUSE default is 128kB and may be indicated by
+    setting this option to 0.
+  fmt_desc: Set the maximum number of bytes in a single write operation. A value of
+    0 indicates no change; the FUSE default of 128 kbytes remains in force.
+  default: 0
+  services:
+  - mds_client
+- name: fuse_atomic_o_trunc
+  type: bool
+  level: advanced
+  desc: pass atomic_o_trunc flag to FUSE on mount
+  default: true
+  services:
+  - mds_client
+- name: fuse_debug
+  type: bool
+  level: advanced
+  desc: enable debugging for the libfuse
+  default: false
+  services:
+  - mds_client
+  flags:
+  - no_mon_update
+  - startup
+- name: fuse_multithreaded
+  type: bool
+  level: advanced
+  desc: allow parallel processing through FUSE library
+  default: true
+  services:
+  - mds_client
+- name: fuse_require_active_mds
+  type: bool
+  level: advanced
+  desc: require active MDSs in the file system when mounting
+  default: true
+  services:
+  - mds_client
+- name: fuse_syncfs_on_mksnap
+  type: bool
+  level: advanced
+  desc: synchronize all local metadata/file changes after snapshot
+  default: true
+  services:
+  - mds_client
+- name: fuse_set_user_groups
+  type: bool
+  level: advanced
+  desc: check for ceph-fuse to consider supplementary groups for permissions
+  default: true
+  services:
+  - mds_client
+# the client should try to use dentry invalidation instead of remounting, on kernels where it believes that will work
+- name: client_try_dentry_invalidate
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_max_retries_on_remount_failure
+  type: uint
+  level: advanced
+  desc: number of consecutive failed remount attempts for invalidating kernel dcache
+    after which client would abort.
+  default: 5
+  services:
+  - mds_client
+- name: client_die_on_failed_remount
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds_client
+- name: client_die_on_failed_dentry_invalidate
+  type: bool
+  level: advanced
+  desc: kill the client when no dentry invalidation options are available
+  long_desc: The CephFS client requires a mechanism to invalidate dentries in the
+    caller (e.g. the kernel for ceph-fuse) when capabilities must be recalled. If
+    the client cannot do this then the MDS cache cannot shrink which can cause the
+    MDS to fail.
+  default: true
+  services:
+  - mds_client
+- name: client_check_pool_perm
+  type: bool
+  level: advanced
+  desc: confirm access to inode's data pool/namespace described in file layout
+  default: true
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_use_faked_inos
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds_client
+  flags:
+  - startup
+  - no_mon_update
+  with_legacy: true
+- name: client_fs
+  type: str
+  level: advanced
+  desc: CephFS file system name to mount
+  long_desc: Use this with ceph-fuse, or with any process that uses libcephfs.  Programs
+    using libcephfs may also pass the filesystem name into mount(), which will override
+    this setting. If no filesystem name is given in mount() or this setting, the default
+    filesystem will be mounted (usually the first created).
+  services:
+  - mds_client
+  flags:
+  - startup
+- name: client_mds_namespace
+  type: str
+  level: dev
+  services:
+  - mds_client
+  flags:
+  - startup
+- name: fake_statfs_for_testing
+  type: int
+  level: dev
+  desc: Set a value for kb and compute kb_used from total of num_bytes
+  default: 0
+  services:
+  - mds_client
+  with_legacy: true
+# XXX: mon
+- name: debug_allow_any_pool_priority
+  type: bool
+  level: dev
+  desc: Allow any pool priority to be set to test conversion to new range
+  default: false
+  services:
+  - mds_client
+  with_legacy: true
+- name: client_asio_thread_count
+  type: uint
+  level: advanced
+  desc: Size of thread pool for ASIO completions
+  default: 2
+  tags:
+  - client
+  services:
+  - mds_client
+  min: 1
+- name: client_shutdown_timeout
+  type: secs
+  level: advanced
+  desc: timeout for shutting down CephFS
+  long_desc: Timeout for shutting down CephFS via unmount or shutdown.
+  default: 30
+  tags:
+  - client
+  services:
+  - mds_client
+  min: 0
+  flags:
+  - runtime
+- name: client_collect_and_send_global_metrics
+  type: bool
+  level: advanced
+  desc: to enable and force collecting and sending the global metrics to MDS
+  long_desc: Use this with care. When connecting to some old Ceph clusters,
+    it may crash the MDS daemons while upgrading.
+  default: false
+  tags:
+  - client
+  services:
+  - mds_client
+  flags:
+  - runtime
+- name: client_quota
+  type: bool
+  level: advanced
+  desc: Enable quota enforcement
+  long_desc: Enable quota_bytes and quota_files enforcement for the client.
+  default: true
+  services:
+  - mds_client
+  flags:
+  - runtime
diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in
new file mode 100644
index 000000000..6eb0702fc
--- /dev/null
+++ b/src/common/options/mds.yaml.in
@@ -0,0 +1,1536 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: mds_alternate_name_max
+  type: size
+  level: advanced
+  desc: set the maximum length of alternate names for dentries
+  default: 8_K
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_fscrypt_last_block_max_size
+  type: size
+  level: advanced
+  desc: maximum size of the last block without the header along with a truncate
+    request when the fscrypt is enabled.
+  default: 4_K
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_valgrind_exit
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_standby_replay_damaged
+  type: bool
+  level: dev
+  default: false
+  flags:
+  - runtime
+- name: mds_numa_node
+  type: int
+  level: advanced
+  desc: set mds's cpu affinity to a numa node (-1 for none)
+  default: -1
+  services:
+  - mds
+  flags:
+  - startup
+- name: mds_data
+  type: str
+  level: advanced
+  desc: path to MDS data and keyring
+  default: /var/lib/ceph/mds/$cluster-$id
+  services:
+  - mds
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: mds_join_fs
+  type: str
+  level: basic
+  desc: file system MDS prefers to join
+  long_desc: This setting indicates which file system name the MDS should prefer to
+    join (affinity). The monitors will try to have the MDS cluster safely reach a
+    state where all MDS have strong affinity, even via failovers to a standby.
+  services:
+  - mds
+  flags:
+  - runtime
+# max xattr kv pairs size for each dir/file
+- name: mds_max_xattr_pairs_size
+  type: size
+  level: advanced
+  desc: maximum aggregate size of extended attributes on a file
+  default: 64_K
+  services:
+  - mds
+  with_legacy: true
+- name: mds_cache_trim_interval
+  type: secs
+  level: advanced
+  desc: interval in seconds between cache trimming
+  default: 1
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_cache_release_free_interval
+  type: secs
+  level: dev
+  desc: interval in seconds between heap releases
+  default: 10
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_cache_memory_limit
+  type: size
+  level: basic
+  desc: target maximum memory usage of MDS cache
+  long_desc: This sets a target maximum memory usage of the MDS cache and is the primary
+    tunable to limit the MDS memory usage. The MDS will try to stay under a reservation
+    of this limit (by default 95%; 1 - mds_cache_reservation) by trimming unused metadata
+    in its cache and recalling cached items in the client caches. It is possible for
+    the MDS to exceed this limit due to slow recall from clients. The mds_health_cache_threshold
+    (150%) sets a cache full threshold for when the MDS signals a cluster health warning.
+  default: 4_G
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_cache_reservation
+  type: float
+  level: advanced
+  desc: amount of memory to reserve for future cached objects
+  fmt_desc: The cache reservation (memory or inodes) for the MDS cache to maintain.
+    Once the MDS begins dipping into its reservation, it will recall
+    client state until its cache size shrinks to restore the
+    reservation.
+  default: 0.05
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_health_cache_threshold
+  type: float
+  level: advanced
+  desc: threshold for cache size to generate health warning
+  default: 1.5
+  services:
+  - mds
+- name: mds_cache_mid
+  type: float
+  level: advanced
+  desc: midpoint for MDS cache LRU
+  fmt_desc: The insertion point for new items in the cache LRU
+    (from the top).
+  default: 0.7
+  services:
+  - mds
+- name: mds_cache_trim_decay_rate
+  type: float
+  level: advanced
+  desc: decay rate for trimming MDS cache throttle
+  default: 1
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_cache_trim_threshold
+  type: size
+  level: advanced
+  desc: threshold for number of dentries that can be trimmed
+  default: 256_K
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_max_file_recover
+  type: uint
+  level: advanced
+  desc: maximum number of files to recover file sizes in parallel
+  default: 32
+  services:
+  - mds
+  with_legacy: true
+- name: mds_dir_max_commit_size
+  type: int
+  level: advanced
+  desc: maximum size in megabytes for a RADOS write to a directory
+  fmt_desc: The maximum size of a directory update before Ceph breaks it into
+    smaller transactions (MB).
+  default: 10
+  services:
+  - mds
+  with_legacy: true
+- name: mds_dir_keys_per_op
+  type: int
+  level: advanced
+  desc: number of directory entries to read in one RADOS operation
+  default: 16384
+  services:
+  - mds
+  with_legacy: true
+- name: mds_decay_halflife
+  type: float
+  level: advanced
+  desc: rate of decay for temperature counters on each directory for balancing
+  default: 5
+  services:
+  - mds
+  with_legacy: true
+- name: mds_beacon_interval
+  type: float
+  level: advanced
+  desc: interval in seconds between MDS beacon messages sent to monitors
+  default: 4
+  services:
+  - mds
+  with_legacy: true
+- name: mds_beacon_grace
+  type: float
+  level: advanced
+  desc: tolerance in seconds for missed MDS beacons to monitors
+  fmt_desc: The interval without beacons before Ceph declares an MDS laggy
+    (and possibly replaces it).
+  default: 15
+  services:
+  - mds
+  with_legacy: true
+- name: mds_heartbeat_reset_grace
+  type: uint
+  level: advanced
+  desc: the number of iterations a loop may keep running while holding the
+    mds_lock before it must trigger a heartbeat reset
+  default: 1000
+  services:
+  - mds
+- name: mds_heartbeat_grace
+  type: float
+  level: advanced
+  desc: tolerance in seconds for MDS internal heartbeat
+  default: 15
+  services:
+  - mds
+- name: mds_enforce_unique_name
+  type: bool
+  level: advanced
+  desc: require MDS name is unique in the cluster
+  default: true
+  services:
+  - mds
+  with_legacy: true
+# whether to blocklist clients whose sessions are dropped due to timeout
+- name: mds_session_blocklist_on_timeout
+  type: bool
+  level: advanced
+  desc: blocklist clients whose sessions have become stale
+  default: true
+  services:
+  - mds
+  with_legacy: true
+# whether to blocklist clients whose sessions are dropped via admin commands
+- name: mds_session_blocklist_on_evict
+  type: bool
+  level: advanced
+  desc: blocklist clients that have been evicted
+  default: true
+  services:
+  - mds
+  with_legacy: true
+# how many sessions should I try to load/store in a single OMAP operation?
+- name: mds_sessionmap_keys_per_op
+  type: uint
+  level: advanced
+  desc: number of omap keys to read from the SessionMap in one operation
+  default: 1_K
+  services:
+  - mds
+  with_legacy: true
+- name: mds_recall_max_caps
+  type: size
+  level: advanced
+  desc: maximum number of caps to recall from client session in single recall
+  default: 30000
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_recall_max_decay_rate
+  type: float
+  level: advanced
+  desc: decay rate for throttle on recalled caps on a session
+  default: 1.5
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_recall_max_decay_threshold
+  type: size
+  level: advanced
+  desc: decay threshold for throttle on recalled caps on a session
+  default: 128_K
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_recall_global_max_decay_threshold
+  type: size
+  level: advanced
+  desc: decay threshold for throttle on recalled caps globally
+  default: 128_K
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_recall_warning_threshold
+  type: size
+  level: advanced
+  desc: decay threshold for warning on slow session cap recall
+  default: 256_K
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_recall_warning_decay_rate
+  type: float
+  level: advanced
+  desc: decay rate for warning on slow session cap recall
+  default: 60
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_session_cache_liveness_decay_rate
+  type: float
+  level: advanced
+  desc: decay rate for session liveness leading to preemptive cap recall
+  long_desc: This determines how long a session needs to be quiescent before the MDS
+    begins preemptively recalling capabilities. The default of 5 minutes will cause
+    10 halvings of the decay counter after 1 hour, or 1/1024. The default magnitude
+    of 10 (2^10 or 1024) is chosen so that the MDS considers a previously chatty session
+    (approximately) to be quiescent after 1 hour.
+  default: 5_min
+  services:
+  - mds
+  see_also:
+  - mds_session_cache_liveness_magnitude
+  flags:
+  - runtime
+- name: mds_session_cache_liveness_magnitude
+  type: size
+  level: advanced
+  desc: decay magnitude for preemptively recalling caps on quiet client
+  long_desc: This is the order of magnitude difference (in base 2) of the internal
+    liveness decay counter and the number of capabilities the session holds. When
+    this difference occurs, the MDS treats the session as quiescent and begins recalling
+    capabilities.
+  default: 10
+  services:
+  - mds
+  see_also:
+  - mds_session_cache_liveness_decay_rate
+  flags:
+  - runtime
+- name: mds_session_cap_acquisition_decay_rate
+  type: float
+  level: advanced
+  desc: decay rate for session readdir caps leading to readdir throttle
+  long_desc: The half-life for the session cap acquisition counter of caps
+    acquired by readdir. This is used for throttling readdir requests from
+    clients.
+  default: 30
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_session_cap_acquisition_throttle
+  type: uint
+  level: advanced
+  desc: threshold at which the cap acquisition decay counter throttles
+  default: 100000
+  services:
+  - mds
+- name: mds_session_max_caps_throttle_ratio
+  type: float
+  level: advanced
+  desc: ratio of mds_max_caps_per_client that client must exceed before readdir may
+    be throttled by cap acquisition throttle
+  default: 1.1
+  services:
+  - mds
+- name: mds_cap_acquisition_throttle_retry_request_timeout
+  type: float
+  level: advanced
+  desc: timeout in seconds after which a client request is retried due to cap acquisition
+    throttling
+  default: 0.5
+  services:
+  - mds
+# detecting freeze tree deadlock
+- name: mds_freeze_tree_timeout
+  type: float
+  level: dev
+  default: 30
+  services:
+  - mds
+  with_legacy: true
+# collapse N-client health metrics to a single 'many'
+- name: mds_health_summarize_threshold
+  type: int
+  level: advanced
+  desc: threshold of number of clients to summarize late client recall
+  default: 10
+  services:
+  - mds
+  with_legacy: true
+# seconds to wait for clients during mds restart
+# make it (mdsmap.session_timeout - mds_beacon_grace)
+- name: mds_reconnect_timeout
+  type: float
+  level: advanced
+  desc: timeout in seconds to wait for clients to reconnect during MDS reconnect recovery
+    state
+  default: 45
+  services:
+  - mds
+  with_legacy: true
+- name: mds_deny_all_reconnect
+  type: bool
+  level: advanced
+  desc: flag to deny all client reconnects during failover
+  default: false
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_dir_prefetch
+  type: bool
+  level: advanced
+  desc: flag to prefetch entire dir
+  default: true
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_tick_interval
+  type: float
+  level: advanced
+  desc: time in seconds between upkeep tasks
+  fmt_desc: How frequently the MDS performs internal periodic tasks.
+  default: 5
+  services:
+  - mds
+  with_legacy: true
+# try to avoid propagating more often than this
+- name: mds_dirstat_min_interval
+  type: float
+  level: dev
+  default: 1
+  services:
+  - mds
+  fmt_desc: The minimum interval (in seconds) to try to avoid propagating
+    recursive stats up the tree.
+  with_legacy: true
+# how quickly dirstat changes propagate up the hierarchy
+- name: mds_scatter_nudge_interval
+  type: float
+  level: advanced
+  desc: minimum interval between scatter lock updates
+  fmt_desc: How quickly dirstat changes propagate up.
+  default: 5
+  services:
+  - mds
+  with_legacy: true
+- name: mds_client_prealloc_inos
+  type: int
+  level: advanced
+  desc: number of unused inodes to pre-allocate to clients for file creation
+  fmt_desc: The number of inode numbers to preallocate per client session.
+  default: 1000
+  services:
+  - mds
+  with_legacy: true
+- name: mds_client_delegate_inos_pct
+  type: uint
+  level: advanced
+  desc: percentage of preallocated inos to delegate to client
+  default: 50
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_early_reply
+  type: bool
+  level: advanced
+  desc: additional reply to clients that metadata requests are complete but not yet
+    durable
+  fmt_desc: Determines whether the MDS should allow clients to see request
+    results before they commit to the journal.
+  default: true
+  services:
+  - mds
+  with_legacy: true
+- name: mds_replay_unsafe_with_closed_session
+  type: bool
+  level: advanced
+  desc: complete all the replay requests when the MDS is restarted, no matter whether
+    the session is closed or not
+  default: false
+  services:
+  - mds
+  flags:
+  - startup
+- name: mds_default_dir_hash
+  type: int
+  level: advanced
+  desc: hash function to select directory fragment for dentry name
+  fmt_desc: The function to use for hashing files across directory fragments.
+  # CEPH_STR_HASH_RJENKINS
+  default: 2
+  services:
+  - mds
+  with_legacy: true
+- name: mds_log_pause
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  with_legacy: true
+- name: mds_log_skip_corrupt_events
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Determines whether the MDS should try to skip corrupt journal
+    events during journal replay.
+  with_legacy: true
+- name: mds_log_max_events
+  type: int
+  level: advanced
+  desc: maximum number of events in the MDS journal (-1 is unlimited)
+  fmt_desc: The maximum events in the journal before we initiate trimming.
+    Set to ``-1`` to disable limits.
+  default: -1
+  services:
+  - mds
+  with_legacy: true
+- name: mds_log_events_per_segment
+  type: int
+  level: advanced
+  desc: maximum number of events in an MDS journal segment
+  default: 1024
+  services:
+  - mds
+  with_legacy: true
+# segment size for mds log, default to default file_layout_t
+- name: mds_log_segment_size
+  type: size
+  level: advanced
+  desc: size in bytes of each MDS log segment
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_log_max_segments
+  type: uint
+  level: advanced
+  desc: maximum number of segments which may be untrimmed
+  fmt_desc: The maximum number of segments (objects) in the journal before
+    we initiate trimming. Set to ``-1`` to disable limits.
+  default: 128
+  services:
+  - mds
+  with_legacy: true
+- name: mds_log_warn_factor
+  type: float
+  level: advanced
+  desc: trigger MDS_HEALTH_TRIM warning when the mds log is longer than mds_log_max_segments
+    * mds_log_warn_factor
+  default: 2
+  services:
+  - mds
+  min: 1
+  flags:
+  - runtime
+- name: mds_bal_export_pin
+  type: bool
+  level: advanced
+  desc: allow setting directory export pins to particular ranks
+  default: true
+  services:
+  - mds
+  with_legacy: true
+- name: mds_export_ephemeral_random
+  type: bool
+  level: advanced
+  desc: allow ephemeral random pinning of the loaded subtrees
+  long_desc: probabilistically pin the loaded directory inode and the subtree beneath
+    it to an MDS based on the consistent hash of the inode number. The higher this
+    value the more likely the loaded subtrees get pinned
+  default: true
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_export_ephemeral_random_max
+  type: float
+  level: advanced
+  desc: the maximum percent permitted for random ephemeral pin policy
+  default: 0.01
+  services:
+  - mds
+  see_also:
+  - mds_export_ephemeral_random
+  min: 0
+  max: 1
+  flags:
+  - runtime
+- name: mds_export_ephemeral_distributed
+  type: bool
+  level: advanced
+  desc: allow ephemeral distributed pinning of the loaded subtrees
+  long_desc: 'pin the immediate child directories of the loaded directory inode based
+    on the consistent hash of the child''s inode number. '
+  default: true
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_export_ephemeral_distributed_factor
+  type: float
+  level: advanced
+  desc: multiple of max_mds for splitting and distributing directory
+  default: 2
+  services:
+  - mds
+  min: 1
+  max: 100
+  flags:
+  - runtime
+- name: mds_bal_sample_interval
+  type: float
+  level: advanced
+  desc: interval in seconds between balancer ticks
+  fmt_desc: Determines how frequently to sample directory temperature
+    (for fragmentation decisions).
+  default: 3
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_replicate_threshold
+  type: float
+  level: advanced
+  desc: hot popularity threshold to replicate a subtree
+  fmt_desc: The minimum temperature before Ceph attempts to replicate
+    metadata to other nodes.
+  default: 8000
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_unreplicate_threshold
+  type: float
+  level: advanced
+  desc: cold popularity threshold to merge subtrees
+  fmt_desc: The minimum temperature before Ceph stops replicating
+    metadata to other nodes.
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_split_size
+  type: int
+  level: advanced
+  desc: minimum size of directory fragment before splitting
+  fmt_desc: The maximum directory size before the MDS will split a directory
+    fragment into smaller bits.
+  default: 10000
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_split_rd
+  type: float
+  level: advanced
+  desc: hot read popularity threshold for splitting a directory fragment
+  fmt_desc: The maximum directory read temperature before Ceph splits
+    a directory fragment.
+  default: 25000
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_split_wr
+  type: float
+  level: advanced
+  desc: hot write popularity threshold for splitting a directory fragment
+  fmt_desc: The maximum directory write temperature before Ceph splits
+    a directory fragment.
+  default: 10000
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_split_bits
+  type: int
+  level: advanced
+  desc: power of two child fragments for a fragment on split
+  fmt_desc: The number of bits by which to split a directory fragment.
+  default: 3
+  services:
+  - mds
+  min: 1
+  max: 24
+  with_legacy: true
+- name: mds_bal_merge_size
+  type: int
+  level: advanced
+  desc: size of fragments where merging should occur
+  fmt_desc: The minimum directory size before Ceph tries to merge
+    adjacent directory fragments.
+  default: 50
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_interval
+  type: int
+  level: advanced
+  desc: interval between MDS balancer cycles
+  fmt_desc: The frequency (in seconds) of workload exchanges between MDSs.
+  default: 10
+  services:
+  - mds
+- name: mds_bal_fragment_interval
+  type: int
+  level: advanced
+  desc: delay in seconds before interrupting client IO to perform splits
+  fmt_desc: The delay (in seconds) between a fragment being eligible for split
+    or merge and executing the fragmentation change.
+  default: 5
+  services:
+  - mds
+# order of magnitude higher than split size
+- name: mds_bal_fragment_size_max
+  type: int
+  level: advanced
+  desc: maximum size of a directory fragment before new creat/links fail
+  fmt_desc: The maximum size of a fragment before any new entries
+    are rejected with ENOSPC.
+  default: 100000
+  services:
+  - mds
+  with_legacy: true
+# multiple of size_max that triggers immediate split
+- name: mds_bal_fragment_fast_factor
+  type: float
+  level: advanced
+  desc: ratio of mds_bal_split_size at which fast fragment splitting occurs
+  fmt_desc: The ratio by which frags may exceed the split size before
+    a split is executed immediately (skipping the fragment interval)
+  default: 1.5
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_fragment_dirs
+  type: bool
+  level: advanced
+  desc: enable directory fragmentation
+  long_desc: Directory fragmentation is a standard feature of CephFS that allows sharding
+    directories across multiple objects for performance and stability. Additionally,
+    this allows fragments to be distributed across multiple active MDSs to increase
+    throughput. Disabling (new) fragmentation should only be done in exceptional circumstances
+    and may lead to performance issues.
+  default: true
+  services:
+  - mds
+- name: mds_bal_idle_threshold
+  type: float
+  level: advanced
+  desc: idle metadata popularity threshold before rebalancing
+  fmt_desc: The minimum temperature before Ceph migrates a subtree
+    back to its parent.
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_bal_max
+  type: int
+  level: dev
+  default: -1
+  services:
+  - mds
+  fmt_desc: The number of iterations to run balancer before Ceph stops.
+    (used for testing purposes only)
+  with_legacy: true
+- name: mds_bal_max_until
+  type: int
+  level: dev
+  default: -1
+  services:
+  - mds
+  fmt_desc: The number of seconds to run balancer before Ceph stops.
+    (used for testing purposes only)
+  with_legacy: true
+- name: mds_bal_mode
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: |
+    The method for calculating MDS load.
+
+      - ``0`` = Hybrid.
+      - ``1`` = Request rate and latency.
+      - ``2`` = CPU load.
+  with_legacy: true
+# must be this much above average before we export anything
+- name: mds_bal_min_rebalance
+  type: float
+  level: dev
+  desc: amount overloaded over internal target before balancer begins offloading
+  fmt_desc: The minimum subtree temperature before Ceph migrates.
+  default: 0.1
+  services:
+  - mds
+  with_legacy: true
+# if we need less than this, we don't do anything
+- name: mds_bal_min_start
+  type: float
+  level: dev
+  default: 0.2
+  services:
+  - mds
+  fmt_desc: The minimum subtree temperature before Ceph searches a subtree.
+  with_legacy: true
+# take within this range of what we need
+- name: mds_bal_need_min
+  type: float
+  level: dev
+  default: 0.8
+  services:
+  - mds
+  fmt_desc: The minimum fraction of target subtree size to accept.
+  with_legacy: true
+- name: mds_bal_need_max
+  type: float
+  level: dev
+  default: 1.2
+  services:
+  - mds
+  fmt_desc: The maximum fraction of target subtree size to accept.
+  with_legacy: true
+# any sub bigger than this taken in full
+- name: mds_bal_midchunk
+  type: float
+  level: dev
+  default: 0.3
+  services:
+  - mds
+  fmt_desc: Ceph will migrate any subtree that is larger than this fraction
+    of the target subtree size.
+  with_legacy: true
+# never take anything smaller than this
+- name: mds_bal_minchunk
+  type: float
+  level: dev
+  default: 0.001
+  services:
+  - mds
+  fmt_desc: Ceph will ignore any subtree that is smaller than this fraction
+    of the target subtree size.
+  with_legacy: true
+# target decay half-life in MDSMap (2x larger is approx. 2x slower)
+- name: mds_bal_target_decay
+  type: float
+  level: advanced
+  desc: rate of decay for export targets communicated to clients
+  default: 10
+  services:
+  - mds
+  with_legacy: true
+- name: mds_oft_prefetch_dirfrags
+  type: bool
+  level: advanced
+  desc: prefetch dirfrags recorded in open file table on startup
+  default: false
+  services:
+  - mds
+  flags:
+  - startup
+# time to wait before starting replay again
+- name: mds_replay_interval
+  type: float
+  level: advanced
+  desc: time in seconds between replay of updates to journal by standby replay MDS
+  fmt_desc: The journal poll interval when in standby-replay mode.
+    ("hot standby")
+  default: 1
+  services:
+  - mds
+  with_legacy: true
+- name: mds_shutdown_check
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: The interval for polling the cache during MDS shutdown.
+  with_legacy: true
+- name: mds_thrash_exports
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: Ceph will randomly export subtrees between nodes (testing only).
+  with_legacy: true
+- name: mds_thrash_fragments
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: Ceph will randomly fragment or merge directories.
+  with_legacy: true
+- name: mds_dump_cache_on_map
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will dump the MDS cache contents to a file on each MDSMap.
+  with_legacy: true
+- name: mds_dump_cache_after_rejoin
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will dump MDS cache contents to a file after
+    rejoining the cache (during recovery).
+  with_legacy: true
+- name: mds_verify_scatter
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will assert that various scatter/gather invariants
+    are ``true`` (developers only).
+  with_legacy: true
+- name: mds_debug_scatterstat
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will assert that various recursive stat invariants
+    are ``true`` (for developers only).
+  with_legacy: true
+- name: mds_debug_frag
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will verify directory fragmentation invariants
+    when convenient (developers only).
+  with_legacy: true
+- name: mds_debug_auth_pins
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: The debug auth pin invariants (for developers only).
+  with_legacy: true
+- name: mds_debug_subtrees
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: The debug subtree invariants (for developers only).
+  with_legacy: true
+- name: mds_abort_on_newly_corrupt_dentry
+  type: bool
+  level: advanced
+  default: true
+  services:
+  - mds
+  fmt_desc: MDS will abort if dentry is detected newly corrupted.
+- name: mds_go_bad_corrupt_dentry
+  type: bool
+  level: advanced
+  default: true
+  services:
+  - mds
+  fmt_desc: MDS will mark a corrupt dentry as bad and isolate it.
+  flags:
+  - runtime
+- name: mds_inject_rename_corrupt_dentry_first
+  type: float
+  level: dev
+  default: 0.0
+  services:
+  - mds
+  fmt_desc: probabilistically inject corrupt CDentry::first at rename
+  flags:
+  - runtime
+- name: mds_inject_journal_corrupt_dentry_first
+  type: float
+  level: dev
+  default: 0.0
+  services:
+  - mds
+  fmt_desc: probabilistically inject corrupt CDentry::first at journal load
+  flags:
+  - runtime
+- name: mds_kill_mdstable_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: Ceph will inject MDS failure in MDSTable code
+    (for developers only).
+  with_legacy: true
+- name: mds_max_export_size
+  type: size
+  level: dev
+  default: 20_M
+  services:
+  - mds
+- name: mds_kill_export_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: Ceph will inject MDS failure in the subtree export code
+    (for developers only).
+  with_legacy: true
+- name: mds_kill_import_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: Ceph will inject MDS failure in the subtree import code
+    (for developers only).
+  with_legacy: true
+- name: mds_kill_link_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: Ceph will inject MDS failure in hard link code
+    (for developers only).
+  with_legacy: true
+- name: mds_kill_rename_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: Ceph will inject MDS failure in the rename code
+    (for developers only).
+  with_legacy: true
+- name: mds_kill_openc_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+# XXX
+- name: mds_kill_journal_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+- name: mds_kill_journal_expire_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_kill_journal_replay_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_journal_format
+  type: uint
+  level: dev
+  default: 1
+  services:
+  - mds
+  with_legacy: true
+- name: mds_kill_create_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_inject_health_dummy
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+- name: mds_kill_skip_replaying_inotable
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will skip replaying the inotable when replaying the journal, and
+    the primary MDS will crash, while the replacing MDS won't.
+    (for testing only).
+  with_legacy: true
+- name: mds_inject_skip_replaying_inotable
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will skip replaying the inotable when replaying the journal, and
+    the primary MDS will crash, while the replacing MDS won't.
+    (for testing only).
+  with_legacy: true
+#  percentage of MDS modify replies to skip sending the client a trace on [0-1]
+- name: mds_inject_traceless_reply_probability
+  type: float
+  level: dev
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_wipe_sessions
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will delete all client sessions on startup
+    (for testing only).
+  with_legacy: true
+- name: mds_wipe_ino_prealloc
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+  fmt_desc: Ceph will delete ino preallocation metadata on startup
+    (for testing only).
+  with_legacy: true
+- name: mds_skip_ino
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mds
+  fmt_desc: The number of inode numbers to skip on startup
+    (for testing only).
+  with_legacy: true
+- name: mds_enable_op_tracker
+  type: bool
+  level: advanced
+  desc: track remote operation progression and statistics
+  default: true
+  services:
+  - mds
+  with_legacy: true
+# Max number of completed ops to track
+- name: mds_op_history_size
+  type: uint
+  level: advanced
+  desc: maximum size for list of historical operations
+  default: 20
+  services:
+  - mds
+  with_legacy: true
+# Oldest completed op to track
+- name: mds_op_history_duration
+  type: uint
+  level: advanced
+  desc: expiration time in seconds of historical operations
+  default: 600
+  services:
+  - mds
+  with_legacy: true
+# how many seconds old makes an op complaint-worthy
+- name: mds_op_complaint_time
+  type: float
+  level: advanced
+  desc: time in seconds to consider an operation blocked after no updates
+  default: 30
+  services:
+  - mds
+  with_legacy: true
+# how many op log messages to show in one go
+- name: mds_op_log_threshold
+  type: int
+  level: dev
+  default: 5
+  services:
+  - mds
+  with_legacy: true
+- name: mds_snap_min_uid
+  type: uint
+  level: advanced
+  desc: minimum uid of client to perform snapshots
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_snap_max_uid
+  type: uint
+  level: advanced
+  desc: maximum uid of client to perform snapshots
+  default: 4294967294
+  services:
+  - mds
+  with_legacy: true
+- name: mds_snap_rstat
+  type: bool
+  level: advanced
+  desc: enable nested rstat for snapshots
+  default: false
+  services:
+  - mds
+  with_legacy: true
+- name: mds_verify_backtrace
+  type: uint
+  level: dev
+  default: 1
+  services:
+  - mds
+  with_legacy: true
+# detect clients which aren't trimming completed requests
+- name: mds_max_completed_flushes
+  type: uint
+  level: dev
+  default: 100000
+  services:
+  - mds
+  with_legacy: true
+- name: mds_max_completed_requests
+  type: uint
+  level: dev
+  default: 100000
+  services:
+  - mds
+  with_legacy: true
+- name: mds_action_on_write_error
+  type: uint
+  level: advanced
+  desc: action to take when MDS cannot write to RADOS (0:ignore, 1:read-only, 2:suicide)
+  default: 1
+  services:
+  - mds
+  with_legacy: true
+- name: mds_mon_shutdown_timeout
+  type: float
+  level: advanced
+  desc: time to wait for mon to receive damaged MDS rank notification
+  default: 5
+  services:
+  - mds
+  with_legacy: true
+# Maximum number of concurrent stray files to purge
+- name: mds_max_purge_files
+  type: uint
+  level: advanced
+  desc: maximum number of deleted files to purge in parallel
+  default: 64
+  services:
+  - mds
+  with_legacy: true
+# Maximum number of concurrent RADOS ops to issue in purging
+- name: mds_max_purge_ops
+  type: uint
+  level: advanced
+  desc: maximum number of purge operations performed in parallel
+  default: 8_K
+  services:
+  - mds
+  with_legacy: true
+# Maximum number of concurrent RADOS ops to issue in purging, scaled by PG count
+- name: mds_max_purge_ops_per_pg
+  type: float
+  level: advanced
+  desc: number of parallel purge operations performed per PG
+  default: 0.5
+  services:
+  - mds
+  with_legacy: true
+- name: mds_purge_queue_busy_flush_period
+  type: float
+  level: dev
+  default: 1
+  services:
+  - mds
+  with_legacy: true
+- name: mds_root_ino_uid
+  type: int
+  level: advanced
+  desc: default uid for new root directory
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_root_ino_gid
+  type: int
+  level: advanced
+  desc: default gid for new root directory
+  default: 0
+  services:
+  - mds
+  with_legacy: true
+- name: mds_max_scrub_ops_in_progress
+  type: int
+  level: advanced
+  desc: maximum number of scrub operations performed in parallel
+  default: 5
+  services:
+  - mds
+  with_legacy: true
+- name: mds_forward_all_requests_to_auth
+  type: bool
+  level: advanced
+  desc: always process op on auth mds
+  default: false
+  services:
+  - mds
+  flags:
+  - runtime
+# Maximum number of damaged frags/dentries before whole MDS rank goes damaged
+- name: mds_damage_table_max_entries
+  type: int
+  level: advanced
+  desc: maximum number of damage table entries
+  default: 10000
+  services:
+  - mds
+  with_legacy: true
+# Maximum increment for client writable range, counted by number of objects
+- name: mds_client_writeable_range_max_inc_objs
+  type: uint
+  level: advanced
+  desc: maximum number of objects in writeable range of a file for a client
+  default: 1_K
+  services:
+  - mds
+  with_legacy: true
+- name: mds_min_caps_per_client
+  type: uint
+  level: advanced
+  desc: minimum number of capabilities a client may hold
+  default: 100
+  services:
+  - mds
+- name: mds_min_caps_working_set
+  type: uint
+  level: advanced
+  desc: number of capabilities a client may hold without cache pressure warnings generated
+  default: 10000
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_max_caps_per_client
+  type: uint
+  level: advanced
+  desc: maximum number of capabilities a client may hold
+  default: 1_M
+  services:
+  - mds
+- name: mds_hack_allow_loading_invalid_metadata
+  type: bool
+  level: advanced
+  desc: INTENTIONALLY CAUSE DATA LOSS by bypassing checks for invalid metadata on disk.
+    Allows testing repair tools.
+  default: false
+  services:
+  - mds
+- name: mds_defer_session_stale
+  type: bool
+  level: dev
+  default: true
+  services:
+  - mds
+- name: mds_inject_migrator_session_race
+  type: bool
+  level: dev
+  default: false
+  services:
+  - mds
+- name: mds_request_load_average_decay_rate
+  type: float
+  level: advanced
+  desc: rate of decay in seconds for calculating request load average
+  default: 1_min
+  services:
+  - mds
+- name: mds_cap_revoke_eviction_timeout
+  type: float
+  level: advanced
+  desc: number of seconds after which clients which have not responded to cap revoke
+    messages by the MDS are evicted.
+  default: 0
+  services:
+  - mds
+- name: mds_dump_cache_threshold_formatter
+  type: size
+  level: dev
+  desc: threshold for cache usage to disallow "dump cache" operation to formatter
+  long_desc: Disallow MDS from dumping caches to formatter via "dump cache" command
+    if cache usage exceeds this threshold.
+  default: 1_G
+  services:
+  - mds
+- name: mds_dump_cache_threshold_file
+  type: size
+  level: dev
+  desc: threshold for cache usage to disallow "dump cache" operation to file
+  long_desc: Disallow MDS from dumping caches to file via "dump cache" command if
+    cache usage exceeds this threshold.
+  default: 0
+  services:
+  - mds
+- name: mds_task_status_update_interval
+  type: float
+  level: dev
+  desc: task status update interval to manager
+  long_desc: interval (in seconds) for sending mds task status to ceph manager
+  default: 2
+  services:
+  - mds
+- name: mds_max_snaps_per_dir
+  type: uint
+  level: advanced
+  desc: max snapshots per directory
+  long_desc: maximum number of snapshots that can be created per directory
+  default: 100
+  services:
+  - mds
+  min: 0
+  max: 4_K
+  flags:
+  - runtime
+- name: mds_asio_thread_count
+  type: uint
+  level: advanced
+  desc: Size of thread pool for ASIO completions
+  default: 2
+  tags:
+  - mds
+  services:
+  - mds
+  min: 1
+- name: mds_ping_grace
+  type: secs
+  level: advanced
+  desc: timeout after which an MDS is considered laggy by rank 0 MDS.
+  long_desc: timeout for replying to a ping message sent by rank 0 after which an
+    active MDS is considered laggy (delayed metrics) by rank 0.
+  default: 15
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_ping_interval
+  type: secs
+  level: advanced
+  desc: interval in seconds for sending ping messages to active MDSs.
+  long_desc: interval in seconds for rank 0 to send ping messages to all active MDSs.
+  default: 5
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_metrics_update_interval
+  type: secs
+  level: advanced
+  desc: interval in seconds for metrics data update.
+  long_desc: interval in seconds after which active MDSs send client metrics data
+    to rank 0.
+  default: 2
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_dir_max_entries
+  type: uint
+  level: advanced
+  desc: maximum number of entries per directory before new creat/links fail
+  long_desc: The maximum number of entries before any new entries
+    are rejected with ENOSPC.
+  default: 0
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_sleep_rank_change
+  type: float
+  level: dev
+  default: 0.0
+  flags:
+  - runtime
+- name: mds_connect_bootstrapping
+  type: bool
+  level: dev
+  default: false
+  flags:
+  - runtime
+- name: mds_symlink_recovery
+  type: bool
+  level: advanced
+  desc: Stores symlink target on the first data object of symlink file.
+    Allows recovery of symlinks using recovery tools.
+  default: true
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_extraordinary_events_dump_interval
+  type: secs
+  level: advanced
+  desc: Interval in seconds for dumping the recent in-memory logs when there is an extraordinary event.
+  long_desc: Interval in seconds for dumping the recent in-memory logs when there is an extraordinary
+    event. The default is ``0`` (disabled). The log level should be ``< 10`` and the gather level
+    should be ``>=10`` in debug_mds for enabling this option.
+  default: 0
+  min: 0
+  max: 60
+  services:
+  - mds
+  flags:
+  - runtime
+- name: mds_session_metadata_threshold
+  type: size
+  level: advanced
+  desc: Evict non-advancing client-tid sessions exceeding the config size.
+  long_desc: Evict clients which are not advancing their request tids, which causes a large buildup of session metadata (`completed_requests`) in the MDS, causing the MDS to go read-only since the RADOS operation exceeds the size threshold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow.
+  default: 16_M
+  services:
+  - mds
+  flags:
+  - runtime
diff --git a/src/common/options/mgr.yaml.in b/src/common/options/mgr.yaml.in
new file mode 100644
index 000000000..7d7b68035
--- /dev/null
+++ b/src/common/options/mgr.yaml.in
@@ -0,0 +1,362 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: mgr_data
+  type: str
+  level: advanced
+  desc: Filesystem path to the ceph-mgr data directory, used to contain keyring.
+  fmt_desc: Path to load daemon data (such as keyring)
+  default: /var/lib/ceph/mgr/$cluster-$id
+  services:
+  - mgr
+  flags:
+  - no_mon_update
+- name: mgr_pool
+  type: bool
+  level: dev
+  desc: Allow use/creation of .mgr pool.
+  default: true
+  services:
+  - mgr
+  flags:
+  - startup
+- name: mgr_stats_period
+  type: int
+  level: basic
+  desc: Period in seconds of OSD/MDS stats reports to manager
+  long_desc: Use this setting to control the granularity of time series data collection
+    from daemons.  Adjust upwards if the manager CPU load is too high, or if you simply
+    do not require the most up to date performance counter data.
+  default: 5
+  services:
+  - mgr
+  - common
+- name: mgr_client_bytes
+  type: size
+  level: dev
+  default: 128_M
+  services:
+  - mgr
+- name: mgr_client_messages
+  type: uint
+  level: dev
+  default: 512
+  services:
+  - mgr
+- name: mgr_osd_bytes
+  type: size
+  level: dev
+  default: 512_M
+  services:
+  - mgr
+- name: mgr_osd_messages
+  type: uint
+  level: dev
+  default: 8_K
+  services:
+  - mgr
+- name: mgr_mds_bytes
+  type: size
+  level: dev
+  default: 128_M
+  services:
+  - mgr
+- name: mgr_mds_messages
+  type: uint
+  level: dev
+  default: 128
+  services:
+  - mgr
+- name: mgr_mon_bytes
+  type: size
+  level: dev
+  default: 128_M
+  services:
+  - mgr
+- name: mgr_mon_messages
+  type: uint
+  level: dev
+  default: 128
+  services:
+  - mgr
+- name: mgr_service_beacon_grace
+  type: float
+  level: advanced
+  desc: Period in seconds from last beacon to manager dropping state about a monitored
+    service (RGW, rbd-mirror etc)
+  default: 1_min
+  services:
+  - mgr
+- name: mgr_debug_aggressive_pg_num_changes
+  type: bool
+  level: dev
+  desc: Bypass most throttling and safety checks in pg[p]_num controller
+  default: false
+  services:
+  - mgr
+- name: mgr_max_pg_num_change
+  type: int
+  level: advanced
+  desc: maximum change in pg_num
+  default: 128
+  services:
+  - mgr
+  with_legacy: true
+- name: mgr_module_path
+  type: str
+  level: advanced
+  desc: Filesystem path to manager modules.
+  fmt_desc: Path to load modules from
+  default: @CEPH_INSTALL_DATADIR@/mgr
+  services:
+  - mgr
+- name: mgr_standby_modules
+  type: bool
+  default: true
+  level: advanced
+  desc: Start modules in standby (redirect) mode when mgr is standby
+  long_desc: By default, the standby modules will answer incoming requests with a
+    HTTP redirect to the active manager, allowing users to point their browser at any
+    mgr node and find their way to an active mgr.  However, this mode is problematic
+    when using a load balancer because (1) the redirect locations are usually private
+    IPs and (2) the load balancer can't identify which mgr is the right one to send
+    traffic to. If a load balancer is being used, set this to false.
+- name: mgr_disabled_modules
+  type: str
+  level: advanced
+  desc: List of manager modules that are never loaded
+  long_desc: A comma delimited list of module names. This list is read by manager
+    when it starts. By default, manager loads all modules found in specified 'mgr_module_path',
+    and it starts the enabled ones as instructed. The modules in this list will not
+    be loaded at all.
+  default: @mgr_disabled_modules@
+  services:
+  - mgr
+  see_also:
+  - mgr_module_path
+  flags:
+  - startup
+- name: mgr_initial_modules
+  type: str
+  level: basic
+  desc: List of manager modules to enable when the cluster is first started
+  long_desc: This list of module names is read by the monitor when the cluster is
+    first started after installation, to populate the list of enabled manager modules.  Subsequent
+    updates are done using the 'mgr module [enable|disable]' commands.  List may be
+    comma or space separated.
+  default: restful iostat nfs
+  services:
+  - mon
+  - common
+  flags:
+  - no_mon_update
+  - cluster_create
+- name: cephadm_path
+  type: str
+  level: advanced
+  desc: Path to cephadm utility
+  default: /usr/sbin/cephadm
+  services:
+  - mgr
+- name: mon_delta_reset_interval
+  type: float
+  level: advanced
+  desc: window duration for rate calculations in 'ceph status'
+  fmt_desc: Seconds of inactivity before we reset the PG delta to 0. We keep
+    track of the delta of the used space of each pool, so, for
+    example, it would be easier for us to understand the progress of
+    recovery or the performance of cache tier. But if there's no
+    activity reported for a certain pool, we just reset the history of
+    deltas of that pool.
+  default: 10
+  services:
+  - mgr
+  with_legacy: true
+- name: mon_stat_smooth_intervals
+  type: uint
+  level: advanced
+  desc: number of PGMaps stats over which we calc the average read/write throughput
+    of the whole cluster
+  fmt_desc: Ceph will smooth statistics over the last ``N`` PG maps.
+  default: 6
+  services:
+  - mgr
+  min: 1
+- name: mon_pool_quota_warn_threshold
+  type: int
+  level: advanced
+  desc: percent of quota at which to issue warnings
+  default: 0
+  services:
+  - mgr
+- name: mon_pool_quota_crit_threshold
+  type: int
+  level: advanced
+  desc: percent of quota at which to issue errors
+  default: 0
+  services:
+  - mgr
+- name: mon_cache_target_full_warn_ratio
+  type: float
+  level: advanced
+  desc: issue CACHE_POOL_NEAR_FULL health warning when cache pool utilization exceeds
+    this ratio of usable space
+  fmt_desc: Position between pool's ``cache_target_full`` and ``target_max_object``
+    where we start warning
+  default: 0.66
+  services:
+  - mgr
+  flags:
+  - no_mon_update
+  - cluster_create
+  with_legacy: true
+- name: mon_pg_check_down_all_threshold
+  type: float
+  level: advanced
+  desc: threshold of down osds after which we check all pgs
+  fmt_desc: Percentage threshold of ``down`` OSDs above which we check all PGs
+    for stale ones.
+  default: 0.5
+  services:
+  - mgr
+  with_legacy: true
+- name: mon_pg_stuck_threshold
+  type: int
+  level: advanced
+  desc: number of seconds after which pgs can be considered stuck inactive, unclean,
+    etc
+  long_desc: see doc/control.rst under dump_stuck for more info
+  fmt_desc: Number of seconds after which PGs can be considered as
+    being stuck.
+  default: 1_min
+  services:
+  - mgr
+- name: mon_pg_warn_min_per_osd
+  type: uint
+  level: advanced
+  desc: minimal number of PGs per (in) osd before we warn the admin
+  fmt_desc: Raise ``HEALTH_WARN`` if the average number
+    of PGs per ``in`` OSD is under this number. A non-positive number
+    disables this.
+  default: 0
+  services:
+  - mgr
+- name: mon_pg_warn_max_object_skew
+  type: float
+  level: advanced
+  desc: max skew from average in objects per pg
+  fmt_desc: Raise ``HEALTH_WARN`` if the average RADOS object count per PG
+    of any pool is greater than ``mon_pg_warn_max_object_skew`` times
+    the average RADOS object count per PG of all pools. Zero or a non-positive
+    number disables this. Note that this option applies to ``ceph-mgr`` daemons.
+  default: 10
+  services:
+  - mgr
+- name: mon_pg_warn_min_objects
+  type: int
+  level: advanced
+  desc: 'do not warn below this object #'
+  fmt_desc: Do not warn if the total number of RADOS objects in cluster is below
+    this number
+  default: 10000
+  services:
+  - mgr
+- name: mon_pg_warn_min_pool_objects
+  type: int
+  level: advanced
+  desc: 'do not warn on pools below this object #'
+  fmt_desc: Do not warn on pools whose RADOS object count is below this number
+  default: 1000
+  services:
+  - mgr
+- name: mon_warn_on_misplaced
+  type: bool
+  level: advanced
+  desc: Issue a health warning if there are misplaced objects
+  default: false
+  services:
+  - mgr
+  with_legacy: true
+- name: mon_warn_on_pool_no_app
+  type: bool
+  level: dev
+  desc: issue POOL_APP_NOT_ENABLED health warning if pool has no application enabled
+  default: true
+  services:
+  - mgr
+- name: mon_warn_on_too_few_osds
+  type: bool
+  level: advanced
+  desc: Issue a health warning if there are fewer OSDs than osd_pool_default_size
+  default: true
+  services:
+  - mgr
+- name: mon_target_pg_per_osd
+  type: uint
+  level: advanced
+  desc: Automated PG management creates this many PGs per OSD
+  long_desc: When creating pools, the automated PG management logic will attempt to
+    reach this target.  In some circumstances, it may exceed this target, up to the
+    ``mon_max_pg_per_osd`` limit. Conversely, a lower number of PGs per OSD may be
+    created if the cluster is not yet fully utilised
+  default: 100
+  min: 1
+# min pgs per osd for reweight-by-pg command
+- name: mon_reweight_min_pgs_per_osd
+  type: uint
+  level: advanced
+  default: 10
+  services:
+  - mgr
+  with_legacy: true
+# min bytes per osd for reweight-by-utilization command
+- name: mon_reweight_min_bytes_per_osd
+  type: size
+  level: advanced
+  default: 100_M
+  services:
+  - mgr
+  with_legacy: true
+# max osds to change per reweight-by-* command
+- name: mon_reweight_max_osds
+  type: int
+  level: advanced
+  default: 4
+  services:
+  - mgr
+  with_legacy: true
+- name: mon_reweight_max_change
+  type: float
+  level: advanced
+  default: 0.05
+  services:
+  - mgr
+  with_legacy: true
+- name: mgr_stats_threshold
+  type: int
+  level: advanced
+  desc: Lowest perfcounter priority collected by mgr
+  long_desc: Daemons only send perf counter data to the manager daemon if the counter
+    has a priority higher than this.
+  default: 5
+  min: 0
+  max: 11
+- name: mgr_tick_period
+  type: secs
+  level: advanced
+  desc: Period in seconds of beacon messages to monitor
+  fmt_desc: How many seconds between mgr beacons to monitors, and other
+    periodic checks.
+  default: 2
+  services:
+  - mgr
+  - mon
+- name: mon_osd_err_op_age_ratio
+  type: float
+  level: advanced
+  desc: issue REQUEST_STUCK health error if OSD ops are slower than this age (seconds)
+  default: 128
+  services:
+  - mgr
+  with_legacy: true
diff --git a/src/common/options/mon.yaml.in b/src/common/options/mon.yaml.in
new file mode 100644
index 000000000..1cd655ad4
--- /dev/null
+++ b/src/common/options/mon.yaml.in
@@ -0,0 +1,1340 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: osd_crush_update_weight_set
+  type: bool
+  level: advanced
+  desc: update CRUSH weight-set weights when updating weights
+  long_desc: If this setting is true, we will update the weight-set weights when adjusting
+    an item's weight, effectively making changes take effect immediately, and discarding
+    any previous optimization in the weight-set value.  Setting this value to false
+    will leave it to the balancer to (slowly, presumably) adjust weights to approach
+    the new target value.
+  default: true
+  with_legacy: true
+- name: osd_pool_erasure_code_stripe_unit
+  type: size
+  level: advanced
+  desc: the amount of data (in bytes) in a data chunk, per stripe
+  fmt_desc: Sets the default size, in bytes, of a chunk of an object
+    stripe for erasure coded pools. Every object of size S
+    will be stored as N stripes, with each data chunk
+    receiving ``stripe unit`` bytes. Each stripe of ``N *
+    stripe unit`` bytes will be encoded/decoded
+    individually. This option is overridden by the
+    ``stripe_unit`` setting in an erasure code profile.
+  default: 4_K
+  services:
+  - mon
+- name: osd_pool_default_crimson
+  type: bool
+  level: advanced
+  desc: Create pools by default with FLAG_CRIMSON
+  default: false
+  services :
+  - mon
+  flags:
+  - runtime
+- name: mon_max_pool_pg_num
+  type: uint
+  level: advanced
+  default: 64_K
+  fmt_desc: The maximum number of placement groups per pool.
+- name: mon_mgr_digest_period
+  type: int
+  level: dev
+  desc: Period in seconds between monitor-to-manager health/status updates
+  default: 5
+  services:
+  - mon
+- name: mon_down_mkfs_grace
+  type: secs
+  level: advanced
+  desc: Period in seconds that the cluster may have a mon down after cluster creation
+  default: 1_min
+  services:
+  - mon
+- name: mon_mgr_beacon_grace
+  type: secs
+  level: advanced
+  desc: Period in seconds from last beacon to monitor marking a manager daemon as
+    failed
+  default: 30
+  services:
+  - mon
+- name: mon_mgr_inactive_grace
+  type: int
+  level: advanced
+  desc: Period in seconds after cluster creation during which cluster may have no
+    active manager
+  long_desc: This grace period enables the cluster to come up cleanly without raising
+    spurious health check failures about managers that aren't online yet
+  default: 1_min
+  services:
+  - mon
+- name: mon_mgr_mkfs_grace
+  type: int
+  level: advanced
+  desc: Period in seconds that the cluster may have no active manager before this
+    is reported as an ERR rather than a WARN
+  default: 2_min
+  services:
+  - mon
+- name: mon_mgr_proxy_client_bytes_ratio
+  type: float
+  level: dev
+  desc: ratio of mon_client_bytes that can be consumed by proxied mgr commands before
+    we error out to client
+  default: 0.3
+  services:
+  - mon
+- name: mon_cluster_log_to_stderr
+  type: bool
+  level: advanced
+  desc: Make monitor send cluster log messages to stderr (prefixed by channel)
+  default: false
+  services:
+  - mon
+  see_also:
+  - log_stderr_prefix
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_to_syslog
+  type: str
+  level: advanced
+  desc: Make monitor send cluster log messages to syslog
+  fmt_desc: Determines if the cluster log should be output to the syslog.
+  default: default=false
+  services:
+  - mon
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_to_syslog_level
+  type: str
+  level: advanced
+  desc: Syslog level for cluster log messages
+  default: info
+  services:
+  - mon
+  see_also:
+  - mon_cluster_log_to_syslog
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_to_syslog_facility
+  type: str
+  level: advanced
+  desc: Syslog facility for cluster log messages
+  default: daemon
+  services:
+  - mon
+  see_also:
+  - mon_cluster_log_to_syslog
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_to_file
+  type: bool
+  level: advanced
+  desc: Make monitor send cluster log messages to file
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_cluster_log_file
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_file
+  type: str
+  level: advanced
+  desc: File(s) to write cluster log to
+  long_desc: This can either be a simple file name to receive all messages, or a list
+    of key/value pairs where the key is the log channel and the value is the filename,
+    which may include $cluster and $channel metavariables
+  fmt_desc: |
+    The locations of the cluster's log files. There are two channels in
+    Ceph: ``cluster`` and ``audit``. This option represents a mapping
+    from channels to log files, where the log entries of that
+    channel are sent to. The ``default`` entry is a fallback
+    mapping for channels not explicitly specified. So, the following
+    default setting will send cluster log to ``$cluster.log``, and
+    send audit log to ``$cluster.audit.log``, where ``$cluster`` will
+    be replaced with the actual cluster name.
+  default: default=/var/log/ceph/$cluster.$channel.log cluster=/var/log/ceph/$cluster.log
+  services:
+  - mon
+  see_also:
+  - mon_cluster_log_to_file
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_file_level
+  type: str
+  level: advanced
+  desc: Lowest level to include in cluster log file
+  default: debug
+  services:
+  - mon
+  see_also:
+  - mon_cluster_log_file
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_to_graylog
+  type: str
+  level: advanced
+  desc: Make monitor send cluster log to graylog
+  default: 'false'
+  services:
+  - mon
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_to_graylog_host
+  type: str
+  level: advanced
+  desc: Graylog host for cluster log messages
+  default: 127.0.0.1
+  services:
+  - mon
+  see_also:
+  - mon_cluster_log_to_graylog
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_to_graylog_port
+  type: str
+  level: advanced
+  desc: Graylog port for cluster log messages
+  default: '12201'
+  services:
+  - mon
+  see_also:
+  - mon_cluster_log_to_graylog
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cluster_log_to_journald
+  type: str
+  level: advanced
+  desc: Make monitor send cluster log to journald
+  default: 'false'
+  services:
+  - mon
+  flags:
+  - runtime
+- name: mon_log_max
+  type: uint
+  level: advanced
+  desc: number of recent cluster log messages to retain
+  default: 10000
+  services:
+  - mon
+  with_legacy: true
+- name: mon_log_max_summary
+  type: uint
+  level: advanced
+  desc: number of recent cluster log messages to dedup against
+  default: 50
+  services:
+  - mon
+  with_legacy: true
+- name: mon_log_full_interval
+  type: uint
+  level: advanced
+  desc: how many epochs before we encode a full copy of recent log keys
+  default: 50
+  services: [mon]
+  with_legacy: true
+- name: mon_max_log_entries_per_event
+  type: int
+  level: advanced
+  desc: max cluster log entries per paxos event
+  fmt_desc: The maximum number of log entries per event.
+  default: 4096
+  services:
+  - mon
+  with_legacy: true
+- name: mon_health_to_clog
+  type: bool
+  level: advanced
+  desc: log monitor health to cluster log
+  fmt_desc: Enable sending a health summary to the cluster log periodically.
+  default: true
+  services:
+  - mon
+  with_legacy: true
+- name: mon_health_to_clog_interval
+  type: int
+  level: advanced
+  desc: frequency to log monitor health to cluster log
+  fmt_desc: How often (in seconds) the monitor sends a health summary to the cluster
+    log (a non-positive number disables). Monitors will always
+    send a summary to the cluster log whether or not it differs from
+    the previous summary.
+  default: 10_min
+  services:
+  - mon
+  see_also:
+  - mon_health_to_clog
+  with_legacy: true
+- name: mon_health_to_clog_tick_interval
+  type: float
+  level: dev
+  fmt_desc: How often (in seconds) the monitor sends a health summary to the cluster
+    log (a non-positive number disables). If current health summary
+    is empty or identical to the last time, monitor will not send it
+    to cluster log.
+  default: 1_min
+  services:
+  - mon
+  with_legacy: true
+- name: mon_health_detail_to_clog
+  type: bool
+  level: dev
+  desc: log health detail to cluster log
+  default: true
+  with_legacy: true
+- name: mon_warn_on_filestore_osds
+  type: bool
+  level: dev
+  desc: log health warn for filestore OSDs
+  default: true
+  with_legacy: true
+- name: mon_health_max_detail
+  type: uint
+  level: advanced
+  desc: max detailed pgs to report in health detail
+  default: 50
+  services:
+  - mon
+- name: mon_health_log_update_period
+  type: int
+  level: dev
+  desc: minimum time in seconds between log messages about each health check
+  default: 5
+  services:
+  - mon
+  min: 0
+- name: mon_data_avail_crit
+  type: int
+  level: advanced
+  desc: issue MON_DISK_CRIT health error when mon available space below this percentage
+  fmt_desc: Raise ``HEALTH_ERR`` status when the filesystem that houses a
+    monitor's data store reports that its available capacity is
+    less than or equal to this percentage.
+  default: 5
+  services:
+  - mon
+  with_legacy: true
+- name: mon_data_avail_warn
+  type: int
+  level: advanced
+  desc: issue MON_DISK_LOW health warning when mon available space below this percentage
+  fmt_desc: Raise ``HEALTH_WARN`` status when the filesystem that houses a
+    monitor's data store reports that its available capacity is
+    less than or equal to this percentage.
+  default: 30
+  services:
+  - mon
+  with_legacy: true
+- name: mon_data_size_warn
+  type: size
+  level: advanced
+  desc: issue MON_DISK_BIG health warning when mon database is above this size
+  fmt_desc: Raise ``HEALTH_WARN`` status when a monitor's data
+    store grows to be larger than this size, 15GB by default.
+  default: 15_G
+  services:
+  - mon
+  with_legacy: true
+- name: mon_daemon_bytes
+  type: size
+  level: advanced
+  desc: max bytes of outstanding mon messages mon will read off the network
+  fmt_desc: The message memory cap for metadata server and OSD messages (in bytes).
+  default: 400_M
+  services:
+  - mon
+  with_legacy: true
+- name: mon_election_timeout
+  type: float
+  level: advanced
+  desc: maximum time for a mon election (seconds)
+  fmt_desc: On election proposer, maximum waiting time for all ACKs in seconds.
+  default: 5
+  services:
+  - mon
+  with_legacy: true
+- name: mon_election_default_strategy
+  type: uint
+  level: advanced
+  desc: The election strategy to set when constructing the first monmap.
+  default: 1
+  min: 1
+  max: 3
+- name: mon_lease
+  type: float
+  level: advanced
+  desc: lease interval between quorum monitors (seconds)
+  long_desc: This setting controls how sensitive your mon quorum is to intermittent
+    network issues or other failures.
+  fmt_desc: The length (in seconds) of the lease on the monitor's versions.
+  default: 5
+  services:
+  - mon
+  with_legacy: true
+- name: mon_lease_renew_interval_factor
+  type: float
+  level: advanced
+  desc: multiple of mon_lease for the lease renewal interval
+  long_desc: Leases must be renewed before they time out.  A smaller value means frequent
+    renewals, while a value close to 1 makes a lease expiration more likely.
+  fmt_desc: |
+    ``mon_lease`` \* ``mon_lease_renew_interval_factor`` will be the
+    interval for the Leader to renew the other monitor's leases. The
+    factor should be less than ``1.0``.
+  default: 0.6
+  services:
+  - mon
+  see_also:
+  - mon_lease
+  min: 0
+  max: 0.9999999
+  with_legacy: true
+- name: mon_lease_ack_timeout_factor
+  type: float
+  level: advanced
+  desc: multiple of mon_lease for the lease ack interval before calling new election
+  fmt_desc: The Leader will wait ``mon_lease`` \* ``mon_lease_ack_timeout_factor``
+    for the Providers to acknowledge the lease extension.
+  default: 2
+  services:
+  - mon
+  see_also:
+  - mon_lease
+  min: 1.0001
+  max: 100
+  with_legacy: true
+- name: mon_accept_timeout_factor
+  type: float
+  level: advanced
+  desc: multiple of mon_lease for follower mons to accept proposed state changes before
+    calling a new election
+  fmt_desc: The Leader will wait ``mon_lease`` \* ``mon_accept_timeout_factor``
+    for the Requester(s) to accept a Paxos update. It is also used
+    during the Paxos recovery phase for similar purposes.
+  default: 2
+  services:
+  - mon
+  see_also:
+  - mon_lease
+  with_legacy: true
+- name: mon_elector_ping_timeout
+  type: float
+  level: advanced
+  desc: The time after which a ping 'times out' and a connection is considered down
+  default: 2
+  services:
+  - mon
+  see_also:
+  - mon_elector_ping_divisor
+- name: mon_elector_ping_divisor
+  type: uint
+  level: advanced
+  desc: We will send a ping up to this many times per timeout period
+  default: 2
+  services:
+  - mon
+  see_also:
+  - mon_elector_ping_timeout
+- name: mon_con_tracker_persist_interval
+  type: uint
+  level: advanced
+  desc: how many updates the ConnectionTracker takes before it persists to disk
+  default: 10
+  services:
+  - mon
+  min: 1
+  max: 100000
+- name: mon_con_tracker_score_halflife
+  type: uint
+  level: advanced
+  desc: The 'halflife' used when updating/calculating peer connection scores
+  default: 43200
+  services:
+  - mon
+  min: 60
+- name: mon_elector_ignore_propose_margin
+  type: float
+  level: advanced
+  desc: The difference in connection score allowed before a peon stops ignoring out-of-quorum
+    PROPOSEs
+  default: 0.0005
+  services:
+  - mon
+- name: mon_warn_on_cache_pools_without_hit_sets
+  type: bool
+  level: advanced
+  desc: issue CACHE_POOL_NO_HIT_SET health warning for cache pools that do not have
+    hit sets configured
+  fmt_desc: Raise ``HEALTH_WARN`` when a cache pool does not have the ``hit_set_type``
+    value configured. See :ref:`hit_set_type <hit_set_type>` for more details.
+  default: true
+  services:
+  - mon
+  with_legacy: true
+- name: mon_warn_on_pool_pg_num_not_power_of_two
+  type: bool
+  level: dev
+  desc: issue POOL_PG_NUM_NOT_POWER_OF_TWO warning if pool has a non-power-of-two
+    pg_num value
+  default: true
+  services:
+  - mon
+- name: mon_allow_pool_size_one
+  type: bool
+  level: advanced
+  desc: allow configuring pool with no replicas
+  default: false
+  services:
+  - mon
+- name: mon_warn_on_crush_straw_calc_version_zero
+  type: bool
+  level: advanced
+  desc: issue OLD_CRUSH_STRAW_CALC_VERSION health warning if the CRUSH map's straw_calc_version
+    is zero
+  fmt_desc: Raise ``HEALTH_WARN`` when the CRUSH ``straw_calc_version`` is zero. See
+    :ref:`CRUSH map tunables <crush-map-tunables>` for details.
+  default: true
+  services:
+  - mon
+  with_legacy: true
+- name: mon_warn_on_pool_no_redundancy
+  type: bool
+  level: advanced
+  desc: Issue a health warning if any pool is configured with no replicas
+  fmt_desc: Raise ``HEALTH_WARN`` if any pool is configured with no replicas.
+  default: true
+  services:
+  - mon
+  see_also:
+  - osd_pool_default_size
+  - osd_pool_default_min_size
+- name: mon_warn_on_osd_down_out_interval_zero
+  type: bool
+  level: advanced
+  desc: issue OSD_NO_DOWN_OUT_INTERVAL health warning if mon_osd_down_out_interval
+    is zero
+  long_desc: Having mon_osd_down_out_interval set to 0 means that down OSDs are not
+    marked out automatically and the cluster does not heal itself without administrator
+    intervention.
+  fmt_desc: Raise ``HEALTH_WARN`` when ``mon_osd_down_out_interval`` is zero. Having this
+    option set to zero on the leader acts much like the ``noout`` flag. It's hard to figure
+    out what's going wrong with clusters without the ``noout`` flag set but acting like that
+    just the same, so we report a warning in this case.
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_osd_down_out_interval
+  with_legacy: true
+- name: mon_warn_on_legacy_crush_tunables
+  type: bool
+  level: advanced
+  desc: issue OLD_CRUSH_TUNABLES health warning if CRUSH tunables are older than mon_crush_min_required_version
+  fmt_desc: Raise ``HEALTH_WARN`` when CRUSH tunables are too old (older than ``mon_crush_min_required_version``)
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_crush_min_required_version
+  with_legacy: true
+- name: mon_crush_min_required_version
+  type: str
+  level: advanced
+  desc: minimum ceph release to use for mon_warn_on_legacy_crush_tunables
+  fmt_desc: The minimum tunable profile required by the cluster. See
+    :ref:`CRUSH map tunables <crush-map-tunables>` for details.
+  default: hammer
+  services:
+  - mon
+  see_also:
+  - mon_warn_on_legacy_crush_tunables
+  with_legacy: true
+- name: mon_warn_on_degraded_stretch_mode
+  type: bool
+  level: advanced
+  desc: Issue a health warning if we are in degraded stretch mode
+  default: true
+  services:
+  - mon
+- name: mon_stretch_cluster_recovery_ratio
+  type: float
+  level: advanced
+  desc: the ratio of up OSDs at which a degraded stretch cluster enters recovery
+  default: 0.6
+  services:
+  - mon
+  min: 0.51
+  max: 1
+- name: mon_stretch_recovery_min_wait
+  type: float
+  level: advanced
+  desc: how long the monitors wait before considering fully-healthy PGs as evidence
+    the stretch mode is repaired
+  default: 15
+  services:
+  - mon
+  min: 1
+- name: mon_stretch_pool_size
+  type: uint
+  level: dev
+  default: 4
+  services:
+  - mon
+  min: 3
+  max: 6
+- name: mon_stretch_pool_min_size
+  type: uint
+  level: dev
+  default: 2
+  services:
+  - mon
+  min: 2
+  max: 4
+- name: mon_clock_drift_allowed
+  type: float
+  level: advanced
+  desc: allowed clock drift (in seconds) between mons before issuing a health warning
+  default: 0.05
+  services:
+  - mon
+  with_legacy: true
+# exponential backoff for clock drift warnings
+- name: mon_clock_drift_warn_backoff
+  type: float
+  level: advanced
+  desc: exponential backoff factor for logging clock drift warnings in the cluster
+    log
+  default: 5
+  services:
+  - mon
+  with_legacy: true
+# on leader, timecheck (clock drift check) interval (seconds)
+- name: mon_timecheck_interval
+  type: float
+  level: advanced
+  desc: frequency of clock synchronization checks between monitors (seconds)
+  fmt_desc: The time check interval (clock drift check) in seconds
+    for the Leader.
+  default: 5_min
+  services:
+  - mon
+  with_legacy: true
+# on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
+- name: mon_timecheck_skew_interval
+  type: float
+  level: advanced
+  desc: frequency of clock synchronization (re)checks between monitors while clocks
+    are believed to be skewed (seconds)
+  fmt_desc: The time check interval (clock drift check) in seconds when in
+    presence of a skew in seconds for the Leader.
+  default: 30
+  services:
+  - mon
+  see_also:
+  - mon_timecheck_interval
+  with_legacy: true
+# how often (in commits) to stash a full copy of the PaxosService state
+- name: paxos_stash_full_interval
+  type: int
+  level: advanced
+  default: 25
+  services:
+  - mon
+  fmt_desc: How often (in commits) to stash a full copy of the PaxosService state.
+    Currently this setting only affects ``mds``, ``mon``, ``auth`` and ``mgr``
+    PaxosServices.
+  with_legacy: true
+# max paxos iterations before we must first sync the monitor stores
+- name: paxos_max_join_drift
+  type: int
+  level: advanced
+  default: 10
+  services:
+  - mon
+  fmt_desc: The maximum Paxos iterations before we must first sync the
+    monitor data stores. When a monitor finds that its peer is too
+    far ahead of it, it will first sync with data stores before moving
+    on.
+  with_legacy: true
+# gather updates for this long before proposing a map update
+- name: paxos_propose_interval
+  type: float
+  level: advanced
+  default: 1
+  services:
+  - mon
+  fmt_desc: Gather updates for this time interval before proposing
+    a map update.
+  with_legacy: true
+# min time to gather updates for after period of inactivity
+- name: paxos_min_wait
+  type: float
+  level: advanced
+  default: 0.05
+  services:
+  - mon
+  fmt_desc: The minimum amount of time to gather updates after a period of
+    inactivity.
+  with_legacy: true
+# minimum number of paxos states to keep around
+- name: paxos_min
+  type: int
+  level: advanced
+  default: 500
+  services:
+  - mon
+  fmt_desc: The minimum number of Paxos states to keep around
+  with_legacy: true
+# number of extra proposals tolerated before trimming
+- name: paxos_trim_min
+  type: int
+  level: advanced
+  default: 250
+  services:
+  - mon
+  fmt_desc: Number of extra proposals tolerated before trimming
+  with_legacy: true
+# maximum amount of versions to trim during a single proposal (0 disables it)
+- name: paxos_trim_max
+  type: int
+  level: advanced
+  default: 500
+  services:
+  - mon
+  fmt_desc: The maximum number of extra proposals to trim at a time
+  with_legacy: true
+# minimum amount of versions to trigger a trim (0 disables it)
+- name: paxos_service_trim_min
+  type: uint
+  level: advanced
+  default: 250
+  services:
+  - mon
+  fmt_desc: The minimum amount of versions to trigger a trim (0 disables it)
+  with_legacy: true
+# maximum amount of versions to trim during a single proposal (0 disables it)
+- name: paxos_service_trim_max
+  type: uint
+  level: advanced
+  default: 500
+  services:
+  - mon
+  fmt_desc: The maximum amount of versions to trim during a single proposal (0 disables it)
+  with_legacy: true
+- name: paxos_service_trim_max_multiplier
+  type: uint
+  level: advanced
+  desc: factor by which paxos_service_trim_max will be multiplied to get a new upper
+    bound when trim sizes are high  (0 disables it)
+  default: 20
+  services:
+  - mon
+  min: 0
+  flags:
+  - runtime
+- name: paxos_kill_at
+  type: int
+  level: dev
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+- name: mon_auth_validate_all_caps
+  type: bool
+  level: advanced
+  desc: Whether to parse non-monitor capabilities set by the 'ceph auth ...' commands.
+    Disabling this saves CPU on the monitor, but allows invalid capabilities to be
+    set, and only be rejected later, when they are used.
+  default: true
+  services:
+  - mon
+  flags:
+  - runtime
+# force mon to trim mdsmaps to this point (dangerous)
+- name: mon_mds_force_trim_to
+  type: int
+  level: dev
+  desc: force mons to trim mdsmaps/fsmaps up to this epoch
+  fmt_desc: Force monitor to trim mdsmaps up to but not including this FSMap
+    epoch. A value of 0 (the default) disables this config. This command is
+    potentially dangerous, use with care.
+  default: 0
+  services:
+  - mon
+  with_legacy: true
+- name: mds_beacon_mon_down_grace
+  type: secs
+  level: advanced
+  desc: tolerance in seconds for missed MDS beacons to monitors
+  fmt_desc: The interval without beacons before Ceph declares an MDS laggy
+    when a monitor is down.
+  default: 1_min
+# skip safety assertions on FSMap (in case of bugs where we want to continue anyway)
+- name: mon_mds_skip_sanity
+  type: bool
+  level: advanced
+  desc: skip sanity checks on fsmap/mdsmap
+  fmt_desc: Skip safety assertions on FSMap (in case of bugs where we want to
+    continue anyway). Monitor terminates if the FSMap sanity check
+    fails, but we can disable it by enabling this option.
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: mon_mds_blocklist_interval
+  type: float
+  level: dev
+  desc: Duration in seconds that blocklist entries for MDS daemons remain in the OSD
+    map
+  fmt_desc: The blocklist duration for failed MDSs in the OSD map. Note,
+    this controls how long failed MDS daemons will stay in the
+    OSDMap blocklist. It has no effect on how long something is
+    blocklisted when the administrator blocklists it manually. For
+    example, ``ceph osd blocklist add`` will still use the default
+    blocklist time.
+  default: 1_day
+  services:
+  - mon
+  min: 1_hr
+  flags:
+  - runtime
+- name: mon_mgr_blocklist_interval
+  type: float
+  level: dev
+  desc: Duration in seconds that blocklist entries for mgr daemons remain in the OSD
+    map
+  default: 1_day
+  services:
+  - mon
+  min: 1_hr
+  flags:
+  - runtime
+- name: mon_osd_laggy_halflife
+  type: int
+  level: advanced
+  desc: halflife of OSD 'lagginess' factor
+  fmt_desc: The number of seconds laggy estimates will decay.
+  default: 1_hr
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_laggy_weight
+  type: float
+  level: advanced
+  desc: how heavily to weight OSD marking itself back up in overall laggy_probability
+  long_desc: 1.0 means that an OSD marking itself back up (because it was marked down
+    but not actually dead) means a 100% laggy_probability; 0.0 effectively disables
+    tracking of laggy_probability.
+  fmt_desc: The weight for new samples in laggy estimation decay.
+  default: 0.3
+  services:
+  - mon
+  min: 0
+  max: 1
+  with_legacy: true
+- name: mon_osd_laggy_max_interval
+  type: int
+  level: advanced
+  desc: cap value for period for OSD to be marked for laggy_interval calculation
+  fmt_desc: Maximum value of ``laggy_interval`` in laggy estimations (in seconds).
+              Monitor uses an adaptive approach to evaluate the ``laggy_interval`` of
+              a certain OSD. This value will be used to calculate the grace time for
+              that OSD.
+  default: 5_min
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_adjust_heartbeat_grace
+  type: bool
+  level: advanced
+  desc: increase OSD heartbeat grace if peers appear to be laggy
+  long_desc: If an OSD is marked down but then marks itself back up, it implies it
+    wasn't actually down but was unable to respond to heartbeats.  If this option
+    is true, we can use the laggy_probability and laggy_interval values calculated
+    to model this situation to increase the heartbeat grace period for this OSD so
+    that it isn't marked down again.  laggy_probability is an estimated probability
+    that the given OSD is down because it is laggy (not actually down), and laggy_interval
+    is an estimate on how long it stays down when it is laggy.
+  fmt_desc: If set to ``true``, Ceph will scale based on laggy estimations.
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_osd_laggy_halflife
+  - mon_osd_laggy_weight
+  - mon_osd_laggy_max_interval
+  with_legacy: true
+- name: mon_osd_adjust_down_out_interval
+  type: bool
+  level: advanced
+  desc: increase the mon_osd_down_out_interval if an OSD appears to be laggy
+  fmt_desc: If set to ``true``, Ceph will scale based on laggy estimations.
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_osd_adjust_heartbeat_grace
+  with_legacy: true
+- name: mon_osd_auto_mark_in
+  type: bool
+  level: advanced
+  desc: mark any OSD that comes up 'in'
+  fmt_desc: Ceph will mark any booting Ceph OSD Daemons as ``in``
+              the Ceph Storage Cluster.
+  default: false
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_auto_mark_auto_out_in
+  type: bool
+  level: advanced
+  desc: mark any OSD that comes up that was automatically marked 'out' back 'in'
+  fmt_desc: Ceph will mark booting Ceph OSD Daemons auto marked ``out``
+              of the Ceph Storage Cluster as ``in`` the cluster.
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_osd_down_out_interval
+  with_legacy: true
+- name: mon_osd_auto_mark_new_in
+  type: bool
+  level: advanced
+  desc: mark any new OSD that comes up 'in'
+  fmt_desc: Ceph will mark booting new Ceph OSD Daemons as ``in`` the
+              Ceph Storage Cluster.
+  default: true
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_destroyed_out_interval
+  type: int
+  level: advanced
+  desc: mark any OSD 'out' that has been 'destroy'ed for this long (seconds)
+  default: 10_min
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_down_out_interval
+  type: int
+  level: advanced
+  desc: mark any OSD 'out' that has been 'down' for this long (seconds)
+  fmt_desc: The number of seconds Ceph waits before marking a Ceph OSD Daemon
+              ``down`` and ``out`` if it doesn't respond.
+  default: 10_min
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_down_out_subtree_limit
+  type: str
+  level: advanced
+  desc: do not automatically mark OSDs 'out' if an entire subtree of this size is
+    down
+  fmt_desc: The smallest :term:`CRUSH` unit type that Ceph will **not**
+              automatically mark out. For instance, if set to ``host`` and if
+              all OSDs of a host are down, Ceph will not automatically mark out
+              these OSDs.
+  default: rack
+  services:
+  - mon
+  see_also:
+  - mon_osd_down_out_interval
+  flags:
+  - runtime
+- name: mon_osd_min_up_ratio
+  type: float
+  level: advanced
+  desc: do not automatically mark OSDs 'out' if fewer than this many OSDs are 'up'
+  fmt_desc: The minimum ratio of ``up`` Ceph OSD Daemons before Ceph will
+              mark Ceph OSD Daemons ``down``.
+  default: 0.3
+  services:
+  - mon
+  see_also:
+  - mon_osd_down_out_interval
+  with_legacy: true
+- name: mon_osd_min_in_ratio
+  type: float
+  level: advanced
+  desc: do not automatically mark OSDs 'out' if fewer than this many OSDs are 'in'
+  fmt_desc: The minimum ratio of ``in`` Ceph OSD Daemons before Ceph will
+              mark Ceph OSD Daemons ``out``.
+  default: 0.75
+  services:
+  - mon
+  see_also:
+  - mon_osd_down_out_interval
+  with_legacy: true
+- name: mon_osd_warn_op_age
+  type: float
+  level: advanced
+  desc: issue REQUEST_SLOW health warning if OSD ops are slower than this age (seconds)
+  default: 32
+  services:
+  - mgr
+  with_legacy: true
+- name: mon_osd_warn_num_repaired
+  type: uint
+  level: advanced
+  desc: issue OSD_TOO_MANY_REPAIRS health warning if an OSD has more than this many
+    read repairs
+  default: 10
+  services:
+  - mon
+- name: mon_osd_prime_pg_temp
+  type: bool
+  level: dev
+  desc: minimize peering work by priming pg_temp values after a map change
+  fmt_desc: Enables or disables priming the PGMap with the previous OSDs when an ``out``
+    OSD comes back into the cluster. With the ``true`` setting, clients
+    will continue to use the previous OSDs until the newly ``in`` OSDs for
+    a PG have peered.
+  default: true
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_prime_pg_temp_max_time
+  type: float
+  level: dev
+  desc: maximum time to spend precalculating PG mappings on map change (seconds)
+  fmt_desc: How much time in seconds the monitor should spend trying to prime the
+    PGMap when an out OSD comes back into the cluster.
+  default: 0.5
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_prime_pg_temp_max_estimate
+  type: float
+  level: advanced
+  desc: calculate all PG mappings if estimated fraction of PGs that change is above
+    this amount
+  fmt_desc: Maximum estimate of time spent on each PG before we prime all PGs
+    in parallel.
+  default: 0.25
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_blocklist_default_expire
+  type: float
+  level: advanced
+  desc: Duration in seconds that blocklist entries for clients remain in the OSD map
+  default: 1_hr
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_crush_smoke_test
+  type: bool
+  level: advanced
+  desc: perform a smoke test on any new CRUSH map before accepting changes
+  default: true
+  services:
+  - mon
+  with_legacy: true
+- name: mon_smart_report_timeout
+  type: uint
+  level: advanced
+  desc: Timeout (in seconds) for smartctl to run, default is set to 5
+  default: 5
+  services:
+  - mon
+- name: mon_warn_on_older_version
+  type: bool
+  level: advanced
+  desc: issue DAEMON_OLD_VERSION health warning if daemons are not all running the
+    same version
+  default: true
+  services:
+  - mon
+- name: mon_warn_older_version_delay
+  type: secs
+  level: advanced
+  desc: issue DAEMON_OLD_VERSION health warning after this amount of time has elapsed
+  default: 7_day
+  services:
+  - mon
+- name: mon_data
+  type: str
+  level: advanced
+  desc: path to mon database
+  fmt_desc: The monitor's data location.
+  default: /var/lib/ceph/mon/$cluster-$id
+  services:
+  - mon
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: mon_rocksdb_options
+  type: str
+  level: advanced
+  default: write_buffer_size=33554432,compression=kNoCompression,level_compaction_dynamic_level_bytes=true
+  with_legacy: true
+- name: mon_enable_op_tracker
+  type: bool
+  level: advanced
+  desc: enable/disable MON op tracking
+  default: true
+  services:
+  - mon
+# compact leveldb on ceph-mon start
+- name: mon_compact_on_start
+  type: bool
+  level: advanced
+  default: false
+  services:
+  - mon
+  fmt_desc: Compact the database used as Ceph Monitor store on
+    ``ceph-mon`` start. A manual compaction helps to shrink the
+    monitor database and improve the performance of it if the regular
+    compaction fails to work.
+  with_legacy: true
+# trigger leveldb compaction on bootstrap
+- name: mon_compact_on_bootstrap
+  type: bool
+  level: advanced
+  default: false
+  services:
+  - mon
+  fmt_desc: Compact the database used as Ceph Monitor store
+    on bootstrap. Monitors probe each other to establish
+    a quorum after bootstrap. If a monitor times out before joining the
+    quorum, it will start over and bootstrap again.
+  with_legacy: true
+# compact (a prefix) when we trim old states
+- name: mon_compact_on_trim
+  type: bool
+  level: advanced
+  default: true
+  services:
+  - mon
+  fmt_desc: Compact a certain prefix (including paxos) when we trim its old states.
+  with_legacy: true
+- name: mon_op_complaint_time
+  type: secs
+  level: advanced
+  desc: time after which to consider a monitor operation blocked after no updates
+  default: 30
+  services:
+  - mon
+- name: mon_op_log_threshold
+  type: int
+  level: advanced
+  desc: max number of slow ops to display
+  default: 5
+  services:
+  - mon
+- name: mon_op_history_size
+  type: uint
+  level: advanced
+  desc: max number of completed ops to track
+  default: 20
+  services:
+  - mon
+- name: mon_op_history_duration
+  type: secs
+  level: advanced
+  desc: expiration time in seconds of historical MON OPS
+  default: 10_min
+  services:
+  - mon
+- name: mon_op_history_slow_op_size
+  type: uint
+  level: advanced
+  desc: max number of slow historical MON OPS to keep
+  default: 20
+  services:
+  - mon
+- name: mon_op_history_slow_op_threshold
+  type: secs
+  level: advanced
+  desc: duration of an op to be considered as a historical slow op
+  default: 10
+  services:
+  - mon
+- name: mon_osdmap_full_prune_enabled
+  type: bool
+  level: advanced
+  desc: enables pruning full osdmap versions when we go over a given number of maps
+  default: true
+  services:
+  - mon
+  see_also:
+  - mon_osdmap_full_prune_min
+  - mon_osdmap_full_prune_interval
+  - mon_osdmap_full_prune_txsize
+- name: mon_osdmap_full_prune_min
+  type: uint
+  level: advanced
+  desc: minimum number of versions in the store to trigger full map pruning
+  default: 10000
+  services:
+  - mon
+  see_also:
+  - mon_osdmap_full_prune_enabled
+  - mon_osdmap_full_prune_interval
+  - mon_osdmap_full_prune_txsize
+- name: mon_osdmap_full_prune_interval
+  type: uint
+  level: advanced
+  desc: interval between maps that will not be pruned; maps in the middle will be
+    pruned.
+  default: 10
+  services:
+  - mon
+  see_also:
+  - mon_osdmap_full_prune_enabled
+  - mon_osdmap_full_prune_min
+  - mon_osdmap_full_prune_txsize
+- name: mon_osdmap_full_prune_txsize
+  type: uint
+  level: advanced
+  desc: number of maps we will prune per iteration
+  default: 100
+  services:
+  - mon
+  see_also:
+  - mon_osdmap_full_prune_enabled
+  - mon_osdmap_full_prune_min
+  - mon_osdmap_full_prune_interval
+- name: mon_osd_cache_size
+  type: int
+  level: advanced
+  desc: maximum number of OSDMaps to cache in memory
+  fmt_desc: The size of osdmaps cache, not to rely on underlying store's cache
+  default: 500
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_cache_size_min
+  type: size
+  level: advanced
+  desc: The minimum amount of bytes to be kept mapped in memory for osd monitor caches.
+  fmt_desc: The minimum amount of bytes to be kept mapped in memory for osd
+     monitor caches.
+  default: 128_M
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_mapping_pgs_per_chunk
+  type: int
+  level: dev
+  desc: granularity of PG placement calculation background work
+  fmt_desc: We calculate the mapping from placement group to OSDs in chunks.
+    This option specifies the number of placement groups per chunk.
+  default: 4096
+  services:
+  - mon
+  with_legacy: true
+- name: mon_clean_pg_upmaps_per_chunk
+  type: uint
+  level: dev
+  desc: granularity of PG upmap validation background work
+  default: 256
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_max_creating_pgs
+  type: int
+  level: advanced
+  desc: maximum number of PGs the mon will create at once
+  default: 1024
+  services:
+  - mon
+  with_legacy: true
+- name: mon_osd_max_initial_pgs
+  type: int
+  level: advanced
+  desc: maximum number of PGs a pool will be created with
+  long_desc: If the user specifies more PGs than this, the cluster will subsequently
+    split PGs after the pool is created in order to reach the target.
+  default: 1024
+  services:
+  - mon
+- name: mon_memory_target
+  type: size
+  level: basic
+  desc: The amount of bytes pertaining to osd monitor caches and kv cache to be kept
+    mapped in memory with cache auto-tuning enabled
+  fmt_desc: The amount of bytes pertaining to OSD monitor caches and KV cache
+    to be kept mapped in memory with cache auto-tuning enabled.
+  default: 2_G
+  services:
+  - mon
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_memory_autotune
+  type: bool
+  level: basic
+  desc: Autotune the cache memory being used for osd monitors and kv database
+  fmt_desc: Autotune the cache memory used for OSD monitors and KV
+    database.
+  default: true
+  services:
+  - mon
+  flags:
+  - runtime
+  with_legacy: true
+- name: mon_cpu_threads
+  type: int
+  level: advanced
+  desc: worker threads for CPU intensive background work
+  fmt_desc: Number of threads for performing CPU intensive work on monitor.
+  default: 4
+  services:
+  - mon
+  with_legacy: true
+- name: mon_tick_interval
+  type: int
+  level: advanced
+  desc: interval for internal mon background checks
+  fmt_desc: A monitor's tick interval in seconds.
+  default: 5
+  services:
+  - mon
+  with_legacy: true
+- name: mon_session_timeout
+  type: int
+  level: advanced
+  desc: close inactive mon client connections after this many seconds
+  fmt_desc: Monitor will terminate inactive sessions that stay idle over this
+    time limit.
+  default: 5_min
+  services:
+  - mon
+  with_legacy: true
+- name: mon_subscribe_interval
+  type: float
+  level: dev
+  desc: subscribe interval for pre-jewel clients
+  fmt_desc: The refresh interval (in seconds) for subscriptions. The
+    subscription mechanism enables obtaining cluster maps
+    and log information.
+  default: 1_day
+  services:
+  - mon
+  with_legacy: true
+- name: mon_use_min_delay_socket
+  type: bool
+  level: advanced
+  default: false
+  desc: priority packets between mons
+  with_legacy: true
+  see_also:
+  - osd_heartbeat_use_min_delay_socket
diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in
new file mode 100644
index 000000000..7291ce11d
--- /dev/null
+++ b/src/common/options/osd.yaml.in
@@ -0,0 +1,1415 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: osd_numa_prefer_iface
+  type: bool
+  level: advanced
+  desc: prefer IP on network interface on same numa node as storage
+  default: true
+  see_also:
+  - osd_numa_auto_affinity
+  flags:
+  - startup
+- name: osd_numa_auto_affinity
+  type: bool
+  level: advanced
+  desc: automatically set affinity to numa node when storage and network match
+  default: true
+  flags:
+  - startup
+- name: osd_numa_node
+  type: int
+  level: advanced
+  desc: set affinity to a numa node (-1 for none)
+  default: -1
+  see_also:
+  - osd_numa_auto_affinity
+  flags:
+  - startup
+- name: set_keepcaps
+  type: bool
+  level: advanced
+  desc: set the keepcaps flag before changing UID, preserving the permitted capability set
+  long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are erased. If
+    a component that is capability aware needs a specific capability, the keepcaps flag maintains
+    the permitted capability set, allowing the capabilities in the effective set to be activated as needed.
+  default: false
+  flags:
+  - startup
+- name: osd_smart_report_timeout
+  type: uint
+  level: advanced
+  desc: Timeout (in seconds) for smartctl to run, default is set to 5
+  default: 5
+# verify backend can support configured max object name length
+- name: osd_check_max_object_name_len_on_startup
+  type: bool
+  level: dev
+  default: true
+  with_legacy: true
+- name: osd_max_backfills
+  type: uint
+  level: advanced
+  desc: Maximum number of concurrent local and remote backfills or recoveries per
+    OSD
+  long_desc: There can be osd_max_backfills local reservations AND the same remote
+    reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary
+    in recovery and 1 shard of another recovering PG.
+  fmt_desc: The maximum number of backfills allowed to or from a single OSD.
+    Note that this is applied separately for read and write operations.
+  default: 1
+  flags:
+  - runtime
+  with_legacy: true
+# Minimum recovery priority (255 = max, smaller = lower)
+- name: osd_min_recovery_priority
+  type: int
+  level: advanced
+  desc: Minimum priority below which recovery is not performed
+  long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
+    work (e.g., rebalancing) below this threshold and focus solely on higher priority
+    work (e.g., replicating degraded objects).
+  default: 0
+  with_legacy: true
+- name: osd_backfill_retry_interval
+  type: float
+  level: advanced
+  desc: how frequently to retry backfill reservations after being denied (e.g., due
+    to a full OSD)
+  fmt_desc: The number of seconds to wait before retrying backfill requests.
+  default: 30
+  with_legacy: true
+- name: osd_recovery_retry_interval
+  type: float
+  level: advanced
+  desc: how frequently to retry recovery reservations after being denied (e.g., due
+    to a full OSD)
+  default: 30
+  with_legacy: true
+- name: osd_recovery_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op. This setting
+    overrides _ssd, _hdd, and _hybrid if non-zero.
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
+    Increasing this value will slow down recovery operation while
+    client operations will be less impacted.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op for HDDs
+  fmt_desc: Time in seconds to sleep before next recovery or backfill op
+    for HDDs.
+  default: 0.1
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op for SSDs
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
+    for SSDs.
+  default: 0
+  see_also:
+  - osd_recovery_sleep
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op when data is
+    on HDD and journal is on SSD
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
+    when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
+  default: 0.025
+  see_also:
+  - osd_recovery_sleep
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim. This setting overrides _ssd,
+    _hdd, and _hybrid if non-zero.
+  fmt_desc: Time in seconds to sleep before next snap trim op.
+    Increasing this value will slow down snap trimming.
+    This option overrides backend specific variants.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_snap_trim_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim for HDDs
+  default: 5
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim for SSDs
+  fmt_desc: Time in seconds to sleep before next snap trim op
+    for SSD OSDs (including NVMe).
+  default: 0
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
+    is on SSD
+  fmt_desc: Time in seconds to sleep before next snap trim op
+    when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
+  default: 2
+  flags:
+  - runtime
+- name: osd_scrub_invalid_stats
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: osd_max_scrubs
+  type: int
+  level: advanced
+  desc: Maximum concurrent scrubs on a single OSD
+  fmt_desc: The maximum number of simultaneous scrub operations for
+    a Ceph OSD Daemon.
+  default: 1
+  with_legacy: true
+- name: osd_scrub_during_recovery
+  type: bool
+  level: advanced
+  desc: Allow scrubbing when PGs on the OSD are undergoing recovery
+  fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
+    scheduling new scrub (and deep-scrub) while there is active recovery.
+    Already running scrubs will be continued. This might be useful to reduce
+    load on busy clusters.
+  default: false
+  with_legacy: true
+- name: osd_repair_during_recovery
+  type: bool
+  level: advanced
+  desc: Allow requested repairing when PGs on the OSD are undergoing recovery
+  default: false
+  with_legacy: true
+- name: osd_scrub_begin_hour
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to this hour of the day or later
+  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
+  fmt_desc: This restricts scrubbing to this hour of the day or later.
+    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
+    to allow scrubbing the entire day.  Along with ``osd_scrub_end_hour``, they define a time
+    window, in which the scrubs can happen.
+    But a scrub will be performed
+    no matter whether the time window allows or not, as long as the placement
+    group's scrub interval exceeds ``osd_scrub_max_interval``.
+  default: 0
+  see_also:
+  - osd_scrub_end_hour
+  min: 0
+  max: 23
+  with_legacy: true
+- name: osd_scrub_end_hour
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to hours of the day earlier than this
+  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
+  fmt_desc: This restricts scrubbing to the hour earlier than this.
+    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing
+    for the entire day.  Along with ``osd_scrub_begin_hour``, they define a time
+    window, in which the scrubs can happen. But a scrub will be performed
+    no matter whether the time window allows or not, as long as the placement
+    group's scrub interval exceeds ``osd_scrub_max_interval``.
+  default: 0
+  see_also:
+  - osd_scrub_begin_hour
+  min: 0
+  max: 23
+  with_legacy: true
+- name: osd_scrub_begin_week_day
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to this day of the week or later
+  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
+    for the entire week.
+  fmt_desc: This restricts scrubbing to this day of the week or later.
+    0  = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0``
+    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
+    Along with ``osd_scrub_end_week_day``, they define a time window in which
+    scrubs can happen. But a scrub will be performed
+    no matter whether the time window allows or not, when the PG's
+    scrub interval exceeds ``osd_scrub_max_interval``.
+  default: 0
+  see_also:
+  - osd_scrub_end_week_day
+  min: 0
+  max: 6
+  with_legacy: true
+- name: osd_scrub_end_week_day
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to days of the week earlier than this
+  long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0
+    for the entire week.
+  fmt_desc: This restricts scrubbing to days of the week earlier than this.
+    0 = Sunday, 1 = Monday, etc.  Use ``osd_scrub_begin_week_day = 0``
+    and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week.
+    Along with ``osd_scrub_begin_week_day``, they define a time
+    window, in which the scrubs can happen. But a scrub will be performed
+    no matter whether the time window allows or not, as long as the placement
+    group's scrub interval exceeds ``osd_scrub_max_interval``.
+  default: 0
+  see_also:
+  - osd_scrub_begin_week_day
+  min: 0
+  max: 6
+  with_legacy: true
+- name: osd_scrub_load_threshold
+  type: float
+  level: advanced
+  desc: Allow scrubbing when system load divided by number of CPUs is below this value
+  fmt_desc: The normalized maximum load. Ceph will not scrub when the system load
+    (as defined by ``getloadavg() / number of online CPUs``) is higher than this number.
+    Default is ``0.5``.
+  default: 0.5
+  with_legacy: true
+# if load is low
+- name: osd_scrub_min_interval
+  type: float
+  level: advanced
+  desc: Scrub each PG no more often than this interval
+  fmt_desc: The minimal interval in seconds for scrubbing the Ceph OSD Daemon
+    when the Ceph Storage Cluster load is low.
+  default: 1_day
+  see_also:
+  - osd_scrub_max_interval
+  with_legacy: true
+# regardless of load
+- name: osd_scrub_max_interval
+  type: float
+  level: advanced
+  desc: Scrub each PG no less often than this interval
+  fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
+    irrespective of cluster load.
+  default: 7_day
+  see_also:
+  - osd_scrub_min_interval
+  with_legacy: true
+# randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
+- name: osd_scrub_interval_randomize_ratio
+  type: float
+  level: advanced
+  desc: Ratio of scrub interval to randomly vary
+  long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
+    so that they are soon uniformly distributed over the week
+  fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
+    the next scrub job for a PG. The delay is a random
+    value less than ``osd_scrub_min_interval`` \*
+    ``osd_scrub_interval_randomize_ratio``. The default setting
+    spreads scrubs throughout the allowed time
+    window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
+  default: 0.5
+  see_also:
+  - osd_scrub_min_interval
+  with_legacy: true
+# the probability to back off the scheduled scrub
+- name: osd_scrub_backoff_ratio
+  type: float
+  level: dev
+  desc: Backoff ratio for scheduling scrubs
+  long_desc: This is the percentage of ticks that do NOT schedule scrubs, 66% means
+    that 1 out of 3 ticks will schedule scrubs
+  default: 0.66
+  with_legacy: true
+- name: osd_scrub_chunk_min
+  type: int
+  level: advanced
+  desc: Minimum number of objects to deep-scrub in a single chunk
+  fmt_desc: The minimal number of object store chunks to scrub during a single operation.
+    Ceph blocks writes to a single chunk during scrub.
+  default: 5
+  see_also:
+  - osd_scrub_chunk_max
+  with_legacy: true
+- name: osd_scrub_chunk_max
+  type: int
+  level: advanced
+  desc: Maximum number of objects to deep-scrub in a single chunk
+  fmt_desc: The maximum number of object store chunks to scrub during a single operation.
+  default: 25
+  see_also:
+  - osd_scrub_chunk_min
+  with_legacy: true
+- name: osd_shallow_scrub_chunk_min
+  type: int
+  level: advanced
+  desc: Minimum number of objects to scrub in a single chunk
+  fmt_desc: The minimum number of object store chunks to scrub during a single operation.
+    Not applicable to deep scrubs.
+    Ceph blocks writes to a single chunk during scrub.
+  default: 50
+  see_also:
+  - osd_shallow_scrub_chunk_max
+  - osd_scrub_chunk_min
+  with_legacy: true
+- name: osd_shallow_scrub_chunk_max
+  type: int
+  level: advanced
+  desc: Maximum number of objects to scrub in a single chunk
+  fmt_desc: The maximum number of object store chunks to scrub during a single operation.
+    Not applicable to deep scrubs.
+  default: 100
+  see_also:
+  - osd_shallow_scrub_chunk_min
+  - osd_scrub_chunk_max
+  with_legacy: true
+# sleep between [deep]scrub ops
+- name: osd_scrub_sleep
+  type: float
+  level: advanced
+  desc: Duration to inject a delay during scrubbing
+  fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
+    down the overall rate of scrubbing so that client operations will be less impacted.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+# more sleep between [deep]scrub ops
+- name: osd_scrub_extended_sleep
+  type: float
+  level: advanced
+  desc: Duration to inject a delay during scrubbing out of scrubbing hours
+  default: 0
+  see_also:
+  - osd_scrub_begin_hour
+  - osd_scrub_end_hour
+  - osd_scrub_begin_week_day
+  - osd_scrub_end_week_day
+  with_legacy: true
+# whether auto-repair inconsistencies upon deep-scrubbing
+- name: osd_scrub_auto_repair
+  type: bool
+  level: advanced
+  desc: Automatically repair damaged objects detected during scrub
+  fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors
+    are found by scrubs or deep-scrubs.  However, if more than
+    ``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed.
+  default: false
+  with_legacy: true
+# only auto-repair when number of errors is below this threshold
+- name: osd_scrub_auto_repair_num_errors
+  type: uint
+  level: advanced
+  desc: Maximum number of detected errors to automatically repair
+  fmt_desc: Auto repair will not occur if more than this many errors are found.
+  default: 5
+  see_also:
+  - osd_scrub_auto_repair
+  with_legacy: true
+- name: osd_scrub_max_preemptions
+  type: uint
+  level: advanced
+  desc: Set the maximum number of times we will preempt a deep scrub due to a client
+    operation before blocking client IO to complete the scrub
+  default: 5
+  min: 0
+  max: 30
+- name: osd_deep_scrub_interval
+  type: float
+  level: advanced
+  desc: Deep scrub each PG (i.e., verify data checksums) at least this often
+  fmt_desc: The interval for "deep" scrubbing (fully reading all data). The
+    ``osd_scrub_load_threshold`` does not affect this setting.
+  default: 7_day
+  with_legacy: true
+- name: osd_deep_scrub_randomize_ratio
+  type: float
+  level: advanced
+  desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs
+    are deep)
+  long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they
+    are uniformly distributed over the week
+  default: 0.15
+  with_legacy: true
+- name: osd_deep_scrub_stride
+  type: size
+  level: advanced
+  desc: Number of bytes to read from an object at a time during deep scrub
+  fmt_desc: Read size when doing a deep scrub.
+  default: 512_K
+  with_legacy: true
+- name: osd_deep_scrub_keys
+  type: int
+  level: advanced
+  desc: Number of keys to read from an object at a time during deep scrub
+  default: 1024
+  with_legacy: true
+# objects must be this old (seconds) before we update the whole-object digest on scrub
+- name: osd_deep_scrub_update_digest_min_age
+  type: int
+  level: advanced
+  desc: Update overall object digest only if object was last modified longer ago than
+    this
+  default: 2_hr
+  with_legacy: true
+- name: osd_deep_scrub_large_omap_object_key_threshold
+  type: uint
+  level: advanced
+  desc: Warn when we encounter an object with more omap keys than this
+  default: 200000
+  services:
+  - osd
+  - mds
+  see_also:
+  - osd_deep_scrub_large_omap_object_value_sum_threshold
+  with_legacy: true
+- name: osd_deep_scrub_large_omap_object_value_sum_threshold
+  type: size
+  level: advanced
+  desc: Warn when we encounter an object with more omap key bytes than this
+  default: 1_G
+  services:
+  - osd
+  see_also:
+  - osd_deep_scrub_large_omap_object_key_threshold
+  with_legacy: true
+# when scrubbing blocks on a locked object
+- name: osd_blocked_scrub_grace_period
+  type: int
+  level: advanced
+  desc: Time (seconds) before issuing a cluster-log warning
+  long_desc: Waiting too long for an object in the scrubbed chunk to be unlocked.
+  default: 120
+  with_legacy: true
+# timely updates to the 'pg dump' output, esp. re scrub scheduling
+- name: osd_stats_update_period_scrubbing
+  type: int
+  level: advanced
+  desc: Stats update period (seconds) when scrubbing
+  long_desc: A PG actively scrubbing (or blocked while scrubbing) publishes its
+    stats (inc. scrub/block duration) every this many seconds.
+  default: 15
+  with_legacy: false
+- name: osd_stats_update_period_not_scrubbing
+  type: int
+  level: advanced
+  desc: Stats update period (seconds) when not scrubbing
+  long_desc: A PG we are a primary of, publishes its
+    stats (inc. scrub/block duration) every this many seconds.
+  default: 120
+  with_legacy: false
+# when replicas are slow to respond to scrub resource reservations
+# Note: disable by using a very large value
+- name: osd_scrub_slow_reservation_response
+  type: millisecs
+  level: advanced
+  desc: Duration before issuing a cluster-log warning
+  long_desc: Waiting too long for a replica to respond (after at least half of the
+    replicas have responded).
+  default: 2200
+  min: 500
+  see_also:
+  - osd_scrub_reservation_timeout
+  with_legacy: false
+# when a replica does not respond to scrub resource request
+# Note: disable by using a very large value
+- name: osd_scrub_reservation_timeout
+  type: millisecs
+  level: advanced
+  desc: Duration before aborting the scrub session
+  long_desc: Waiting too long for some replicas to respond to
+    scrub reservation requests.
+  default: 5000
+  min: 2000
+  see_also:
+  - osd_scrub_slow_reservation_response
+  with_legacy: false
+# where rados plugins are stored
+- name: osd_class_dir
+  type: str
+  level: advanced
+  default: @CMAKE_INSTALL_LIBDIR@/rados-classes
+  fmt_desc: The class path for RADOS class plug-ins.
+  with_legacy: true
+- name: osd_open_classes_on_start
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+# list of object classes allowed to be loaded (allow all: *)
+- name: osd_class_load_list
+  type: str
+  level: advanced
+  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
+    user version cas cmpomap queue 2pc_queue fifo
+  with_legacy: true
+# list of object classes with default execute perm (allow all: *)
+- name: osd_class_default_list
+  type: str
+  level: advanced
+  default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex
+    user version cas cmpomap queue 2pc_queue fifo
+  with_legacy: true
+- name: osd_agent_max_ops
+  type: int
+  level: advanced
+  desc: maximum concurrent tiering operations for tiering agent
+  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
+    in the high speed mode.
+  default: 4
+  with_legacy: true
+- name: osd_agent_max_low_ops
+  type: int
+  level: advanced
+  desc: maximum concurrent low-priority tiering operations for tiering agent
+  fmt_desc: The maximum number of simultaneous flushing ops per tiering agent
+    in the low speed mode.
+  default: 2
+  with_legacy: true
+- name: osd_agent_min_evict_effort
+  type: float
+  level: advanced
+  desc: minimum effort to expend evicting clean objects
+  default: 0.1
+  min: 0
+  max: 0.99
+  with_legacy: true
+- name: osd_agent_quantize_effort
+  type: float
+  level: advanced
+  desc: size of quantize unit for eviction effort
+  default: 0.1
+  with_legacy: true
+- name: osd_agent_delay_time
+  type: float
+  level: advanced
+  desc: how long agent should sleep if it has no work to do
+  default: 5
+  with_legacy: true
+# decay atime and hist histograms after how many objects go by
+- name: osd_agent_hist_halflife
+  type: int
+  level: advanced
+  desc: halflife of agent atime and temp histograms
+  default: 1000
+  with_legacy: true
+# decay atime and hist histograms after how many objects go by
+- name: osd_agent_slop
+  type: float
+  level: advanced
+  desc: slop factor to avoid switching tiering flush and eviction mode
+  default: 0.02
+  with_legacy: true
+- name: osd_find_best_info_ignore_history_les
+  type: bool
+  level: dev
+  desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA
+  long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE
+    DIRECTION OF A DEVELOPER.  It makes peering ignore the last_epoch_started value
+    when peering, which can allow the OSD to believe an OSD has an authoritative view
+    of a PG's contents even when it is in fact old and stale, typically leading to
+    data loss (by believing a stale PG is up to date).
+  default: false
+  with_legacy: true
+- name: osd_uuid
+  type: uuid
+  level: advanced
+  desc: uuid label for a new OSD
+  fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon.
+  note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid``
+    applies to the entire cluster.
+  flags:
+  - create
+  with_legacy: true
+- name: osd_data
+  type: str
+  level: advanced
+  desc: path to OSD data
+  fmt_desc: The path to the OSDs data. You must create the directory when
+    deploying Ceph. You should mount a drive for OSD data at this
+    mount point. We do not recommend changing the default.
+  default: /var/lib/ceph/osd/$cluster-$id
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: osd_journal
+  type: str
+  level: advanced
+  desc: path to OSD journal (when FileStore backend is in use)
+  fmt_desc: The path to the OSD's journal. This may be a path to a file or a
+    block device (such as a partition of an SSD). If it is a file,
+    you must create the directory to contain it. We recommend using a
+    separate fast device when the ``osd_data`` drive is an HDD.
+  default: /var/lib/ceph/osd/$cluster-$id/journal
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: osd_journal_size
+  type: size
+  level: advanced
+  desc: size of FileStore journal (in MiB)
+  fmt_desc: The size of the journal in megabytes.
+  default: 5_K
+  flags:
+  - create
+  with_legacy: true
+- name: osd_journal_flush_on_shutdown
+  type: bool
+  level: advanced
+  desc: flush FileStore journal contents during clean OSD shutdown
+  default: true
+  with_legacy: true
+- name: osd_compact_on_start
+  type: bool
+  level: advanced
+  desc: compact OSD's object store's OMAP on start
+  default: false
+# flags for specific control purpose during osd mount() process.
+# e.g., can be 1 to skip over replaying journal
+# or 2 to skip over mounting omap or 3 to skip over both.
+# This might be helpful in case the journal is totally corrupted
+# and we still want to bring the osd daemon back normally, etc.
+- name: osd_os_flags
+  type: uint
+  level: dev
+  desc: flags to skip filestore omap or journal initialization
+  default: 0
+- name: osd_max_write_size
+  type: size
+  level: advanced
+  desc: Maximum size of a RADOS write operation in megabytes
+  long_desc: This setting prevents clients from doing very large writes to RADOS.  If
+    you set this to a value below what clients expect, they will receive an error
+    when attempting to write to the cluster.
+  fmt_desc: The maximum size of a write in megabytes.
+  default: 90
+  min: 4
+  with_legacy: true
+- name: osd_max_pgls
+  type: uint
+  level: advanced
+  desc: maximum number of results when listing objects in a pool
+  fmt_desc: The maximum number of placement groups to list. A client
+    requesting a large number can tie up the Ceph OSD Daemon.
+  default: 1_K
+  with_legacy: true
+- name: osd_client_message_size_cap
+  type: size
+  level: advanced
+  desc: maximum memory to devote to in-flight client requests
+  long_desc: If this value is exceeded, the OSD will not read any new client data
+    off of the network until memory is freed.
+  fmt_desc: The largest client data message allowed in memory.
+  default: 500_M
+  with_legacy: true
+- name: osd_client_message_cap
+  type: uint
+  level: advanced
+  desc: maximum number of in-flight client requests
+  default: 256
+  with_legacy: true
+- name: osd_crush_update_on_start
+  type: bool
+  level: advanced
+  desc: update OSD CRUSH location on startup
+  default: true
+  with_legacy: true
+- name: osd_class_update_on_start
+  type: bool
+  level: advanced
+  desc: set OSD device class on startup
+  default: true
+  with_legacy: true
+- name: osd_crush_initial_weight
+  type: float
+  level: advanced
+  desc: if >= 0, initial CRUSH weight for newly created OSDs
+  long_desc: If this value is negative, the size of the OSD in TiB is used.
+  fmt_desc: The initial CRUSH weight for newly added OSDs. The default
+    value of this option is ``the size of a newly added OSD in TB``. By default,
+    the initial CRUSH weight for a newly added OSD is set to its device size in
+    TB. See `Weighting Bucket Items`_ for details.
+  default: -1
+  with_legacy: true
+# Allows the "peered" state for recovery and backfill below min_size
+- name: osd_allow_recovery_below_min_size
+  type: bool
+  level: dev
+  desc: allow replicated pools to recover with < min_size active members
+  default: true
+  services:
+  - osd
+  with_legacy: true
+# cap on # of inc maps we send to peers, clients
+- name: osd_map_share_max_epochs
+  type: int
+  level: advanced
+  default: 40
+  with_legacy: true
+- name: osd_map_cache_size
+  type: int
+  level: advanced
+  default: 50
+  fmt_desc: The number of OSD maps to keep cached.
+  with_legacy: true
+- name: osd_pg_epoch_max_lag_factor
+  type: float
+  level: advanced
+  desc: Max multiple of the map cache that PGs can lag before we throttle map ingest
+  default: 2
+  see_also:
+  - osd_map_cache_size
+- name: osd_inject_bad_map_crc_probability
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: osd_inject_failure_on_pg_removal
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# shutdown the OSD if its status flips more than max_markdown_count times in the most recent max_markdown_period seconds
+- name: osd_max_markdown_period
+  type: int
+  level: advanced
+  default: 10_min
+  with_legacy: true
+- name: osd_max_markdown_count
+  type: int
+  level: advanced
+  default: 5
+  with_legacy: true
+- name: osd_op_thread_timeout
+  type: int
+  level: advanced
+  default: 15
+  fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
+  with_legacy: true
+- name: osd_op_thread_suicide_timeout
+  type: int
+  level: advanced
+  default: 150
+  with_legacy: true
+- name: osd_op_pq_max_tokens_per_priority
+  type: uint
+  level: advanced
+  default: 4_M
+  with_legacy: true
+- name: osd_op_pq_min_cost
+  type: size
+  level: advanced
+  default: 64_K
+  with_legacy: true
+# preserve clone_overlap during recovery/migration
+- name: osd_recover_clone_overlap
+  type: bool
+  level: advanced
+  default: true
+  fmt_desc: Preserves clone overlap during recovery. Should always be set
+    to ``true``.
+  with_legacy: true
+- name: osd_num_cache_shards
+  type: size
+  level: advanced
+  desc: The number of cache shards to use in the object store.
+  default: 32
+  flags:
+  - startup
+- name: osd_aggregated_slow_ops_logging
+  type: bool
+  level: advanced
+  desc: Allow OSD daemon to send aggregated slow ops to the cluster log
+  fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in
+    an aggregated format to the cluster log; otherwise it sends every slow op to the
+    cluster log.
+  default: true
+  with_legacy: true
+- name: osd_op_num_threads_per_shard
+  type: int
+  level: advanced
+  default: 0
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_threads_per_shard_hdd
+  type: int
+  level: advanced
+  default: 1
+  see_also:
+  - osd_op_num_threads_per_shard
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_threads_per_shard_ssd
+  type: int
+  level: advanced
+  default: 2
+  see_also:
+  - osd_op_num_threads_per_shard
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards
+  type: int
+  level: advanced
+  fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
+    PGs on the OSD are distributed evenly in the shard. This setting overrides _ssd and _hdd if
+    non-zero.
+  default: 0
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards_hdd
+  type: int
+  level: advanced
+  fmt_desc: the number of shards allocated for a given OSD (for rotational media).
+  default: 5
+  see_also:
+  - osd_op_num_shards
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards_ssd
+  type: int
+  level: advanced
+  fmt_desc: the number of shards allocated for a given OSD (for solid state media).
+  default: 8
+  see_also:
+  - osd_op_num_shards
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_skip_data_digest
+  type: bool
+  level: dev
+  desc: Do not store full-object checksums if the backend (bluestore) does its own
+    checksums.  Only usable with all BlueStore OSDs.
+  default: false
+# PrioritizedQueue (prio), Weighted Priority Queue (wpq ; default),
+# mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
+# and "mclock_client" are based on the mClock/dmClock algorithm
+# (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
+# class the operation belongs to. "mclock_client" does the same but
+# also works to enforce fairness between clients. "debug_random"
+# chooses among all four with equal probability.
+- name: osd_op_queue
+  type: str
+  level: advanced
+  desc: which operation priority queue algorithm to use
+  long_desc: which operation priority queue algorithm to use
+  fmt_desc: This sets the type of queue to be used for prioritizing ops
+    within each OSD. Both queues feature a strict sub-queue which is
+    dequeued before the normal queue. The normal queue is different
+    between implementations. The WeightedPriorityQueue (``wpq``)
+    dequeues operations in relation to their priorities to prevent
+    starvation of any queue. WPQ should help in cases where a few OSDs
+    are more overloaded than others. The mClockQueue
+    (``mclock_scheduler``) prioritizes operations based on which class
+    they belong to (recovery, scrub, snaptrim, client op, osd subop).
+    See `QoS Based on mClock`_. Requires a restart.
+  default: mclock_scheduler
+  see_also:
+  - osd_op_queue_cut_off
+  enum_values:
+  - wpq
+  - mclock_scheduler
+  - debug_random
+  with_legacy: true
+# Min priority to go to strict queue. (low, high)
+- name: osd_op_queue_cut_off
+  type: str
+  level: advanced
+  desc: the threshold between high priority ops and low priority ops
+  long_desc: the threshold between high priority ops that use strict priority ordering
+    and low priority ops that use a fairness algorithm that may or may not incorporate
+    priority
+  fmt_desc: This selects which priority ops will be sent to the strict
+    queue versus the normal queue. The ``low`` setting sends all
+    replication ops and higher to the strict queue, while the ``high``
+    option sends only replication acknowledgment ops and higher to
+    the strict queue. Setting this to ``high`` should help when a few
+    OSDs in the cluster are very busy especially when combined with
+    ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
+    handling replication traffic could starve primary client traffic
+    on these OSDs without these settings. Requires a restart.
+  default: high
+  see_also:
+  - osd_op_queue
+  enum_values:
+  - low
+  - high
+  - debug_random
+  with_legacy: true
+- name: osd_mclock_scheduler_client_res
+  type: float
+  level: advanced
+  desc: IO proportion reserved for each client (default). The default value
+    of 0 specifies the lowest possible reservation. Any value greater than
+    0 and up to 1.0 specifies the minimum IO proportion to reserve for each
+    client in terms of a fraction of the OSD's maximum IOPS capacity.
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO proportion reserved for each client (default).
+  default: 0
+  min: 0
+  max: 1.0
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_client_wgt
+  type: uint
+  level: advanced
+  desc: IO share for each client (default) over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO share for each client (default) over reservation.
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_client_lim
+  type: float
+  level: advanced
+  desc: IO limit for each client (default) over reservation. The default
+    value of 0 specifies no limit enforcement, which means each client can
+    use the maximum possible IOPS capacity of the OSD. Any value greater
+    than 0 and up to 1.0 specifies the upper IO limit over reservation
+    that each client receives in terms of a fraction of the OSD's
+    maximum IOPS capacity.
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO limit for each client (default) over reservation.
+  default: 0
+  min: 0
+  max: 1.0
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_recovery_res
+  type: float
+  level: advanced
+  desc: IO proportion reserved for background recovery (default). The
+    default value of 0 specifies the lowest possible reservation. Any value
+    greater than 0 and up to 1.0 specifies the minimum IO proportion to
+    reserve for background recovery operations in terms of a fraction of
+    the OSD's maximum IOPS capacity.
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO proportion reserved for background recovery (default).
+  default: 0
+  min: 0
+  max: 1.0
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_recovery_wgt
+  type: uint
+  level: advanced
+  desc: IO share for each background recovery over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO share for each background recovery over reservation.
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_recovery_lim
+  type: float
+  level: advanced
+  desc: IO limit for background recovery over reservation. The default
+    value of 0 specifies no limit enforcement, which means background
+    recovery operation can use the maximum possible IOPS capacity of the
+    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
+    limit over reservation that background recovery operation receives in
+    terms of a fraction of the OSD's maximum IOPS capacity.
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO limit for background recovery over reservation.
+  default: 0
+  min: 0
+  max: 1.0
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_best_effort_res
+  type: float
+  level: advanced
+  desc: IO proportion reserved for background best_effort (default). The
+    default value of 0 specifies the lowest possible reservation. Any value
+    greater than 0 and up to 1.0 specifies the minimum IO proportion to
+    reserve for background best_effort operations in terms of a fraction
+    of the OSD's maximum IOPS capacity.
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO proportion reserved for background best_effort (default).
+  default: 0
+  min: 0
+  max: 1.0
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_best_effort_wgt
+  type: uint
+  level: advanced
+  desc: IO share for each background best_effort over reservation
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO share for each background best_effort over reservation.
+  default: 1
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_background_best_effort_lim
+  type: float
+  level: advanced
+  desc: IO limit for background best_effort over reservation. The default
+    value of 0 specifies no limit enforcement, which means background
+    best_effort operation can use the maximum possible IOPS capacity of the
+    OSD. Any value greater than 0 and up to 1.0 specifies the upper IO
+    limit over reservation that background best_effort operation receives
+    in terms of a fraction of the OSD's maximum IOPS capacity.
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO limit for background best_effort over reservation.
+  default: 0
+  min: 0
+  max: 1.0
+  see_also:
+  - osd_op_queue
+- name: osd_mclock_scheduler_anticipation_timeout
+  type: float
+  level: advanced
+  desc: mclock anticipation timeout in seconds
+  long_desc: the amount of time that mclock waits until the unused resource is forfeited
+  default: 0
+- name: osd_mclock_max_sequential_bandwidth_hdd
+  type: size
+  level: basic
+  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
+    rotational media)
+  long_desc: This option specifies the maximum sequential bandwidth to consider
+    for an OSD whose underlying device type is rotational media. This is
+    considered by the mclock scheduler to derive the cost factor to be used in
+    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
+    OSD (for rotational media)
+  default: 150_M
+  flags:
+  - runtime
+- name: osd_mclock_max_sequential_bandwidth_ssd
+  type: size
+  level: basic
+  desc: The maximum sequential bandwidth in bytes/second of the OSD (for
+    solid state media)
+  long_desc: This option specifies the maximum sequential bandwidth to consider
+    for an OSD whose underlying device type is solid state media. This is
+    considered by the mclock scheduler to derive the cost factor to be used in
+    QoS calculations. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the
+    OSD (for solid state media)
+  default: 1200_M
+  flags:
+  - runtime
+- name: osd_mclock_max_capacity_iops_hdd
+  type: float
+  level: basic
+  desc: Max random write IOPS capacity (at 4KiB block size) to consider per OSD
+    (for rotational media)
+  long_desc: This option specifies the max OSD random write IOPS capacity per
+    OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
+    considered for osd_op_queue = mclock_scheduler
+  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
+    OSD (for rotational media)
+  default: 315
+  flags:
+  - runtime
+- name: osd_mclock_max_capacity_iops_ssd
+  type: float
+  level: basic
+  desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD
+    (for solid state media)
+  long_desc: This option specifies the max OSD random write IOPS capacity per
+    OSD. Contributes in QoS calculations when enabling a dmclock profile. Only
+    considered for osd_op_queue = mclock_scheduler
+  fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per
+    OSD (for solid state media)
+  default: 21500
+  flags:
+  - runtime
+- name: osd_mclock_force_run_benchmark_on_init
+  type: bool
+  level: advanced
+  desc: Force run the OSD benchmark on OSD initialization/boot-up
+  long_desc: This option specifies whether the OSD benchmark must be run during
+    the OSD boot-up sequence even if historical data about the OSD iops capacity
+    is available in the MON config store. Enable this to refresh the OSD iops
+    capacity if the underlying device's performance characteristics have changed
+    significantly. Only considered for osd_op_queue = mclock_scheduler.
+  fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up
+  default: false
+  see_also:
+  - osd_mclock_max_capacity_iops_hdd
+  - osd_mclock_max_capacity_iops_ssd
+  flags:
+  - startup
+- name: osd_mclock_skip_benchmark
+  type: bool
+  level: dev
+  desc: Skip the OSD benchmark on OSD initialization/boot-up
+  long_desc: This option specifies whether the OSD benchmark must be skipped during
+    the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler.
+  fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up
+  default: false
+  see_also:
+  - osd_mclock_max_capacity_iops_hdd
+  - osd_mclock_max_capacity_iops_ssd
+  flags:
+  - runtime
+- name: osd_mclock_profile
+  type: str
+  level: advanced
+  desc: Which mclock profile to use
+  long_desc: This option specifies the mclock profile to enable - one among the set
+    of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: |
+    This sets the type of mclock profile to use for providing QoS
+    based on operations belonging to different classes (background
+    recovery, scrub, snaptrim, client op, osd subop). Once a built-in
+    profile is enabled, the lower level mclock resource control
+    parameters [*reservation, weight, limit*] and some Ceph
+    configuration parameters are set transparently. Note that the
+    above does not apply for the *custom* profile.
+  default: balanced
+  see_also:
+  - osd_op_queue
+  enum_values:
+  - balanced
+  - high_recovery_ops
+  - high_client_ops
+  - custom
+  flags:
+  - runtime
+- name: osd_mclock_override_recovery_settings
+  type: bool
+  level: advanced
+  desc: Setting this option enables the override of recovery/backfill limits
+    for the mClock scheduler.
+  long_desc: This option when set enables the override of the max recovery
+    active and the max backfills limits with mClock scheduler active. These
+    options are not modifiable when mClock scheduler is active. Any attempt
+    to modify these values without setting this option will reset the
+    recovery or backfill option back to its default value.
+  fmt_desc: Setting this option will enable the override of the
+    recovery/backfill limits for the mClock scheduler as defined by the
+    ``osd_recovery_max_active_hdd``, ``osd_recovery_max_active_ssd`` and
+    ``osd_max_backfills`` options.
+  default: false
+  see_also:
+  - osd_recovery_max_active_hdd
+  - osd_recovery_max_active_ssd
+  - osd_max_backfills
+  flags:
+  - runtime
+- name: osd_mclock_iops_capacity_threshold_hdd
+  type: float
+  level: basic
+  desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
+    the OSD bench results for an OSD (for rotational media)
+  long_desc: This option specifies the threshold IOPS capacity for an OSD under
+    which the OSD bench results can be considered for QoS calculations. Only
+    considered for osd_op_queue = mclock_scheduler
+  fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
+    ignore OSD bench results for an OSD (for rotational media)
+  default: 500
+  flags:
+  - runtime
+- name: osd_mclock_iops_capacity_threshold_ssd
+  type: float
+  level: basic
+  desc: The threshold IOPS capacity (at 4KiB block size) beyond which to ignore
+    the OSD bench results for an OSD (for solid state media)
+  long_desc: This option specifies the threshold IOPS capacity for an OSD under
+    which the OSD bench results can be considered for QoS calculations. Only
+    considered for osd_op_queue = mclock_scheduler
+  fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to
+    ignore OSD bench results for an OSD (for solid state media)
+  default: 80000
+  flags:
+  - runtime
+# Set to true for testing.  Users should NOT set this.
+# If set to true even after reading enough shards to
+# decode the object, any error will be reported.
+- name: osd_read_ec_check_for_errors
+  type: bool
+  level: advanced
+  default: false
+  with_legacy: true
+- name: osd_recovery_delay_start
+  type: float
+  level: advanced
+  default: 0
+  fmt_desc: After peering completes, Ceph will delay for the specified number
+    of seconds before starting to recover RADOS objects.
+  with_legacy: true
+- name: osd_recovery_max_active
+  type: uint
+  level: advanced
+  desc: Number of simultaneous active recovery operations per OSD (overrides _ssd
+    and _hdd if non-zero)
+  fmt_desc: The number of active recovery requests per OSD at one time. More
+    requests will accelerate recovery, but the requests place an
+    increased load on the cluster.
+  note: This value is only used if it is non-zero. Normally it
+    is ``0``, which means that the ``hdd`` or ``ssd`` values
+    (below) are used, depending on the type of the primary
+    device backing the OSD.
+  default: 0
+  see_also:
+  - osd_recovery_max_active_hdd
+  - osd_recovery_max_active_ssd
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_max_active_hdd
+  type: uint
+  level: advanced
+  desc: Number of simultaneous active recovery operations per OSD (for rotational
+    devices)
+  fmt_desc: The number of active recovery requests per OSD at one time, if the
+    primary device is rotational.
+  default: 3
+  see_also:
+  - osd_recovery_max_active
+  - osd_recovery_max_active_ssd
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_max_active_ssd
+  type: uint
+  level: advanced
+  desc: Number of simultaneous active recovery operations per OSD (for non-rotational
+    solid state devices)
+  fmt_desc: The number of active recovery requests per OSD at one time, if the
+    primary device is non-rotational (i.e., an SSD).
+  default: 10
+  see_also:
+  - osd_recovery_max_active
+  - osd_recovery_max_active_hdd
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_max_single_start
+  type: uint
+  level: advanced
+  default: 1
+  fmt_desc: The maximum number of recovery operations per OSD that will be
+    newly started when an OSD is recovering.
+  with_legacy: true
+# max size of push chunk
+- name: osd_recovery_max_chunk
+  type: size
+  level: advanced
+  default: 8_M
+  fmt_desc: the maximum total size of data chunks a recovery op can carry.
+  with_legacy: true
+# max number of omap entries per chunk; 0 to disable limit
+- name: osd_recovery_max_omap_entries_per_chunk
+  type: uint
+  level: advanced
+  default: 8096
+  with_legacy: true
+# max size of a COPYFROM chunk
+- name: osd_copyfrom_max_chunk
+  type: size
+  level: advanced
+  default: 8_M
+  with_legacy: true
+# push cost per object
+- name: osd_push_per_object_cost
+  type: size
+  level: advanced
+  default: 1000
+  fmt_desc: the overhead for serving a push op
+  with_legacy: true
+# max size of push message
+- name: osd_max_push_cost
+  type: size
+  level: advanced
+  default: 8_M
+  with_legacy: true
+# max objects in single push op
+- name: osd_max_push_objects
+  type: uint
+  level: advanced
+  default: 10
+  with_legacy: true
+# Only use clone_overlap for recovery if there are fewer than
+# osd_recover_clone_overlap_limit entries in the overlap set
+- name: osd_recover_clone_overlap_limit
+  type: uint
+  level: advanced
+  default: 10
+  flags:
+  - runtime
+- name: osd_debug_feed_pullee
+  type: int
+  level: dev
+  desc: Feed a pullee, and force primary to pull a currently missing object from it
+  default: -1
+  with_legacy: true
+- name: osd_backfill_scan_min
+  type: int
+  level: advanced
+  default: 64
+  fmt_desc: The minimum number of objects per backfill scan.
+  with_legacy: true
+- name: osd_backfill_scan_max
+  type: int
+  level: advanced
+  default: 512
+  fmt_desc: The maximum number of objects per backfill scan.
+  with_legacy: true
+- name: osd_extblkdev_plugins
+  type: str
+  level: advanced
+  desc: extended block device plugins to load, provide compression feedback at runtime
+  default: vdo
+  flags:
+  - startup
+# minimum number of peers
+- name: osd_heartbeat_min_peers
+  type: int
+  level: advanced
+  default: 10
+  with_legacy: true
+- name: osd_delete_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction. This setting
+    overrides _ssd, _hdd, and _hybrid if non-zero.
+  fmt_desc: Time in seconds to sleep before the next removal transaction. This
+    throttles the PG deletion process.
+  default: 0
+  flags:
+  - runtime
+- name: osd_delete_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction for HDDs
+  default: 5
+  flags:
+  - runtime
+- name: osd_delete_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction for SSDs
+  default: 1
+  flags:
+  - runtime
+- name: osd_delete_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
+    and OSD journal or WAL+DB is on SSD
+  default: 1
+  flags:
+  - runtime
+- name: osd_rocksdb_iterator_bounds_enabled
+  desc: Whether omap iterator bounds are applied to rocksdb iterator ReadOptions
+  type: bool
+  level: dev
+  default: true
+  with_legacy: true
diff --git a/src/common/options/rbd-mirror.yaml.in b/src/common/options/rbd-mirror.yaml.in
new file mode 100644
index 000000000..93c059ff2
--- /dev/null
+++ b/src/common/options/rbd-mirror.yaml.in
@@ -0,0 +1,210 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: rbd_mirror_journal_commit_age
+  type: float
+  level: advanced
+  desc: commit time interval, seconds
+  default: 5
+  services:
+  - rbd-mirror
+- name: rbd_mirror_journal_poll_age
+  type: float
+  level: advanced
+  desc: maximum age (in seconds) between successive journal polls
+  default: 5
+  services:
+  - rbd-mirror
+- name: rbd_mirror_sync_point_update_age
+  type: float
+  level: advanced
+  desc: number of seconds between each update of the image sync point object number
+  default: 30
+  services:
+  - rbd-mirror
+- name: rbd_mirror_concurrent_image_syncs
+  type: uint
+  level: advanced
+  desc: maximum number of image syncs in parallel
+  default: 5
+  services:
+  - rbd-mirror
+- name: rbd_mirror_pool_replayers_refresh_interval
+  type: uint
+  level: advanced
+  desc: interval to refresh peers in rbd-mirror daemon
+  default: 30
+  services:
+  - rbd-mirror
+- name: rbd_mirror_concurrent_image_deletions
+  type: uint
+  level: advanced
+  desc: maximum number of image deletions in parallel
+  default: 1
+  services:
+  - rbd-mirror
+  min: 1
+- name: rbd_mirror_delete_retry_interval
+  type: float
+  level: advanced
+  desc: interval to check and retry the failed deletion requests
+  default: 30
+  services:
+  - rbd-mirror
+- name: rbd_mirror_image_state_check_interval
+  type: uint
+  level: advanced
+  desc: interval to get images from pool watcher and set sources in replayer
+  default: 30
+  services:
+  - rbd-mirror
+  min: 1
+- name: rbd_mirror_leader_heartbeat_interval
+  type: uint
+  level: advanced
+  desc: interval (in seconds) between mirror leader heartbeats
+  default: 5
+  services:
+  - rbd-mirror
+  min: 1
+- name: rbd_mirror_leader_max_missed_heartbeats
+  type: uint
+  level: advanced
+  desc: number of missed heartbeats for non-lock owner to attempt to acquire lock
+  default: 2
+  services:
+  - rbd-mirror
+- name: rbd_mirror_leader_max_acquire_attempts_before_break
+  type: uint
+  level: advanced
+  desc: number of failed attempts to acquire lock after missing heartbeats before
+    breaking lock
+  default: 3
+  services:
+  - rbd-mirror
+- name: rbd_mirror_image_policy_type
+  type: str
+  level: advanced
+  desc: active/active policy type for mapping images to instances
+  default: simple
+  services:
+  - rbd-mirror
+  enum_values:
+  - none
+  - simple
+- name: rbd_mirror_image_policy_migration_throttle
+  type: uint
+  level: advanced
+  desc: number of seconds after which an image can be reshuffled (migrated) again
+  default: 300
+  services:
+  - rbd-mirror
+- name: rbd_mirror_image_policy_update_throttle_interval
+  type: float
+  level: advanced
+  desc: interval (in seconds) to throttle images for mirror daemon peer updates
+  default: 1
+  services:
+  - rbd-mirror
+  min: 1
+- name: rbd_mirror_image_policy_rebalance_timeout
+  type: float
+  level: advanced
+  desc: number of seconds policy should be idle before triggering reshuffle (rebalance)
+    of images
+  default: 0
+  services:
+  - rbd-mirror
+- name: rbd_mirror_perf_stats_prio
+  type: int
+  level: advanced
+  desc: Priority level for mirror daemon replication perf counters
+  long_desc: The daemon will send perf counter data to the manager daemon if the priority
+    is not lower than mgr_stats_threshold.
+  default: 5
+  services:
+  - rbd-mirror
+  min: 0
+  max: 11
+- name: rbd_mirror_image_perf_stats_prio
+  type: int
+  level: advanced
+  desc: Priority level for mirror daemon per-image replication perf counters
+  long_desc: The daemon will send per-image perf counter data to the manager daemon
+    if the priority is not lower than mgr_stats_threshold.
+  default: 5
+  services:
+  - rbd-mirror
+  min: 0
+  max: 11
+- name: rbd_mirror_memory_autotune
+  type: bool
+  level: dev
+  desc: Automatically tune the ratio of caches while respecting min values.
+  default: true
+  services:
+  - rbd-mirror
+  see_also:
+  - rbd_mirror_memory_target
+- name: rbd_mirror_memory_target
+  type: size
+  level: basic
+  desc: When tcmalloc and cache autotuning is enabled, try to keep this many bytes
+    mapped in memory.
+  default: 4_G
+  services:
+  - rbd-mirror
+  see_also:
+  - rbd_mirror_memory_autotune
+- name: rbd_mirror_memory_base
+  type: size
+  level: dev
+  desc: When tcmalloc and cache autotuning is enabled, estimate the minimum amount
+    of memory in bytes the rbd-mirror daemon will need.
+  default: 768_M
+  services:
+  - rbd-mirror
+  see_also:
+  - rbd_mirror_memory_autotune
+- name: rbd_mirror_memory_expected_fragmentation
+  type: float
+  level: dev
+  desc: When tcmalloc and cache autotuning is enabled, estimate the percent of memory
+    fragmentation.
+  default: 0.15
+  services:
+  - rbd-mirror
+  see_also:
+  - rbd_mirror_memory_autotune
+  min: 0
+  max: 1
+- name: rbd_mirror_memory_cache_min
+  type: size
+  level: dev
+  desc: When tcmalloc and cache autotuning is enabled, set the minimum amount of memory
+    used for cache.
+  default: 128_M
+  services:
+  - rbd-mirror
+  see_also:
+  - rbd_mirror_memory_autotune
+- name: rbd_mirror_memory_cache_resize_interval
+  type: float
+  level: dev
+  desc: When tcmalloc and cache autotuning is enabled, wait this many seconds between
+    resizing caches.
+  default: 5
+  services:
+  - rbd-mirror
+  see_also:
+  - rbd_mirror_memory_autotune
+- name: rbd_mirror_memory_cache_autotune_interval
+  type: float
+  level: dev
+  desc: The number of seconds to wait between rebalances when cache autotune is enabled.
+  default: 30
+  services:
+  - rbd-mirror
+  see_also:
+  - rbd_mirror_memory_autotune
diff --git a/src/common/options/rbd.yaml.in b/src/common/options/rbd.yaml.in
new file mode 100644
index 000000000..c2da27aaa
--- /dev/null
+++ b/src/common/options/rbd.yaml.in
@@ -0,0 +1,881 @@
+# -*- mode: YAML -*-
+---
+
+headers: |
+  #include <bit>
+  #include <regex>
+  // rbd feature and io operation validation
+  #include "include/stringify.h"
+  #include "common/strtol.h"
+  #include "librbd/Features.h"
+  #include "librbd/io/IoOperations.h"
+options:
+- name: rbd_default_pool
+  type: str
+  level: advanced
+  desc: default pool for storing new images
+  default: rbd
+  services:
+  - rbd
+  validator: |
+    [](std::string *value, std::string *error_message) {
+      std::regex pattern("^[^@/]+$");
+      if (!std::regex_match (*value, pattern)) {
+        *value = "rbd";
+        *error_message = "invalid RBD default pool, resetting to 'rbd'";
+      }
+      return 0;
+    }
+- name: rbd_default_data_pool
+  type: str
+  level: advanced
+  desc: default pool for storing data blocks for new images
+  services:
+  - rbd
+  validator: |
+    [](std::string *value, std::string *error_message) {
+      std::regex pattern("^[^@/]*$");
+      if (!std::regex_match (*value, pattern)) {
+        *value = "";
+        *error_message = "ignoring invalid RBD data pool";
+      }
+      return 0;
+    }
+- name: rbd_default_features
+  type: str
+  level: advanced
+  desc: default v2 image features for new images
+  long_desc: 'RBD features are only applicable for v2 images. This setting accepts
+    either an integer bitmask value or comma-delimited string of RBD feature names.
+    This setting is always internally stored as an integer bitmask value. The mapping
+    between feature bitmask value and feature name is as follows: +1 -> layering,
+    +2 -> striping, +4 -> exclusive-lock, +8 -> object-map, +16 -> fast-diff, +32
+    -> deep-flatten, +64 -> journaling, +128 -> data-pool'
+  default: layering,exclusive-lock,object-map,fast-diff,deep-flatten
+  services:
+  - rbd
+  flags:
+  - runtime
+  validator: |
+    [](std::string *value, std::string *error_message) {
+      std::stringstream ss;
+      uint64_t features = librbd::rbd_features_from_string(*value, &ss);
+      // Leave this in integer form to avoid breaking Cinder.  Someday
+      // we would like to present this in string form instead...
+      *value = stringify(features);
+      if (ss.str().size()) {
+        return -EINVAL;
+      }
+      return 0;
+    }
+- name: rbd_op_threads
+  type: uint
+  level: advanced
+  desc: number of threads to utilize for internal processing
+  default: 1
+  services:
+  - rbd
+- name: rbd_op_thread_timeout
+  type: uint
+  level: advanced
+  desc: time in seconds for detecting a hung thread
+  default: 60
+  services:
+  - rbd
+- name: rbd_disable_zero_copy_writes
+  type: bool
+  level: advanced
+  desc: Disable the use of zero-copy writes to ensure unstable writes from clients
+    cannot cause a CRC mismatch
+  default: true
+  services:
+  - rbd
+- name: rbd_non_blocking_aio
+  type: bool
+  level: advanced
+  desc: process AIO ops from a dispatch thread to prevent blocking
+  default: true
+  services:
+  - rbd
+- name: rbd_cache
+  type: bool
+  level: advanced
+  desc: whether to enable caching (writeback unless rbd_cache_max_dirty is 0)
+  fmt_desc: Enable caching for RADOS Block Device (RBD).
+  default: true
+  services:
+  - rbd
+- name: rbd_cache_policy
+  type: str
+  level: advanced
+  desc: cache policy for handling writes.
+  fmt_desc: Select the caching policy for librbd.
+  default: writearound
+  services:
+  - rbd
+  enum_values:
+  - writethrough
+  - writeback
+  - writearound
+- name: rbd_cache_writethrough_until_flush
+  type: bool
+  level: advanced
+  desc: whether to make writeback caching writethrough until flush is called, to be
+    sure the user of librbd will send flushes so that writeback is safe
+  fmt_desc: Start out in ``writethrough`` mode, and switch to ``writeback``
+    after the first flush request is received. Enabling is a
+    conservative but safe strategy in case VMs running on RBD volumes
+    are too old to send flushes, like the ``virtio`` driver in Linux
+    kernels older than 2.6.32.
+  default: true
+  services:
+  - rbd
+- name: rbd_cache_size
+  type: size
+  level: advanced
+  desc: cache size in bytes
+  fmt_desc: The per-volume RBD client cache size in bytes.
+  default: 32_M
+  policies: write-back and write-through
+  services:
+  - rbd
+- name: rbd_cache_max_dirty
+  type: size
+  level: advanced
+  desc: dirty limit in bytes - set to 0 for write-through caching
+  fmt_desc: The ``dirty`` limit in bytes at which the cache triggers write-back.
+    If ``0``, uses write-through caching.
+  default: 24_M
+  constraint: Must be less than ``rbd_cache_size``.
+  policies: write-around and write-back
+  services:
+  - rbd
+- name: rbd_cache_target_dirty
+  type: size
+  level: advanced
+  desc: target dirty limit in bytes
+  fmt_desc: The ``dirty target`` before the cache begins writing data to the data
+    storage. Does not block writes to the cache.
+  default: 16_M
+  constraint: Must be less than ``rbd_cache_max_dirty``.
+  policies: write-back
+  services:
+  - rbd
+- name: rbd_cache_max_dirty_age
+  type: float
+  level: advanced
+  desc: seconds in cache before writeback starts
+  fmt_desc: The number of seconds dirty data is in the cache before writeback starts.
+  default: 1
+  policies: write-back
+  services:
+  - rbd
+- name: rbd_cache_max_dirty_object
+  type: uint
+  level: advanced
+  desc: dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
+  default: 0
+  services:
+  - rbd
+- name: rbd_cache_block_writes_upfront
+  type: bool
+  level: advanced
+  desc: whether to block writes to the cache before the aio_write call completes
+  default: false
+  services:
+  - rbd
+- name: rbd_parent_cache_enabled
+  type: bool
+  level: advanced
+  desc: whether to enable rbd shared ro cache
+  default: false
+  services:
+  - rbd
+- name: rbd_concurrent_management_ops
+  type: uint
+  level: advanced
+  desc: how many operations can be in flight for a management operation like deleting
+    or resizing an image
+  default: 10
+  services:
+  - rbd
+  min: 1
+- name: rbd_balance_snap_reads
+  type: bool
+  level: advanced
+  desc: distribute snap read requests to random OSD
+  default: false
+  services:
+  - rbd
+  see_also:
+  - rbd_read_from_replica_policy
+- name: rbd_localize_snap_reads
+  type: bool
+  level: advanced
+  desc: localize snap read requests to closest OSD
+  default: false
+  services:
+  - rbd
+  see_also:
+  - rbd_read_from_replica_policy
+- name: rbd_balance_parent_reads
+  type: bool
+  level: advanced
+  desc: distribute parent read requests to random OSD
+  default: false
+  services:
+  - rbd
+  see_also:
+  - rbd_read_from_replica_policy
+- name: rbd_localize_parent_reads
+  type: bool
+  level: advanced
+  desc: localize parent requests to closest OSD
+  default: false
+  services:
+  - rbd
+  see_also:
+  - rbd_read_from_replica_policy
+- name: rbd_sparse_read_threshold_bytes
+  type: size
+  level: advanced
+  desc: threshold for issuing a sparse-read
+  long_desc: minimum number of sequential bytes to read against an object before issuing
+    a sparse-read request to the cluster. 0 implies it must be a full object read
+    to issue a sparse-read, 1 implies always use sparse-read, and any value larger
+    than the maximum object size will disable sparse-read for all requests
+  default: 64_K
+  services:
+  - rbd
+- name: rbd_readahead_trigger_requests
+  type: uint
+  level: advanced
+  desc: number of sequential requests necessary to trigger readahead
+  default: 10
+  services:
+  - rbd
+- name: rbd_readahead_max_bytes
+  type: size
+  level: advanced
+  desc: set to 0 to disable readahead
+  fmt_desc: Maximum size of a read-ahead request.  If zero, read-ahead is disabled.
+  default: 512_K
+  services:
+  - rbd
+- name: rbd_readahead_disable_after_bytes
+  type: size
+  level: advanced
+  desc: how many bytes are read in total before readahead is disabled
+  fmt_desc: After this many bytes have been read from an RBD image, read-ahead
+    is disabled for that image until it is closed.  This allows the
+    guest OS to take over read-ahead once it is booted.  If zero,
+    read-ahead stays enabled.
+  default: 50_M
+  services:
+  - rbd
+- name: rbd_clone_copy_on_read
+  type: bool
+  level: advanced
+  desc: copy-up parent image blocks to clone upon read request
+  default: false
+  services:
+  - rbd
+- name: rbd_blocklist_on_break_lock
+  type: bool
+  level: advanced
+  desc: whether to blocklist clients whose lock was broken
+  default: true
+  services:
+  - rbd
+- name: rbd_blocklist_expire_seconds
+  type: uint
+  level: advanced
+  desc: number of seconds to blocklist - set to 0 for OSD default
+  default: 0
+  services:
+  - rbd
+- name: rbd_request_timed_out_seconds
+  type: uint
+  level: advanced
+  desc: number of seconds before maintenance request times out
+  default: 30
+  services:
+  - rbd
+- name: rbd_skip_partial_discard
+  type: bool
+  level: advanced
+  desc: skip discard (zero) of unaligned extents within an object
+  default: true
+  services:
+  - rbd
+- name: rbd_discard_granularity_bytes
+  type: uint
+  level: advanced
+  desc: minimum aligned size of discard operations
+  default: 64_K
+  services:
+  - rbd
+  min: 4_K
+  max: 32_M
+  validator: |
+    [](std::string *value, std::string *error_message) {
+      uint64_t f = strict_si_cast<uint64_t>(*value, error_message);
+      if (!error_message->empty()) {
+        return -EINVAL;
+      } else if (!std::has_single_bit(f)) {
+        *error_message = "value must be a power of two";
+        return -EINVAL;
+      }
+      return 0;
+    }
+- name: rbd_enable_alloc_hint
+  type: bool
+  level: advanced
+  desc: when writing an object, issue a hint to the OSD backend to indicate the
+    expected object size
+  default: true
+  services:
+  - rbd
+- name: rbd_compression_hint
+  type: str
+  level: basic
+  desc: Compression hint to send to the OSDs during writes
+  fmt_desc: Hint to send to the OSDs on write operations. If set to
+    ``compressible`` and the OSD ``bluestore_compression_mode``
+    setting is ``passive``, the OSD will attempt to compress data.
+    If set to ``incompressible`` and the OSD compression setting
+    is ``aggressive``, the OSD will not attempt to compress data.
+  default: none
+  services:
+  - rbd
+  enum_values:
+  - none
+  - compressible
+  - incompressible
+  flags:
+  - runtime
+- name: rbd_read_from_replica_policy
+  type: str
+  level: basic
+  desc: Read replica policy sent to the OSDs during reads
+  fmt_desc: |
+    Policy for determining which OSD will receive read operations.
+    If set to ``default``, each PG's primary OSD will always be used
+    for read operations. If set to ``balance``, read operations will
+    be sent to a randomly selected OSD within the replica set. If set
+    to ``localize``, read operations will be sent to the closest OSD
+    as determined by the CRUSH map. Unlike ``rbd_balance_snap_reads``
+    and ``rbd_localize_snap_reads`` or ``rbd_balance_parent_reads`` and
+    ``rbd_localize_parent_reads``, it affects all read operations, not
+    just snap or parent. Note: this feature requires the cluster to
+    be configured with a minimum compatible OSD release of Octopus.
+  default: default
+  services:
+  - rbd
+  enum_values:
+  - default
+  - balance
+  - localize
+  flags:
+  - runtime
+- name: rbd_tracing
+  type: bool
+  level: advanced
+  desc: true if LTTng-UST tracepoints should be enabled
+  default: false
+  services:
+  - rbd
+- name: rbd_blkin_trace_all
+  type: bool
+  level: advanced
+  desc: create a blkin trace for all RBD requests
+  default: false
+  services:
+  - rbd
+- name: rbd_validate_pool
+  type: bool
+  level: advanced
+  desc: validate empty pools for RBD compatibility
+  default: true
+  services:
+  - rbd
+- name: rbd_validate_names
+  type: bool
+  level: advanced
+  desc: validate new image names for RBD compatibility
+  default: true
+  services:
+  - rbd
+- name: rbd_invalidate_object_map_on_timeout
+  type: bool
+  level: dev
+  desc: true if object map should be invalidated when a load or update times out
+  default: true
+  services:
+  - rbd
+- name: rbd_auto_exclusive_lock_until_manual_request
+  type: bool
+  level: advanced
+  desc: automatically acquire/release exclusive lock until it is explicitly requested
+  default: true
+  services:
+  - rbd
+- name: rbd_move_to_trash_on_remove
+  type: bool
+  level: basic
+  desc: automatically move images to the trash when deleted
+  default: false
+  services:
+  - rbd
+- name: rbd_move_to_trash_on_remove_expire_seconds
+  type: uint
+  level: basic
+  desc: default number of seconds to protect deleted images in the trash
+  default: 0
+  services:
+  - rbd
+- name: rbd_move_parent_to_trash_on_remove
+  type: bool
+  level: basic
+  desc: move parent with clone format v2 children to the trash when deleted
+  default: false
+  services:
+  - rbd
+- name: rbd_mirroring_resync_after_disconnect
+  type: bool
+  level: advanced
+  desc: automatically start image resync after mirroring is disconnected due to being
+    laggy
+  default: false
+  services:
+  - rbd
+- name: rbd_mirroring_delete_delay
+  type: uint
+  level: advanced
+  desc: time-delay in seconds for rbd-mirror delete propagation
+  default: 0
+  services:
+  - rbd
+- name: rbd_mirroring_replay_delay
+  type: uint
+  level: advanced
+  desc: time-delay in seconds for rbd-mirror asynchronous replication
+  default: 0
+  services:
+  - rbd
+- name: rbd_mirroring_max_mirroring_snapshots
+  type: uint
+  level: advanced
+  desc: mirroring snapshots limit
+  default: 5
+  services:
+  - rbd
+  min: 3
+- name: rbd_default_format
+  type: uint
+  level: advanced
+  desc: default image format for new images
+  default: 2
+  services:
+  - rbd
+- name: rbd_default_order
+  type: uint
+  level: advanced
+  desc: default order (data block object size) for new images
+  long_desc: This configures the default object size for new images. The value is used as a
+    power of two, meaning ``default_object_size = 2 ^ rbd_default_order``. Configure a value
+    between 12 and 25 (inclusive), translating to 4KiB lower and 32MiB upper limit.
+  default: 22
+  services:
+  - rbd
+- name: rbd_default_stripe_count
+  type: uint
+  level: advanced
+  desc: default stripe count for new images
+  default: 0
+  services:
+  - rbd
+- name: rbd_default_stripe_unit
+  type: size
+  level: advanced
+  desc: default stripe width for new images
+  default: 0
+  services:
+  - rbd
+- name: rbd_default_map_options
+  type: str
+  level: advanced
+  desc: default krbd map options
+  services:
+  - rbd
+- name: rbd_default_clone_format
+  type: str
+  level: advanced
+  desc: default internal format for handling clones
+  long_desc: This sets the internal format for tracking cloned images. The setting
+    of '1' requires attaching to protected snapshots that cannot be removed until
+    the clone is removed/flattened. The setting of '2' will allow clones to be attached
+    to any snapshot and permits removing in-use parent snapshots but requires Mimic
+    or later clients. The default setting of 'auto' will use the v2 format if the
+    cluster is configured to require mimic or later clients.
+  default: auto
+  services:
+  - rbd
+  enum_values:
+  - '1'
+  - '2'
+  - auto
+  flags:
+  - runtime
+- name: rbd_journal_order
+  type: uint
+  level: advanced
+  desc: default order (object size) for journal data objects
+  default: 24
+  services:
+  - rbd
+  min: 12
+  max: 26
+- name: rbd_journal_splay_width
+  type: uint
+  level: advanced
+  desc: number of active journal objects
+  default: 4
+  services:
+  - rbd
+- name: rbd_journal_commit_age
+  type: float
+  level: advanced
+  desc: commit time interval, seconds
+  default: 5
+  services:
+  - rbd
+- name: rbd_journal_object_writethrough_until_flush
+  type: bool
+  level: advanced
+  desc: when enabled, the rbd_journal_object_flush* configuration options are ignored
+    until the first flush so that batched journal IO is known to be safe for consistency
+  default: true
+  services:
+  - rbd
+- name: rbd_journal_object_flush_interval
+  type: uint
+  level: advanced
+  desc: maximum number of pending commits per journal object
+  default: 0
+  services:
+  - rbd
+- name: rbd_journal_object_flush_bytes
+  type: size
+  level: advanced
+  desc: maximum number of pending bytes per journal object
+  default: 1_M
+  services:
+  - rbd
+- name: rbd_journal_object_flush_age
+  type: float
+  level: advanced
+  desc: maximum age (in seconds) for pending commits
+  default: 0
+  services:
+  - rbd
+- name: rbd_journal_object_max_in_flight_appends
+  type: uint
+  level: advanced
+  desc: maximum number of in-flight appends per journal object
+  default: 0
+  services:
+  - rbd
+- name: rbd_journal_pool
+  type: str
+  level: advanced
+  desc: pool for journal objects
+  services:
+  - rbd
+- name: rbd_journal_max_payload_bytes
+  type: size
+  level: advanced
+  desc: maximum journal payload size before splitting
+  default: 16_K
+  services:
+  - rbd
+- name: rbd_journal_max_concurrent_object_sets
+  type: uint
+  level: advanced
+  desc: maximum number of object sets a journal client can be behind before it is
+    automatically unregistered
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_iops_limit
+  type: uint
+  level: advanced
+  desc: the desired limit of IO operations per second
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_bps_limit
+  type: uint
+  level: advanced
+  desc: the desired limit of IO bytes per second
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_read_iops_limit
+  type: uint
+  level: advanced
+  desc: the desired limit of read operations per second
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_write_iops_limit
+  type: uint
+  level: advanced
+  desc: the desired limit of write operations per second
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_read_bps_limit
+  type: uint
+  level: advanced
+  desc: the desired limit of read bytes per second
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_write_bps_limit
+  type: uint
+  level: advanced
+  desc: the desired limit of write bytes per second
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_iops_burst
+  type: uint
+  level: advanced
+  desc: the desired burst limit of IO operations
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_bps_burst
+  type: uint
+  level: advanced
+  desc: the desired burst limit of IO bytes
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_read_iops_burst
+  type: uint
+  level: advanced
+  desc: the desired burst limit of read operations
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_write_iops_burst
+  type: uint
+  level: advanced
+  desc: the desired burst limit of write operations
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_read_bps_burst
+  type: uint
+  level: advanced
+  desc: the desired burst limit of read bytes
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_write_bps_burst
+  type: uint
+  level: advanced
+  desc: the desired burst limit of write bytes
+  default: 0
+  services:
+  - rbd
+- name: rbd_qos_iops_burst_seconds
+  type: uint
+  level: advanced
+  desc: the desired burst duration in seconds of IO operations
+  default: 1
+  services:
+  - rbd
+  min: 1
+- name: rbd_qos_bps_burst_seconds
+  type: uint
+  level: advanced
+  desc: the desired burst duration in seconds of IO bytes
+  default: 1
+  services:
+  - rbd
+  min: 1
+- name: rbd_qos_read_iops_burst_seconds
+  type: uint
+  level: advanced
+  desc: the desired burst duration in seconds of read operations
+  default: 1
+  services:
+  - rbd
+  min: 1
+- name: rbd_qos_write_iops_burst_seconds
+  type: uint
+  level: advanced
+  desc: the desired burst duration in seconds of write operations
+  default: 1
+  services:
+  - rbd
+  min: 1
+- name: rbd_qos_read_bps_burst_seconds
+  type: uint
+  level: advanced
+  desc: the desired burst duration in seconds of read bytes
+  default: 1
+  services:
+  - rbd
+  min: 1
+- name: rbd_qos_write_bps_burst_seconds
+  type: uint
+  level: advanced
+  desc: the desired burst duration in seconds of write bytes
+  default: 1
+  services:
+  - rbd
+  min: 1
+- name: rbd_qos_schedule_tick_min
+  type: uint
+  level: advanced
+  desc: minimum schedule tick (in milliseconds) for QoS
+  long_desc: This determines the minimum time (in milliseconds) at which I/Os
+    can become unblocked if the limit of a throttle is hit. In terms of the
+    token bucket algorithm, this is the minimum interval at which tokens are
+    added to the bucket.
+  default: 50
+  services:
+  - rbd
+  min: 1
+- name: rbd_qos_exclude_ops
+  type: str
+  level: advanced
+  desc: optionally exclude ops from QoS
+  long_desc: 'Optionally exclude ops from QoS. This setting accepts either an integer
+    bitmask value or comma-delimited string of op names. This setting is always internally
+    stored as an integer bitmask value. The mapping between op bitmask value and op
+    name is as follows: +1 -> read, +2 -> write, +4 -> discard, +8 -> write_same,
+    +16 -> compare_and_write'
+  services:
+  - rbd
+  flags:
+  - runtime
+  validator: |
+    [](std::string *value, std::string *error_message) {
+        std::ostringstream ss;
+        uint64_t exclude_ops = librbd::io::rbd_io_operations_from_string(*value, &ss);
+        // Leave this in integer form to avoid breaking Cinder.  Someday
+        // we would like to present this in string form instead...
+        *value = stringify(exclude_ops);
+        if (ss.str().size()) {
+          return -EINVAL;
+        }
+        return 0;
+    }
+- name: rbd_discard_on_zeroed_write_same
+  type: bool
+  level: advanced
+  desc: discard data on zeroed write same instead of writing zero
+  default: true
+  services:
+  - rbd
+- name: rbd_mtime_update_interval
+  type: uint
+  level: advanced
+  desc: RBD Image modify timestamp refresh interval. Set to 0 to disable modify timestamp
+    update.
+  default: 60
+  services:
+  - rbd
+  min: 0
+- name: rbd_atime_update_interval
+  type: uint
+  level: advanced
+  desc: RBD Image access timestamp refresh interval. Set to 0 to disable access timestamp
+    update.
+  default: 60
+  services:
+  - rbd
+  min: 0
+- name: rbd_io_scheduler
+  type: str
+  level: advanced
+  desc: RBD IO scheduler
+  default: simple
+  services:
+  - rbd
+  enum_values:
+  - none
+  - simple
+- name: rbd_io_scheduler_simple_max_delay
+  type: uint
+  level: advanced
+  desc: maximum io delay (in milliseconds) for simple io scheduler (if set to 0 delay
+    is calculated based on latency stats)
+  default: 0
+  services:
+  - rbd
+  min: 0
+- name: rbd_persistent_cache_mode
+  type: str
+  level: advanced
+  desc: enable persistent write back cache for this volume
+  default: disabled
+  services:
+  - rbd
+  enum_values:
+  - disabled
+  - rwl
+  - ssd
+- name: rbd_persistent_cache_size
+  type: uint
+  level: advanced
+  desc: size of the persistent write back cache for this volume
+  default: 1_G
+  services:
+  - rbd
+  min: 1_G
+- name: rbd_persistent_cache_path
+  type: str
+  level: advanced
+  desc: location of the persistent write back cache in a DAX-enabled filesystem on
+    persistent memory
+  default: /tmp
+  services:
+  - rbd
+- name: rbd_quiesce_notification_attempts
+  type: uint
+  level: dev
+  desc: the number of quiesce notification attempts
+  default: 10
+  services:
+  - rbd
+  min: 1
+- name: rbd_default_snapshot_quiesce_mode
+  type: str
+  level: advanced
+  desc: default snapshot quiesce mode
+  default: required
+  services:
+  - rbd
+  enum_values:
+  - required
+  - ignore-error
+  - skip
+- name: rbd_plugins
+  type: str
+  level: advanced
+  desc: comma-delimited list of librbd plugins to enable
+  services:
+  - rbd
+- name: rbd_config_pool_override_update_timestamp
+  type: uint
+  level: dev
+  desc: timestamp of last update to pool-level config overrides
+  default: 0
+  services:
+  - rbd
diff --git a/src/common/options/rgw.yaml.in b/src/common/options/rgw.yaml.in
new file mode 100644
index 000000000..241632a22
--- /dev/null
+++ b/src/common/options/rgw.yaml.in
@@ -0,0 +1,3770 @@
+# -*- mode: YAML -*-
+---
+
+options:
+# According to AWS S3 (http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html),
+# an ACL may have up to 100 grants.
+- name: rgw_acl_grants_max_num
+  type: int
+  level: advanced
+  desc: The maximum number of ACL grants in a single request.
+  default: 100
+  services:
+  - rgw
+  with_legacy: true
+# A user may have up to 100 IAM user policies.
+- name: rgw_user_policies_max_num
+  type: int
+  level: advanced
+  desc: The maximum number of IAM user policies for a single user.
+  default: 100
+  services:
+  - rgw
+  with_legacy: true
+# According to AWS S3 (http://docs.aws.amazon.com/AmazonS3/latest/dev/cors.html),
+# A CORS request may have up to 100 rules.
+- name: rgw_cors_rules_max_num
+  type: int
+  level: advanced
+  desc: The maximum number of CORS rules in a single request.
+  default: 100
+  services:
+  - rgw
+  with_legacy: true
+# According to AWS S3 (https://docs.aws.amazon.com/AmazonS3/latest/dev/DeletingObjects.html),
+# Amazon S3 also provides the Multi-Object Delete API that you can use to delete up to 1000
+# objects in a single HTTP request.
+- name: rgw_delete_multi_obj_max_num
+  type: int
+  level: advanced
+  desc: The maximum number of objects in a single multi-object delete request.
+  default: 1000
+  services:
+  - rgw
+  with_legacy: true
+# According to AWS S3, a website routing config can have up to 50 rules.
+- name: rgw_website_routing_rules_max_num
+  type: int
+  level: advanced
+  desc: The maximum number of website routing rules in a single request.
+  default: 50
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_rados_tracing
+  type: bool
+  level: advanced
+  desc: Enables LTTng-UST tracepoints.
+  default: false
+  services:
+  - rgw
+- name: rgw_op_tracing
+  type: bool
+  level: advanced
+  desc: Enables LTTng-UST operator tracepoints.
+  default: false
+  services:
+  - rgw
+- name: rgw_max_chunk_size
+  type: size
+  level: advanced
+  desc: The maximum RGW chunk size.
+  long_desc: The chunk size is the size of RADOS I/O requests that RGW sends when
+    accessing data objects. RGW read and write operations will never request more than
+    this amount in a single request. This also defines the RGW head object size, as
+    head operations need to be atomic, and anything larger than this would require
+    more than a single operation. When RGW objects are written to the default
+    storage class, up to this amount of payload data will be stored alongside
+    metadata in the head object.
+  default: 4_M
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_put_obj_min_window_size
+  type: size
+  level: advanced
+  desc: The minimum RADOS write window size (in bytes).
+  long_desc: The window size determines the total concurrent RADOS writes of a single
+    RGW object. When writing an object RGW will send multiple chunks to RADOS. The
+    total size of the writes does not exceed the window size. The window size may
+    be adjusted dynamically in order to better utilize the pipe.
+  default: 16_M
+  services:
+  - rgw
+  see_also:
+  - rgw_put_obj_max_window_size
+  - rgw_max_chunk_size
+  with_legacy: true
+- name: rgw_put_obj_max_window_size
+  type: size
+  level: advanced
+  desc: The maximum RADOS write window size (in bytes).
+  long_desc: The window size may be dynamically adjusted, but will not surpass this
+    value.
+  default: 64_M
+  services:
+  - rgw
+  see_also:
+  - rgw_put_obj_min_window_size
+  - rgw_max_chunk_size
+  with_legacy: true
+- name: rgw_max_put_size
+  type: size
+  level: advanced
+  desc: The maximum size (in bytes) of regular (non multi-part) object upload.
+  long_desc: Plain object upload is capped at this amount of data. In order to upload
+    larger objects, a special upload mechanism is required. The S3 API provides the
+    multi-part upload, and Swift provides DLO and SLO.
+  default: 5_G
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_max_put_param_size
+  type: size
+  level: advanced
+  desc: The maximum size (in bytes) of data input of certain RESTful requests.
+  default: 1_M
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_max_attr_size
+  type: size
+  level: advanced
+  desc: The maximum length of metadata value. 0 skips the check
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_max_attr_name_len
+  type: size
+  level: advanced
+  desc: The maximum length of metadata name. 0 skips the check
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_max_attrs_num_in_req
+  type: uint
+  level: advanced
+  desc: The maximum number of metadata items that can be put via single request
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+# override max bucket index shards in zone configuration (if not zero)
+#
+# Represents the number of shards for the bucket index object, a value of zero
+# indicates there is no sharding. By default (no sharding), the name of the object
+# is '.dir.{marker}'; with sharding, the name is '.dir.{marker}.{sharding_id}', where
+# sharding_id is a zero-based value. It is not recommended to set too large a value
+# (e.g. thousand) as it increases the cost for bucket listing.
+- name: rgw_override_bucket_index_max_shards
+  type: uint
+  level: dev
+  desc: The default number of bucket index shards for newly-created buckets. This
+    value overrides bucket_index_max_shards stored in the zone. Setting this value
+    in the zone is preferred, because it applies globally to all radosgw daemons running
+    in the zone.
+  fmt_desc: Represents the number of shards for the bucket index object,
+    a value of zero indicates there is no sharding. It is not
+    recommended to set a value too large (e.g. thousand) as it
+    increases the cost for bucket listing.
+    This variable should be set in the client or global sections
+    so that it is automatically applied to radosgw-admin commands.
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+# Represents the maximum AIO pending requests for the bucket index object shards.
+- name: rgw_bucket_index_max_aio
+  type: uint
+  level: advanced
+  desc: Max number of concurrent RADOS requests when handling bucket shards.
+  default: 128
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_multi_obj_del_max_aio
+  type: uint
+  level: advanced
+  desc: Max number of concurrent RADOS requests per multi-object delete request.
+  default: 16
+  services:
+  - rgw
+  with_legacy: true
+# whether or not the quota/gc threads should be started
+- name: rgw_enable_quota_threads
+  type: bool
+  level: advanced
+  desc: Enables the quota maintenance thread.
+  long_desc: The quota maintenance thread is responsible for quota related maintenance
+    work. The thread itself can be disabled, but in order for quota to work correctly,
+    at least one RGW in each zone needs to have this thread running. Having the thread
+    enabled on multiple RGW processes within the same zone can spread some of the
+    maintenance work between them.
+  default: true
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_gc_threads
+  - rgw_enable_lc_threads
+  with_legacy: true
+- name: rgw_enable_gc_threads
+  type: bool
+  level: advanced
+  desc: Enables the garbage collection maintenance thread.
+  long_desc: The garbage collection maintenance thread is responsible for garbage
+    collector maintenance work. The thread itself can be disabled, but in order for
+    garbage collection to work correctly, at least one RGW in each zone needs to have
+    this thread running.  Having the thread enabled on multiple RGW processes within
+    the same zone can spread some of the maintenance work between them.
+  default: true
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_quota_threads
+  - rgw_enable_lc_threads
+  with_legacy: true
+- name: rgw_enable_lc_threads
+  type: bool
+  level: advanced
+  desc: Enables the lifecycle maintenance thread. This is required on at least one
+    rgw for each zone.
+  long_desc: The lifecycle maintenance thread is responsible for lifecycle related
+    maintenance work. The thread itself can be disabled, but in order for lifecycle
+    to work correctly, at least one RGW in each zone needs to have this thread running.
+    Having the thread enabled on multiple RGW processes within the same zone can spread
+    some of the maintenance work between them.
+  default: true
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_gc_threads
+  - rgw_enable_quota_threads
+  with_legacy: true
+- name: rgw_data
+  type: str
+  level: advanced
+  desc: Alternative location for RGW configuration.
+  long_desc: If this is set, the different Ceph system configurables (such as the keyring file) will be located in the path that is specified here.
+  fmt_desc: Sets the location of the data files for Ceph RADOS Gateway.
+  default: /var/lib/ceph/radosgw/$cluster-$id
+  services:
+  - rgw
+  flags:
+  - no_mon_update
+  with_legacy: true
+- name: rgw_enable_apis
+  type: str
+  level: advanced
+  desc: The list of RESTful APIs that rgw handles.
+  fmt_desc: |
+    Enables the specified APIs.
+
+      .. note:: Enabling the ``s3`` API is a requirement for
+                any ``radosgw`` instance that is meant to
+                participate in a `multi-site <../multisite>`_
+                configuration.
+  default: s3, s3website, swift, swift_auth, admin, sts, iam, notifications
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_cache_enabled
+  type: bool
+  level: advanced
+  desc: Enable RGW metadata cache.
+  long_desc: The metadata cache holds metadata entries that RGW requires for processing
+    requests. Metadata entries can be user info, bucket info, and bucket instance
+    info. If not found in the cache, entries will be fetched from the backing RADOS
+    store.
+  fmt_desc: Whether the Ceph Object Gateway cache is enabled.
+  default: true
+  services:
+  - rgw
+  see_also:
+  - rgw_cache_lru_size
+  with_legacy: true
+- name: rgw_cache_lru_size
+  type: int
+  level: advanced
+  desc: Max number of items in RGW metadata cache.
+  long_desc: When full, the RGW metadata cache evicts least recently used entries.
+  fmt_desc: The number of entries in the Ceph Object Gateway cache.
+  default: 10000
+  services:
+  - rgw
+  see_also:
+  - rgw_cache_enabled
+  with_legacy: true
+- name: rgw_dns_name
+  type: str
+  level: advanced
+  desc: The host names that RGW uses.
+  long_desc: A comma separated list of DNS names.
+    This is needed for virtual hosting of buckets to work properly, unless
+    configured via zonegroup configuration.
+  fmt_desc: The DNS names of the served domains. See also the ``hostnames`` setting within zonegroups.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_dns_s3website_name
+  type: str
+  level: advanced
+  desc: The host name that RGW uses for static websites (S3)
+  long_desc: This is needed for virtual hosting of buckets, unless configured via
+    zonegroup configuration.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_numa_node
+  type: int
+  level: advanced
+  desc: set rgw's cpu affinity to a numa node (-1 for none)
+  default: -1
+  services:
+  - rgw
+  flags:
+  - startup
+- name: rgw_service_provider_name
+  type: str
+  level: advanced
+  desc: Service provider name which is contained in http response headers
+  long_desc: As S3 or other cloud storage providers do, http response headers should
+    contain the name of the provider. This name will be placed in http header 'Server'.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_content_length_compat
+  type: bool
+  level: advanced
+  desc: Multiple content length headers compatibility
+  long_desc: Try to handle requests with ambiguous multiple content length headers
+    (Content-Length, Http-Content-Length).
+  fmt_desc: Enable compatibility handling of FCGI requests with both ``CONTENT_LENGTH``
+    and ``HTTP_CONTENT_LENGTH`` set.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_relaxed_region_enforcement
+  type: bool
+  level: advanced
+  desc: Disable region constraint enforcement
+  long_desc: Enable requests such as bucket creation to succeed irrespective of region
+    restrictions (Jewel compat).
+  default: false
+  services:
+  - rgw
+- name: rgw_lifecycle_work_time
+  type: str
+  level: advanced
+  desc: Lifecycle allowed work time
+  long_desc: Local time window in which the lifecycle maintenance thread can work.
+  default: 00:00-06:00
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_lc_lock_max_time
+  type: int
+  level: dev
+  default: 90
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_lc_thread_delay
+  type: int
+  level: advanced
+  desc: Delay after processing of bucket listing chunks (i.e., per 1000 entries) in
+    milliseconds
+  default: 0
+  services:
+  - rgw
+- name: rgw_lc_max_worker
+  type: int
+  level: advanced
+  desc: Number of LCWorker tasks that will be run in parallel
+  long_desc: Number of LCWorker tasks that will run in parallel--used to permit >1
+    bucket/index shards to be processed simultaneously
+  fmt_desc: This option specifies the number of lifecycle worker threads
+    to run in parallel, thereby processing bucket and index
+    shards simultaneously.
+  default: 3
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_lc_max_wp_worker
+  type: int
+  level: advanced
+  desc: Number of workpool threads per LCWorker
+  long_desc: Number of threads in per-LCWorker workpools--used to accelerate per-bucket
+    processing
+  fmt_desc: This option specifies the number of threads in each lifecycle
+    worker's work pool. This option can help accelerate processing each bucket.
+  default: 3
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_lc_max_objs
+  type: int
+  level: advanced
+  desc: Number of lifecycle data shards
+  long_desc: Number of RADOS objects to use for storing lifecycle index. This affects
+    concurrency of lifecycle maintenance, as shards can be processed in parallel.
+  default: 32
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_lc_max_rules
+  type: uint
+  level: advanced
+  desc: Max number of lifecycle rules set on one bucket
+  long_desc: Number of lifecycle rules set on one bucket should be limited.
+  default: 1000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_lc_debug_interval
+  type: int
+  level: dev
+  desc: The number of seconds that simulate one "day" in order to debug RGW LifeCycle.
+    Do *not* modify for a production cluster.
+  long_desc: For debugging RGW LifeCycle, the number of seconds that are equivalent to
+    one simulated "day". Values less than 1 are ignored and do not change LifeCycle behavior.
+    For example, during debugging if one wanted every 10 minutes to be equivalent to one day,
+    then this would be set to 600, the number of seconds in 10 minutes.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_mp_lock_max_time
+  type: int
+  level: advanced
+  desc: Multipart upload max completion time
+  long_desc: Time length to allow completion of a multipart upload operation. This
+    is done to prevent concurrent completions on the same object with the same upload
+    id.
+  default: 10_min
+  services:
+  - rgw
+- name: rgw_script_uri
+  type: str
+  level: dev
+  fmt_desc: The alternative value for the ``SCRIPT_URI`` if not set
+    in the request.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_request_uri
+  type: str
+  level: dev
+  fmt_desc: The alternative value for the ``REQUEST_URI`` if not set
+    in the request.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_ignore_get_invalid_range
+  type: bool
+  level: advanced
+  desc: Treat invalid (e.g., negative) range request as full
+  long_desc: Treat invalid (e.g., negative) range request as request for the full
+    object (AWS compatibility)
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_swift_url
+  type: str
+  level: advanced
+  desc: Swift-auth storage URL
+  long_desc: Used in conjunction with rgw internal swift authentication. This affects
+    the X-Storage-Url response header value.
+  fmt_desc: The URL for the Ceph Object Gateway Swift API.
+  services:
+  - rgw
+  see_also:
+  - rgw_swift_auth_entry
+  with_legacy: true
+- name: rgw_swift_url_prefix
+  type: str
+  level: advanced
+  desc: Swift URL prefix
+  long_desc: The URL path prefix for swift requests.
+  fmt_desc: |
+    The URL prefix for the Swift API, to distinguish it from
+    the S3 API endpoint. The default is ``swift``, which
+    makes the Swift API available at the URL
+    ``http://host:port/swift/v1`` (or
+    ``http://host:port/swift/v1/AUTH_%(tenant_id)s`` if
+    ``rgw swift account in url`` is enabled).
+
+    For compatibility, setting this configuration variable
+    to the empty string causes the default ``swift`` to be
+    used; if you do want an empty prefix, set this option to
+    ``/``.
+
+    .. warning:: If you set this option to ``/``, you must
+                 disable the S3 API by modifying ``rgw
+                 enable apis`` to exclude ``s3``. It is not
+                 possible to operate radosgw with ``rgw
+                 swift url prefix = /`` and simultaneously
+                 support both the S3 and Swift APIs. If you
+                 do need to support both APIs without
+                 prefixes, deploy multiple radosgw instances
+                 to listen on different hosts (or ports)
+                 instead, enabling some for S3 and some for
+                 Swift.
+  example: /swift-testing
+  default: swift
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_swift_auth_url
+  type: str
+  level: advanced
+  desc: Swift auth URL
+  long_desc: Default url to which RGW connects and verifies tokens for v1 auth (if
+    not using internal swift auth).
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_swift_auth_entry
+  type: str
+  level: advanced
+  desc: Swift auth URL prefix
+  long_desc: URL path prefix for internal swift auth requests.
+  fmt_desc: The entry point for a Swift auth URL.
+  default: auth
+  services:
+  - rgw
+  see_also:
+  - rgw_swift_url
+  with_legacy: true
+- name: rgw_swift_tenant_name
+  type: str
+  level: advanced
+  desc: Swift tenant name
+  long_desc: Tenant name that is used when constructing the swift path.
+  services:
+  - rgw
+  see_also:
+  - rgw_swift_account_in_url
+  with_legacy: true
+- name: rgw_swift_account_in_url
+  type: bool
+  level: advanced
+  desc: Swift account encoded in URL
+  long_desc: Whether the swift account is encoded in the uri path (AUTH_<account>).
+  fmt_desc: |
+    Whether or not the Swift account name should be included
+    in the Swift API URL.
+    If set to ``false`` (the default), then the Swift API
+    will listen on a URL formed like
+    ``http://host:port/<rgw_swift_url_prefix>/v1``, and the
+    account name (commonly a Keystone project UUID if
+    radosgw is configured with `Keystone integration
+    <../keystone>`_) will be inferred from request
+    headers.
+    If set to ``true``, the Swift API URL will be
+    ``http://host:port/<rgw_swift_url_prefix>/v1/AUTH_<account_name>``
+    (or
+    ``http://host:port/<rgw_swift_url_prefix>/v1/AUTH_<keystone_project_id>``)
+    instead, and the Keystone ``object-store`` endpoint must
+    accordingly be configured to include the
+    ``AUTH_%(tenant_id)s`` suffix.
+    You **must** set this option to ``true`` (and update the
+    Keystone service catalog) if you want radosgw to support
+    publicly-readable containers and `temporary URLs
+    <../swift/tempurl>`_.
+  default: false
+  services:
+  - rgw
+  see_also:
+  - rgw_swift_tenant_name
+  with_legacy: true
+- name: rgw_swift_enforce_content_length
+  type: bool
+  level: advanced
+  desc: Send content length when listing containers (Swift)
+  long_desc: Whether content length header is needed when listing containers. When
+    this is set to false, RGW will send extra info for each entry in the response.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_url
+  type: str
+  level: basic
+  desc: The URL to the Keystone server.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_admin_token
+  type: str
+  level: advanced
+  desc: 'DEPRECATED: The admin token (shared secret) that is used for the Keystone
+    requests.'
+  fmt_desc: The Keystone admin token (shared secret). In Ceph RGW
+    authentication with the admin token has priority over
+    authentication with the admin credentials
+    (``rgw_keystone_admin_user``, ``rgw_keystone_admin_password``,
+    ``rgw_keystone_admin_tenant``, ``rgw_keystone_admin_project``,
+    ``rgw_keystone_admin_domain``). The Keystone admin token
+    has been deprecated, but can be used to integrate with
+    older environments.  It is preferred to instead configure
+    ``rgw_keystone_admin_token_path`` to avoid exposing the token.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_admin_token_path
+  type: str
+  level: advanced
+  desc: Path to a file containing the admin token (shared secret) that is used for
+    the Keystone requests.
+  fmt_desc: Path to a file containing the Keystone admin token
+    (shared secret).  In Ceph RadosGW authentication with
+    the admin token has priority over authentication with
+    the admin credentials
+    (``rgw_keystone_admin_user``, ``rgw_keystone_admin_password``,
+    ``rgw_keystone_admin_tenant``, ``rgw_keystone_admin_project``,
+    ``rgw_keystone_admin_domain``).
+    The Keystone admin token has been deprecated, but can be
+    used to integrate with older environments.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_admin_user
+  type: str
+  level: advanced
+  desc: Keystone admin user.
+  fmt_desc: The name of OpenStack user with admin privilege for Keystone
+    authentication (Service User) when using OpenStack Identity API v2
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_admin_password
+  type: str
+  level: advanced
+  desc: 'DEPRECATED: Keystone admin password.'
+  fmt_desc: The password for OpenStack admin user when using OpenStack
+    Identity API v2.  It is preferred to instead configure
+    ``rgw_keystone_admin_password_path`` to avoid exposing the password.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_admin_password_path
+  type: str
+  level: advanced
+  desc: Path to a file containing the Keystone admin password.
+  fmt_desc: Path to a file containing the password for OpenStack
+    admin user when using OpenStack Identity API v2.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_admin_tenant
+  type: str
+  level: advanced
+  desc: Keystone admin user tenant.
+  fmt_desc: The name of OpenStack tenant with admin privilege (Service Tenant) when
+    using OpenStack Identity API v2
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_admin_project
+  type: str
+  level: advanced
+  desc: Keystone admin user project (for Keystone v3).
+  fmt_desc: The name of OpenStack project with admin privilege when using
+    OpenStack Identity API v3. If left unspecified, value of
+    ``rgw keystone admin tenant`` will be used instead.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_admin_domain
+  type: str
+  level: advanced
+  desc: Keystone admin user domain (for Keystone v3).
+  fmt_desc: The name of OpenStack domain with admin privilege when using
+    OpenStack Identity API v3.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_service_token_enabled
+  type: bool
+  level: advanced
+  desc: Service tokens allowing the usage of expired Keystone auth tokens
+  fmt_desc: The service token support allows the incoming request to contain
+    a X-Service-Token header with a Keystone token that if it has acceptable
+    roles allows using an expired token in the X-Auth-Token header.
+  default: false
+  see_also:
+  - rgw_keystone_service_token_accepted_roles
+  - rgw_keystone_expired_token_cache_expiration
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_service_token_accepted_roles
+  type: str
+  level: advanced
+  desc: Only users with one of these roles will be valid for service users.
+  fmt_desc: The users that created the service token given must have one of
+    these roles to be considered a valid service user.
+  default: admin
+  see_also:
+  - rgw_keystone_service_token_enabled
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_expired_token_cache_expiration
+  type: int
+  level: advanced
+  desc: The number of seconds to add to current time for expired token expiration
+  fmt_desc: The expired token that is allowed when a valid service token is given
+    needs a new expiration date for the caching. This is the number of seconds to add to the
+    current time and then set on an expired token that is verified with a service token.
+  default: 3600
+  services:
+  - rgw
+  see_also:
+  - rgw_keystone_service_token_enabled
+  with_legacy: true
+- name: rgw_keystone_barbican_user
+  type: str
+  level: advanced
+  desc: Keystone user to access barbican secrets.
+  fmt_desc: The name of the OpenStack user with access to the `Barbican`_
+    secrets used for `Encryption`_.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_barbican_password
+  type: str
+  level: advanced
+  desc: Keystone password for barbican user.
+  fmt_desc: The password associated with the `Barbican`_ user.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_barbican_tenant
+  type: str
+  level: advanced
+  desc: Keystone barbican user tenant (Keystone v2.0).
+  fmt_desc: The name of the OpenStack tenant associated with the `Barbican`_
+    user when using OpenStack Identity API v2.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_barbican_project
+  type: str
+  level: advanced
+  desc: Keystone barbican user project (Keystone v3).
+  fmt_desc: The name of the OpenStack project associated with the `Barbican`_
+    user when using OpenStack Identity API v3.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_barbican_domain
+  type: str
+  level: advanced
+  desc: Keystone barbican user domain.
+  fmt_desc: The name of the OpenStack domain associated with the `Barbican`_
+    user when using OpenStack Identity API v3.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_api_version
+  type: int
+  level: advanced
+  desc: Version of Keystone API to use (2 or 3).
+  fmt_desc: The version (2 or 3) of OpenStack Identity API that should be
+    used for communication with the Keystone server.
+  default: 2
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_accepted_roles
+  type: str
+  level: advanced
+  desc: Only users with one of these roles will be served when doing Keystone authentication.
+  fmt_desc: The roles required to serve requests.
+  default: Member, admin
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_accepted_admin_roles
+  type: str
+  level: advanced
+  desc: List of roles allowing user to gain admin privileges (Keystone).
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_token_cache_size
+  type: int
+  level: advanced
+  desc: Keystone token cache size
+  long_desc: Max number of Keystone tokens that will be cached. Token that is not
+    cached requires RGW to access the Keystone server when authenticating.
+  fmt_desc: The maximum number of entries in each Keystone token cache.
+  default: 10000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_verify_ssl
+  type: bool
+  level: advanced
+  desc: Should RGW verify the Keystone server SSL certificate.
+  fmt_desc: Verify SSL certificates while making token requests to keystone.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_keystone_implicit_tenants
+  type: str
+  level: advanced
+  desc: RGW Keystone implicit tenants creation
+  long_desc: Implicitly create new users in their own tenant with the same name when
+    authenticating via Keystone.  Can be limited to s3 or swift only.
+  default: 'false'
+  services:
+  - rgw
+  enum_values:
+  - 'false'
+  - 'true'
+  - swift
+  - s3
+  - both
+  - '0'
+  - '1'
+  - none
+  with_legacy: true
+- name: rgw_cross_domain_policy
+  type: str
+  level: advanced
+  desc: RGW handle cross domain policy
+  long_desc: Returned cross domain policy when accessing the crossdomain.xml resource
+    (Swift compatibility).
+  default: <allow-access-from domain="*" secure="false" />
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_healthcheck_disabling_path
+  type: str
+  level: dev
+  desc: Swift health check api can be disabled if a file can be accessed in this path.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_s3_auth_use_rados
+  type: bool
+  level: advanced
+  desc: Should S3 authentication use credentials stored in RADOS backend.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_s3_auth_use_keystone
+  type: bool
+  level: advanced
+  desc: Should S3 authentication use Keystone.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_s3_auth_order
+  type: str
+  level: advanced
+  desc: Authentication strategy order to use for s3 authentication
+  long_desc: Order of authentication strategies to try for s3 authentication, the
+    allowed options are a comma separated list of engines external, local. The default
+    order is to try all the externally configured engines before attempting local
+    rados based authentication
+  default: sts, external, local
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_barbican_url
+  type: str
+  level: advanced
+  desc: URL to barbican server.
+  fmt_desc: The URL for the Barbican server.
+  services:
+  - rgw
+  with_legacy: true
+# OpenLDAP-style LDAP parameter strings
+- name: rgw_ldap_uri
+  type: str
+  level: advanced
+  desc: Space-separated list of LDAP servers in URI format.
+  default: ldaps://<ldap.your.domain>
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_ldap_binddn
+  type: str
+  level: advanced
+  desc: LDAP entry RGW will bind with (user match).
+  default: uid=admin,cn=users,dc=example,dc=com
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_ldap_searchdn
+  type: str
+  level: advanced
+  desc: LDAP search base (basedn).
+  default: cn=users,cn=accounts,dc=example,dc=com
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_ldap_dnattr
+  type: str
+  level: advanced
+  desc: LDAP attribute containing RGW user names (to form binddns).
+  default: uid
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_ldap_secret
+  type: str
+  level: advanced
+  desc: Path to file containing credentials for rgw_ldap_binddn.
+  default: /etc/openldap/secret
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_s3_auth_use_ldap
+  type: bool
+  level: advanced
+  desc: Should S3 authentication use LDAP.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_ldap_searchfilter
+  type: str
+  level: advanced
+  desc: LDAP search filter.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_opa_url
+  type: str
+  level: advanced
+  desc: URL to OPA server.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_opa_token
+  type: str
+  level: advanced
+  desc: The Bearer token OPA uses to authenticate client requests.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_opa_verify_ssl
+  type: bool
+  level: advanced
+  desc: Should RGW verify the OPA server SSL certificate.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_use_opa_authz
+  type: bool
+  level: advanced
+  desc: Should OPA be used to authorize client requests.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_admin_entry
+  type: str
+  level: advanced
+  desc: Path prefix to be used for accessing RGW RESTful admin API.
+  fmt_desc: The entry point for an admin request URL.
+  default: admin
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_enforce_swift_acls
+  type: bool
+  level: advanced
+  desc: RGW enforce swift acls
+  long_desc: Should RGW enforce special Swift-only ACLs. Swift has a special ACL that
+    gives permission to access all objects in a container.
+  fmt_desc: Enforces the Swift Access Control List (ACL) settings.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_swift_token_expiration
+  type: int
+  level: advanced
+  desc: Expiration time (in seconds) for token generated through RGW Swift auth.
+  fmt_desc: The time in seconds for expiring a Swift token.
+  default: 1_day
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_print_continue
+  type: bool
+  level: advanced
+  desc: RGW support of 100-continue
+  long_desc: Should RGW explicitly send 100 (continue) responses. This is mainly relevant
+    when using FastCGI, as some FastCGI modules do not fully support this feature.
+  fmt_desc: Enable ``100-continue`` if it is operational.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_print_prohibited_content_length
+  type: bool
+  level: advanced
+  desc: RGW RFC-7230 compatibility
+  long_desc: Specifies whether RGW violates RFC 7230 and sends Content-Length with
+    204 or 304 statuses.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_remote_addr_param
+  type: str
+  level: advanced
+  desc: HTTP header that holds the remote address in incoming requests.
+  long_desc: RGW will use this header to extract requests origin. When RGW runs behind
+    a reverse proxy, the remote address header will point at the proxy's address and
+    not at the originator's address. Therefore it is sometimes possible to have the
+    proxy add the originator's address in a separate HTTP header, which will allow
+    RGW to log it correctly.
+  fmt_desc: The remote address parameter. For example, the HTTP field
+    containing the remote address, or the ``X-Forwarded-For``
+    address if a reverse proxy is operational.
+  default: REMOTE_ADDR
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_ops_log
+  with_legacy: true
+- name: rgw_op_thread_timeout
+  type: int
+  level: dev
+  desc: Timeout for async rados coroutine operations.
+  fmt_desc: The timeout in seconds for open threads.
+  default: 10_min
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_op_thread_suicide_timeout
+  type: int
+  level: dev
+  default: 0
+  fmt_desc: The time ``timeout`` in seconds before a Ceph Object Gateway
+    process dies. Disabled if set to ``0``.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_thread_pool_size
+  type: int
+  level: basic
+  desc: RGW requests handling thread pool size.
+  long_desc: This parameter determines the number of concurrent requests RGW can process
+    when using either the civetweb, or the fastcgi frontends. The higher this number
+    is, RGW will be able to deal with more concurrent requests at the cost of more
+    resource utilization.
+  fmt_desc: The size of the thread pool.
+  default: 512
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_num_control_oids
+  type: int
+  level: advanced
+  desc: Number of control objects used for cross-RGW communication.
+  long_desc: RGW uses certain control objects to send messages between different RGW
+    processes running on the same zone. These messages include metadata cache invalidation
+    info that is being sent when metadata is modified (such as user or bucket information).
+    A higher number of control objects allows better concurrency of these messages,
+    at the cost of more resource utilization.
+  fmt_desc: The number of notification objects used for cache synchronization
+    between different ``rgw`` instances.
+  default: 8
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_verify_ssl
+  type: bool
+  level: advanced
+  desc: Should RGW verify SSL when connecting to a remote HTTP server
+  long_desc: RGW can send requests to other RGW servers (e.g., in multi-site sync
+    work). This configurable selects whether RGW should verify the certificate for
+    the remote peer and host.
+  fmt_desc: Verify SSL certificates while making requests.
+  default: true
+  services:
+  - rgw
+  see_also:
+  - rgw_keystone_verify_ssl
+  with_legacy: true
+# The following are tunables for caches of RGW NFS (and other file
+# client) objects.
+#
+# The file handle cache is a partitioned hash table
+# (fhcache_partitions), each with a closed hash part and backing
+# b-tree mapping.  The number of partitions is expected to be a small
+# prime, the cache size something larger but less than 5K, the total
+# size of the cache is n_part *  cache_size.
+- name: rgw_nfs_lru_lanes
+  type: int
+  level: advanced
+  default: 5
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_nfs_lru_lane_hiwat
+  type: int
+  level: advanced
+  default: 911
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_nfs_fhcache_partitions
+  type: int
+  level: advanced
+  default: 3
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_nfs_fhcache_size
+  type: int
+  level: advanced
+  default: 2017
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_nfs_namespace_expire_secs
+  type: int
+  level: advanced
+  default: 5_min
+  services:
+  - rgw
+  min: 1
+  with_legacy: true
+- name: rgw_nfs_max_gc
+  type: int
+  level: advanced
+  default: 5_min
+  services:
+  - rgw
+  min: 1
+  with_legacy: true
+- name: rgw_nfs_write_completion_interval_s
+  type: int
+  level: advanced
+  default: 10
+  services:
+  - rgw
+  with_legacy: true
+# use fast S3 attrs from bucket index--currently assumes NFS mounts are immutable
+- name: rgw_nfs_s3_fast_attrs
+  type: bool
+  level: advanced
+  desc: use fast S3 attrs from bucket index (immutable only)
+  long_desc: use fast S3 attrs from bucket index (assumes NFS mounts are immutable)
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+# overrides for librgw/nfs
+- name: rgw_nfs_run_gc_threads
+  type: bool
+  level: advanced
+  desc: run GC threads in librgw (default off)
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_nfs_run_lc_threads
+  type: bool
+  level: advanced
+  desc: run lifecycle threads in librgw (default off)
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_nfs_run_quota_threads
+  type: bool
+  level: advanced
+  desc: run quota threads in librgw (default off)
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_nfs_run_sync_thread
+  type: bool
+  level: advanced
+  desc: run sync thread in librgw (default off)
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_nfs_frontends
+  type: str
+  level: basic
+  desc: RGW frontends configuration when running as librgw/nfs
+  long_desc: A comma-delimited list of frontends configuration. Each configuration
+    contains the type of the frontend followed by an optional space delimited set
+    of key=value config parameters.
+  fmt_desc: Configures the HTTP frontend(s). The configuration for multiple
+    frontends can be provided in a comma-delimited list. Each frontend
+    configuration may include a list of options separated by spaces,
+    where each option is in the form "key=value" or "key". See
+    `HTTP Frontends`_ for more on supported options.
+  default: rgw-nfs
+  services:
+  - rgw
+  with_legacy: true
+  see_also:
+  - rgw_frontends
+- name: rgw_rados_pool_autoscale_bias
+  type: float
+  level: advanced
+  desc: pg_autoscale_bias value for RGW metadata (omap-heavy) pools
+  default: 4
+  services:
+  - rgw
+  min: 0.01
+  max: 100000
+- name: rgw_rados_pool_recovery_priority
+  type: uint
+  level: advanced
+  desc: recovery_priority value for RGW metadata (omap-heavy) pools
+  default: 5
+  services:
+  - rgw
+  min: -10
+  max: 10
+- name: rgw_zone
+  type: str
+  level: advanced
+  desc: Zone name
+  fmt_desc: The name of the zone for the gateway instance. If no zone is
+    set, a cluster-wide default can be configured with the command
+    ``radosgw-admin zone default``.
+  services:
+  - rgw
+  see_also:
+  - rgw_zonegroup
+  - rgw_realm
+  with_legacy: true
+- name: rgw_zone_id
+  type: str
+  level: advanced
+  desc: Zone ID
+  services:
+  - rgw
+  see_also:
+  - rgw_zone
+  - rgw_zonegroup
+  - rgw_realm
+- name: rgw_zone_root_pool
+  type: str
+  level: advanced
+  desc: Zone root pool name
+  long_desc: The zone root pool is the pool where the RGW zone configuration is located.
+  default: .rgw.root
+  services:
+  - rgw
+  see_also:
+  - rgw_zonegroup_root_pool
+  - rgw_realm_root_pool
+  - rgw_period_root_pool
+  with_legacy: true
+- name: rgw_default_zone_info_oid
+  type: str
+  level: advanced
+  desc: Default zone info object id
+  long_desc: Name of the RADOS object that holds the default zone information.
+  default: default.zone
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_region
+  type: str
+  level: advanced
+  desc: Region name
+  long_desc: Obsolete config option. The rgw_zonegroup option should be used instead.
+  services:
+  - rgw
+  see_also:
+  - rgw_zonegroup
+  with_legacy: true
+- name: rgw_region_root_pool
+  type: str
+  level: advanced
+  desc: Region root pool
+  long_desc: Obsolete config option. The rgw_zonegroup_root_pool should be used instead.
+  default: .rgw.root
+  services:
+  - rgw
+  see_also:
+  - rgw_zonegroup_root_pool
+  with_legacy: true
+- name: rgw_default_region_info_oid
+  type: str
+  level: advanced
+  desc: Default region info object id
+  long_desc: Obsolete config option. The rgw_default_zonegroup_info_oid should be
+    used instead.
+  default: default.region
+  services:
+  - rgw
+  see_also:
+  - rgw_default_zonegroup_info_oid
+  with_legacy: true
+- name: rgw_zonegroup
+  type: str
+  level: advanced
+  desc: Zonegroup name
+  fmt_desc: The name of the zonegroup for the gateway instance. If no
+    zonegroup is set, a cluster-wide default can be configured with
+    the command ``radosgw-admin zonegroup default``.
+  services:
+  - rgw
+  see_also:
+  - rgw_zone
+  - rgw_realm
+  with_legacy: true
+- name: rgw_zonegroup_id
+  type: str
+  level: advanced
+  desc: Zonegroup ID
+  services:
+  - rgw
+  see_also:
+  - rgw_zone
+  - rgw_zonegroup
+  - rgw_realm
+- name: rgw_zonegroup_root_pool
+  type: str
+  level: advanced
+  desc: Zonegroup root pool
+  long_desc: The zonegroup root pool is the pool where the RGW zonegroup configuration
+    is located.
+  default: .rgw.root
+  services:
+  - rgw
+  see_also:
+  - rgw_zone_root_pool
+  - rgw_realm_root_pool
+  - rgw_period_root_pool
+  with_legacy: true
+- name: rgw_default_zonegroup_info_oid
+  type: str
+  level: advanced
+  default: default.zonegroup
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_realm
+  type: str
+  level: advanced
+  fmt_desc: The name of the realm for the gateway instance. If no realm is
+    set, a cluster-wide default can be configured with the command
+    ``radosgw-admin realm default``.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_realm_id
+  type: str
+  level: advanced
+  services:
+  - rgw
+- name: rgw_realm_root_pool
+  type: str
+  level: advanced
+  desc: Realm root pool
+  long_desc: The realm root pool is the pool where the RGW realm configuration is located.
+  default: .rgw.root
+  services:
+  - rgw
+  see_also:
+  - rgw_zonegroup_root_pool
+  - rgw_zone_root_pool
+  - rgw_period_root_pool
+  with_legacy: true
+- name: rgw_default_realm_info_oid
+  type: str
+  level: advanced
+  default: default.realm
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_period_root_pool
+  type: str
+  level: advanced
+  desc: Period root pool
+  long_desc: The period root pool is the pool where the RGW period configuration
+    is located.
+  default: .rgw.root
+  services:
+  - rgw
+  see_also:
+  - rgw_zonegroup_root_pool
+  - rgw_zone_root_pool
+  - rgw_realm_root_pool
+  with_legacy: true
+- name: rgw_period_latest_epoch_info_oid
+  type: str
+  level: dev
+  default: .latest_epoch
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_log_nonexistent_bucket
+  type: bool
+  level: advanced
+  desc: Should RGW log operations on bucket that does not exist
+  long_desc: This config option applies to the ops log. When this option is set, the
+    ops log will log operations that are sent to non existing buckets. These operations
+    inherently fail, and do not correspond to a specific user.
+  fmt_desc: Enables Ceph Object Gateway to log a request for a non-existent
+    bucket.
+  default: false
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_ops_log
+  with_legacy: true
+# man date to see codes (a subset are supported)
+- name: rgw_log_object_name
+  type: str
+  level: advanced
+  desc: Ops log object name format
+  long_desc: Defines the format of the RADOS objects names that ops log uses to store
+    ops log data
+  fmt_desc: The logging format for an object name. See manpage
+    :manpage:`date` for details about format specifiers.
+  default: '%Y-%m-%d-%H-%i-%n'
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_ops_log
+  with_legacy: true
+- name: rgw_log_object_name_utc
+  type: bool
+  level: advanced
+  desc: Should ops log object name be based on UTC
+  long_desc: If set, the names of the RADOS objects that hold the ops log data will
+    be based on UTC time zone. If not set, it will use the local time zone.
+  fmt_desc: Whether a logged object name includes a UTC time.
+    If ``false``, it uses the local time.
+  default: false
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_ops_log
+  - rgw_log_object_name
+  with_legacy: true
+- name: rgw_usage_max_shards
+  type: int
+  level: advanced
+  desc: Number of shards for usage log.
+  long_desc: The number of RADOS objects that RGW will use in order to store the usage
+    log data.
+  fmt_desc: The maximum number of shards for usage logging.
+  default: 32
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_usage_log
+  with_legacy: true
+- name: rgw_usage_max_user_shards
+  type: int
+  level: advanced
+  desc: Number of shards for single user in usage log
+  long_desc: The number of shards that a single user will span over in the usage log.
+  fmt_desc: The maximum number of shards used for a single user's
+    usage logging.
+  default: 1
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_usage_log
+  min: 1
+  with_legacy: true
+# enable logging every rgw operation
+- name: rgw_enable_ops_log
+  type: bool
+  level: advanced
+  desc: Enable ops log
+  fmt_desc: Enable logging for each successful Ceph Object Gateway operation.
+  default: false
+  services:
+  - rgw
+  see_also:
+  - rgw_log_nonexistent_bucket
+  - rgw_log_object_name
+  - rgw_ops_log_rados
+  - rgw_ops_log_socket_path
+  - rgw_ops_log_file_path
+  with_legacy: true
+# enable logging bandwidth usage
+- name: rgw_enable_usage_log
+  type: bool
+  level: advanced
+  desc: Enable the usage log
+  default: false
+  services:
+  - rgw
+  see_also:
+  - rgw_usage_max_shards
+  with_legacy: true
+# whether ops log should go to rados
+- name: rgw_ops_log_rados
+  type: bool
+  level: advanced
+  desc: Use RADOS for ops log
+  long_desc: If set, RGW will store ops log information in RADOS. WARNING,
+    there is no automation to clean up these log entries, so by default they
+    will pile up without bound. This MUST NOT be enabled unless the admin has
+    a strategy to manage and trim these log entries with `radosgw-admin log rm`.
+  fmt_desc: Whether the operations log should be written to the
+    Ceph Storage Cluster backend.
+  default: false
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_ops_log
+  - rgw_log_object_name_utc
+  - rgw_log_object_name
+  with_legacy: true
+# path to unix domain socket where ops log can go
+- name: rgw_ops_log_socket_path
+  type: str
+  level: advanced
+  desc: Unix domain socket path for ops log.
+  long_desc: Path to unix domain socket that RGW will listen for connection on. When
+    connected, RGW will send ops log data through it.
+  fmt_desc: The Unix domain socket for writing operations logs.
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_ops_log
+  - rgw_ops_log_data_backlog
+  with_legacy: true
+# path to file where ops log can go
+- name: rgw_ops_log_file_path
+  type: str
+  level: advanced
+  desc: File-system path for ops log.
+  long_desc: Path to file that RGW will log ops logs to. A cephadm deployment will automatically
+    rotate these logs under /var/log/ceph/. Other deployments should arrange for similar log rotation.
+  fmt_desc: The file-system path for writing operations logs.
+  daemon_default: /var/log/ceph/ops-log-$cluster-$name.log
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_ops_log
+  with_legacy: true
+# max data backlog for ops log
+- name: rgw_ops_log_data_backlog
+  type: size
+  level: advanced
+  desc: Ops log socket backlog
+  long_desc: Maximum amount of data backlog that RGW can keep when ops log is configured
+    to send info through unix domain socket. When data backlog is higher than this,
+    ops log entries will be lost. In order to avoid ops log information loss, the
+    listener needs to clear data (by reading it) quickly enough.
+  fmt_desc: The maximum data backlog size for operations logs written
+    to a Unix domain socket.
+  default: 5_M
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_ops_log
+  - rgw_ops_log_socket_path
+  with_legacy: true
+- name: rgw_usage_log_flush_threshold
+  type: int
+  level: advanced
+  desc: Number of entries in usage log before flushing
+  long_desc: This is the max number of entries that will be held in the usage log,
+    before it will be flushed to the backend. Note that the usage log is periodically
+    flushed, even if number of entries does not reach this threshold. A usage log
+    entry corresponds to one or more operations on a single bucket.
+  fmt_desc: The number of dirty merged entries in the usage log before
+    flushing synchronously.
+  default: 1024
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_usage_log
+  - rgw_usage_log_tick_interval
+  with_legacy: true
+- name: rgw_usage_log_tick_interval
+  type: int
+  level: advanced
+  desc: Number of seconds between usage log flush cycles
+  long_desc: The number of seconds between consecutive usage log flushes. The usage
+    log will also flush itself to the backend if the number of pending entries reaches
+    a certain threshold.
+  fmt_desc: Flush pending usage log data every ``n`` seconds.
+  default: 30
+  services:
+  - rgw
+  see_also:
+  - rgw_enable_usage_log
+  - rgw_usage_log_flush_threshold
+  with_legacy: true
+- name: rgw_init_timeout
+  type: int
+  level: basic
+  desc: Initialization timeout
+  long_desc: The time length (in seconds) that RGW will allow for its initialization.
+    RGW process will give up and quit if initialization is not complete after this
+    amount of time.
+  fmt_desc: The number of seconds before Ceph Object Gateway gives up on
+    initialization.
+  default: 5_min
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_mime_types_file
+  type: str
+  level: basic
+  desc: Path to local mime types file
+  long_desc: The mime types file is needed in Swift when uploading an object. If object's
+    content type is not specified, RGW will use data from this file to assign a content
+    type to the object.
+  fmt_desc: The path and location of the MIME-types file. Used for Swift
+    auto-detection of object types.
+  default: /etc/mime.types
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_gc_max_objs
+  type: int
+  level: advanced
+  desc: Number of shards for garbage collector data
+  long_desc: The number of garbage collector data shards is the number of RADOS objects
+    that RGW will use to store the garbage collection information.
+  fmt_desc: The maximum number of objects that may be handled by
+    garbage collection in one garbage collection processing cycle.
+    Please do not change this value after the first deployment.
+  default: 32
+  services:
+  - rgw
+  see_also:
+  - rgw_gc_obj_min_wait
+  - rgw_gc_processor_max_time
+  - rgw_gc_processor_period
+  - rgw_gc_max_concurrent_io
+  with_legacy: true
+# wait time before object may be handled by gc, recommended lower limit is 30 mins
+- name: rgw_gc_obj_min_wait
+  type: int
+  level: advanced
+  desc: Garbage collection object expiration time
+  long_desc: The length of time (in seconds) that the RGW collector will wait before
+    purging a deleted object's data. RGW will not remove object immediately, as object
+    could still have readers. A mechanism exists to increase the object's expiration
+    time when it's being read. The recommended value of its lower limit is 30 minutes
+  fmt_desc: The minimum wait time before a deleted object may be removed
+    and handled by garbage collection processing.
+  default: 2_hr
+  services:
+  - rgw
+  see_also:
+  - rgw_gc_max_objs
+  - rgw_gc_processor_max_time
+  - rgw_gc_processor_period
+  - rgw_gc_max_concurrent_io
+  with_legacy: true
+- name: rgw_gc_processor_max_time
+  type: int
+  level: advanced
+  desc: Length of time GC processor can lease shard
+  long_desc: Garbage collection thread in RGW process holds a lease on its data shards.
+    These objects contain the information about the objects that need to be removed.
+    RGW takes a lease in order to prevent multiple RGW processes from handling the
+    same objects concurrently. This time signifies the maximum amount of time (in
+    seconds) that RGW is allowed to hold that lease. In the case where RGW goes down
+    uncleanly, this is the amount of time where processing of that data shard will
+    be blocked.
+  fmt_desc: The maximum time between the beginning of two consecutive garbage
+    collection processing cycles.
+  default: 1_hr
+  services:
+  - rgw
+  see_also:
+  - rgw_gc_max_objs
+  - rgw_gc_obj_min_wait
+  - rgw_gc_processor_period
+  - rgw_gc_max_concurrent_io
+  with_legacy: true
+- name: rgw_gc_processor_period
+  type: int
+  level: advanced
+  desc: Garbage collector cycle run time
+  long_desc: The amount of time between the start of consecutive runs of the garbage
+    collector threads. If a garbage collector run takes more than this period, it will
+    not wait before running again.
+  fmt_desc: The cycle time for garbage collection processing.
+  default: 1_hr
+  services:
+  - rgw
+  see_also:
+  - rgw_gc_max_objs
+  - rgw_gc_obj_min_wait
+  - rgw_gc_processor_max_time
+  - rgw_gc_max_concurrent_io
+  - rgw_gc_max_trim_chunk
+  with_legacy: true
+- name: rgw_gc_max_concurrent_io
+  type: int
+  level: advanced
+  desc: Max concurrent RADOS IO operations for garbage collection
+  long_desc: The maximum number of concurrent IO operations that the RGW garbage collection
+    thread will use when purging old data.
+  default: 10
+  services:
+  - rgw
+  see_also:
+  - rgw_gc_max_objs
+  - rgw_gc_obj_min_wait
+  - rgw_gc_processor_max_time
+  - rgw_gc_max_trim_chunk
+  with_legacy: true
+- name: rgw_gc_max_trim_chunk
+  type: int
+  level: advanced
+  desc: Max number of keys to remove from garbage collector log in a single operation
+  default: 16
+  services:
+  - rgw
+  see_also:
+  - rgw_gc_max_objs
+  - rgw_gc_obj_min_wait
+  - rgw_gc_processor_max_time
+  - rgw_gc_max_concurrent_io
+  with_legacy: true
+- name: rgw_gc_max_deferred_entries_size
+  type: uint
+  level: advanced
+  desc: maximum allowed size of deferred entries in queue head for gc
+  default: 3_K
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_gc_max_queue_size
+  type: uint
+  level: advanced
+  desc: Maximum allowed queue size for gc
+  long_desc: The maximum allowed size of each gc queue, and its value should not be
+    greater than (osd_max_object_size - rgw_gc_max_deferred_entries_size - 1K).
+  default: 131068_K
+  services:
+  - rgw
+  see_also:
+  - osd_max_object_size
+  - rgw_gc_max_deferred_entries_size
+  with_legacy: true
+- name: rgw_gc_max_deferred
+  type: uint
+  level: advanced
+  desc: Number of maximum deferred data entries to be stored in queue for gc
+  default: 50
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_s3_success_create_obj_status
+  type: int
+  level: advanced
+  desc: HTTP return code override for object creation
+  long_desc: If not zero, this is the HTTP return code that will be returned on a
+    successful S3 object creation.
+  fmt_desc: The alternate success status response for ``create-obj``.
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_s3_client_max_sig_ver
+  type: int
+  level: advanced
+  desc: Max S3 authentication signature version
+  long_desc: If greater than zero, would force max signature version to use
+  default: -1
+  services:
+  - rgw
+- name: rgw_resolve_cname
+  type: bool
+  level: advanced
+  desc: Support vanity domain names via CNAME
+  long_desc: If true, RGW will query DNS when detecting that it's serving a request
+    that was sent to a host in another domain. If a CNAME record is configured for
+    that domain it will use it instead. This gives users the ability to create
+    a unique domain of their own to point at data in their bucket.
+  fmt_desc: Whether ``rgw`` should use DNS CNAME record of the request
+    hostname field (if hostname is not equal to ``rgw dns name``).
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_obj_stripe_size
+  type: size
+  level: advanced
+  desc: RGW object stripe size
+  long_desc: The size of an object stripe for RGW objects. This is the maximum size
+    a backing RADOS object will have. RGW objects that are larger than this will span
+    over multiple objects.
+  fmt_desc: The size of an object stripe for Ceph Object Gateway objects.
+    See `Architecture`_ for details on striping.
+  default: 4_M
+  services:
+  - rgw
+  with_legacy: true
+# list of extended attrs that can be set on objects (beyond the default)
+- name: rgw_extended_http_attrs
+  type: str
+  level: advanced
+  desc: RGW support extended HTTP attrs
+  long_desc: Add new set of attributes that could be set on an object. These extra
+    attributes can be set through HTTP header fields when putting the objects. If
+    set, these attributes will return as HTTP fields when doing GET/HEAD on the object.
+  fmt_desc: Add new set of attributes that could be set on an entity
+    (user, bucket or object). These extra attributes can be set
+    through HTTP header fields when putting the entity or modifying
+    it using POST method. If set, these attributes will return as
+    HTTP  fields when doing GET/HEAD on the entity.
+  services:
+  - rgw
+  example: content_foo, content_bar, x-foo-bar
+  with_legacy: true
+- name: rgw_exit_timeout_secs
+  type: int
+  level: advanced
+  desc: RGW shutdown timeout
+  long_desc: Number of seconds to wait for a process before exiting unconditionally.
+  default: 2_min
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_get_obj_window_size
+  type: size
+  level: advanced
+  desc: RGW object read window size
+  long_desc: The window size in bytes for a single object read request
+  default: 16_M
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_get_obj_max_req_size
+  type: size
+  level: advanced
+  desc: RGW object read chunk size
+  long_desc: The maximum request size of a single object read operation sent to RADOS
+  fmt_desc: The maximum request size of a single get operation sent to the
+    Ceph Storage Cluster.
+  default: 4_M
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_relaxed_s3_bucket_names
+  type: bool
+  level: advanced
+  desc: RGW enable relaxed S3 bucket names
+  long_desc: RGW enable relaxed S3 bucket name rules for US region buckets.
+  fmt_desc: Enables relaxed S3 bucket names rules for US region buckets.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_defer_to_bucket_acls
+  type: str
+  level: advanced
+  desc: Bucket ACLs override object ACLs
+  long_desc: If not empty, a string that selects the mode of operation. 'recurse'
+    will use bucket's ACL for the authorization. 'full-control' will allow users that
+    have full control permission on the bucket to have access to the object.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_list_buckets_max_chunk
+  type: int
+  level: advanced
+  desc: Max number of buckets to retrieve in a single listing operation
+  long_desc: When RGW fetches lists of user's buckets from the backend, this is the
+    max number of entries it will try to retrieve in a single operation. Note that
+    the backend may choose to return a smaller number of entries.
+  fmt_desc: The maximum number of buckets to retrieve in a single operation
+    when listing user buckets.
+  default: 1000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_md_log_max_shards
+  type: int
+  level: advanced
+  desc: RGW number of metadata log shards
+  long_desc: The number of shards the RGW metadata log entries will reside in. This
+    affects the metadata sync parallelism as a shard can only be processed by a single
+    RGW at a time
+  fmt_desc: The maximum number of shards for the metadata log.
+  default: 64
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_curl_buffersize
+  type: int
+  level: dev
+  long_desc: 'Pass a long specifying your preferred size (in bytes) for the receive buffer
+    in libcurl. See: https://curl.se/libcurl/c/CURLOPT_BUFFERSIZE.html'
+  default: 524288
+  services:
+  - rgw
+  min: 1024
+  max: 524288
+  with_legacy: true
+- name: rgw_curl_wait_timeout_ms
+  type: int
+  level: dev
+  default: 1000
+  fmt_desc: The timeout in milliseconds for certain ``curl`` calls.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_curl_low_speed_limit
+  type: int
+  level: advanced
+  long_desc: It contains the average transfer speed in bytes per second that the transfer
+    should be below during rgw_curl_low_speed_time seconds for libcurl to consider
+    it to be too slow and abort. Set it to zero to disable this.
+  default: 1024
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_curl_low_speed_time
+  type: int
+  level: advanced
+  long_desc: It contains the time in seconds that the transfer speed should
+    be below the rgw_curl_low_speed_limit for the library to consider it too slow
+    and abort. Set it to zero to disable this.
+  default: 5_min
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_curl_tcp_keepalive
+  type: int
+  level: advanced
+  long_desc: Enable TCP keepalive on the HTTP client sockets managed by libcurl. This does not apply to connections received by the HTTP frontend, but only to HTTP requests sent by radosgw. Examples include requests to Keystone for authentication, sync requests from multisite, and requests to key management servers for SSE.
+  enum_values:
+  - 0
+  - 1
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_copy_obj_progress
+  type: bool
+  level: advanced
+  desc: Send progress report through copy operation
+  long_desc: If true, RGW will send progress information when copy operation is executed.
+  fmt_desc: Enables output of object progress during long copy operations.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_copy_obj_progress_every_bytes
+  type: size
+  level: advanced
+  desc: Send copy-object progress info after these many bytes
+  fmt_desc: The minimum bytes between copy progress output.
+  default: 1_M
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_max_copy_obj_concurrent_io
+  type: int
+  level: advanced
+  desc: Number of refcount operations to process concurrently when executing copy_obj
+  default: 10
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_obj_etag_verify
+  type: bool
+  level: advanced
+  desc: Verify if the object copied from remote is identical to its source
+  long_desc: If true, this option computes the MD5 checksum of the data which is written
+    at the destination and checks if it is identical to the ETAG stored in the source.
+    It ensures integrity of the objects fetched from a remote server over HTTP including
+    multisite sync.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_obj_tombstone_cache_size
+  type: int
+  level: advanced
+  desc: Max number of entries to keep in tombstone cache
+  long_desc: The tombstone cache is used when doing a multi-zone data sync. RGW keeps
+    there information about removed objects which is needed in order to prevent re-syncing
+    of objects that were already removed.
+  default: 1000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_data_log_window
+  type: int
+  level: advanced
+  desc: Data log time window
+  long_desc: The data log keeps information about buckets that have objects that
+    were modified within a specific timeframe. The sync process then knows which buckets
+    are needed to be scanned for data sync.
+  fmt_desc: The data log entries window in seconds.
+  default: 30
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_data_log_changes_size
+  type: int
+  level: dev
+  desc: Max size of pending changes in data log
+  long_desc: RGW will trigger update to the data log if the number of pending entries
+    reaches this number.
+  fmt_desc: The number of in-memory entries to hold for the data changes log.
+  default: 1000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_data_log_num_shards
+  type: int
+  level: advanced
+  desc: Number of data log shards
+  long_desc: The number of shards the RGW data log entries will reside in. This affects
+    the data sync parallelism as a shard can only be processed by a single RGW at
+    a time.
+  fmt_desc: The number of shards (objects) on which to keep the
+    data changes log.
+  default: 128
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_data_log_obj_prefix
+  type: str
+  level: dev
+  default: data_log
+  fmt_desc: The object name prefix for the data log.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_data_sync_poll_interval
+  type: int
+  level: dev
+  default: 20
+  fmt_desc: Once multisite's incremental sync of a datalog shard is caught up
+    with its source, it will wait this long (in seconds) before polling for
+    more changes.
+  services:
+  - rgw
+  see_also:
+  - rgw_meta_sync_poll_interval
+  with_legacy: true
+- name: rgw_meta_sync_poll_interval
+  type: int
+  level: dev
+  default: 20
+  fmt_desc: Once multisite's incremental sync of a mdlog shard is caught up
+    with its source, it will wait this long (in seconds) before polling for
+    more changes.
+  services:
+  - rgw
+  see_also:
+  - rgw_data_sync_poll_interval
+  with_legacy: true
+- name: rgw_bucket_sync_spawn_window
+  type: int
+  level: dev
+  default: 20
+  fmt_desc: The maximum number of items that bucket sync is willing to
+    process in parallel (per remote bilog shard).
+  services:
+  - rgw
+  see_also:
+  - rgw_data_sync_spawn_window
+  - rgw_meta_sync_spawn_window
+  with_legacy: true
+- name: rgw_data_sync_spawn_window
+  type: int
+  level: dev
+  default: 20
+  fmt_desc: The maximum number of items that data sync is willing to
+    process in parallel (per remote datalog shard).
+  services:
+  - rgw
+  see_also:
+  - rgw_bucket_sync_spawn_window
+  - rgw_meta_sync_spawn_window
+  with_legacy: true
+- name: rgw_meta_sync_spawn_window
+  type: int
+  level: dev
+  default: 20
+  fmt_desc: The maximum number of items that metadata sync is willing to
+    process in parallel (per remote mdlog shard).
+  services:
+  - rgw
+  see_also:
+  - rgw_bucket_sync_spawn_window
+  - rgw_data_sync_spawn_window
+  with_legacy: true
+- name: rgw_bucket_quota_ttl
+  type: int
+  level: advanced
+  desc: Bucket quota stats cache TTL
+  long_desc: Length of time for bucket stats to be cached within RGW instance.
+  fmt_desc: The amount of time in seconds cached quota information is
+    trusted.  After this timeout, the quota information will be
+    re-fetched from the cluster.
+  default: 10_min
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_bucket_quota_cache_size
+  type: int
+  level: advanced
+  desc: RGW quota stats cache size
+  long_desc: Maximum number of entries in the quota stats cache.
+  default: 10000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_bucket_default_quota_max_objects
+  type: int
+  level: basic
+  desc: Default quota for max objects in a bucket
+  long_desc: The default quota configuration for max number of objects in a bucket.
+    A negative number means 'unlimited'.
+  fmt_desc: Default max number of objects per bucket. Set on new users,
+    if no other quota is specified. Has no effect on existing users.
+    This variable should be set in the client or global sections
+    so that it is automatically applied to radosgw-admin commands.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_bucket_default_quota_max_size
+  type: int
+  level: advanced
+  desc: Default quota for total size in a bucket
+  long_desc: The default quota configuration for total size of objects in a bucket.
+    A negative number means 'unlimited'.
+  fmt_desc: Default max capacity per bucket, in bytes. Set on new users,
+    if no other quota is specified. Has no effect on existing users.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_expose_bucket
+  type: bool
+  level: advanced
+  desc: Send Bucket HTTP header with the response
+  long_desc: If true, RGW will send a Bucket HTTP header with the responses. The header
+    will contain the name of the bucket the operation happened on.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_frontends
+  type: str
+  level: basic
+  desc: RGW frontends configuration
+  long_desc: A comma delimited list of frontends configuration. Each configuration
+    contains the type of the frontend followed by an optional space delimited set
+    of key=value config parameters.
+  fmt_desc: Configures the HTTP frontend(s). The configuration for multiple
+    frontends can be provided in a comma-delimited list. Each frontend
+    configuration may include a list of options separated by spaces,
+    where each option is in the form "key=value" or "key". See
+    `HTTP Frontends`_ for more on supported options.
+  default: beast port=7480
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_frontend_defaults
+  type: str
+  level: advanced
+  desc: RGW frontends default configuration
+  long_desc: A comma delimited list of default frontends configuration.
+  default: beast ssl_certificate=config://rgw/cert/$realm/$zone.crt ssl_private_key=config://rgw/cert/$realm/$zone.key
+  services:
+  - rgw
+- name: rgw_beast_enable_async
+  type: bool
+  level: dev
+  desc: Enable async request processing under beast using coroutines
+  long_desc: When enabled, the beast frontend will process requests using
+    coroutines, allowing the concurrent processing of several requests on the
+    same thread. When disabled, the number of concurrent requests will be
+    limited by the thread count, but debugging and tracing the synchronous
+    calls can be easier.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_user_quota_bucket_sync_interval
+  type: int
+  level: advanced
+  desc: User quota bucket sync interval
+  long_desc: Time period for accumulating modified buckets before syncing these stats.
+  fmt_desc: The amount of time in seconds bucket quota information is
+    accumulated before syncing to the cluster.  During this time,
+    other RGW instances will not see the changes in bucket quota
+    stats from operations on this instance.
+  default: 3_min
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_user_quota_sync_interval
+  type: int
+  level: advanced
+  desc: User quota sync interval
+  long_desc: Time period for accumulating modified buckets before syncing entire user
+    stats.
+  fmt_desc: The amount of time in seconds user quota information is
+    accumulated before syncing to the cluster.  During this time,
+    other RGW instances will not see the changes in user quota stats
+    from operations on this instance.
+  default: 1_day
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_user_quota_sync_idle_users
+  type: bool
+  level: advanced
+  desc: Should sync idle users quota
+  long_desc: Whether stats for idle users should be fully synced.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_user_quota_sync_wait_time
+  type: int
+  level: advanced
+  desc: User quota full-sync wait time
+  long_desc: Minimum time between two full stats syncs for non-idle users.
+  default: 1_day
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_user_default_quota_max_objects
+  type: int
+  level: basic
+  desc: User quota max objects
+  long_desc: The default quota configuration for total number of objects for a single
+    user. A negative number means 'unlimited'.
+  fmt_desc: Default max number of objects for a user. This includes all
+    objects in all buckets owned by the user. Set on new users,
+    if no other quota is specified. Has no effect on existing users.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_user_default_quota_max_size
+  type: int
+  level: basic
+  desc: User quota max size
+  long_desc: The default quota configuration for total size of objects for a single
+    user. A negative number means 'unlimited'.
+  fmt_desc: The value for user max size quota in bytes set on new users,
+    if no other quota is specified.  Has no effect on existing users.
+  default: -1
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_multipart_min_part_size
+  type: size
+  level: advanced
+  desc: Minimum S3 multipart-upload part size
+  long_desc: When doing a multipart upload, each part (other than the last part) must
+    be at least this size.
+  default: 5_M
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_multipart_part_upload_limit
+  type: int
+  level: advanced
+  desc: Max number of parts in multipart upload
+  default: 10000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_max_slo_entries
+  type: int
+  level: advanced
+  desc: Max number of entries in Swift Static Large Object manifest
+  default: 1000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_olh_pending_timeout_sec
+  type: int
+  level: dev
+  desc: Max time for pending OLH change to complete
+  long_desc: OLH is a versioned object's logical head. Operations on it are journaled
+    and marked as pending before completion. If an operation doesn't complete within this
+    number of seconds, we remove the operation from the journal.
+  default: 1_hr
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_user_max_buckets
+  type: int
+  level: basic
+  desc: Max number of buckets per user
+  long_desc: A user can create at most this number of buckets. Zero means no limit;
+    a negative value means users cannot create any new buckets, although users will
+    retain buckets already created.
+  default: 1000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_objexp_gc_interval
+  type: uint
+  level: advanced
+  desc: Swift objects expirer garbage collector interval
+  default: 600
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_objexp_hints_num_shards
+  type: uint
+  level: advanced
+  desc: Number of object expirer data shards
+  long_desc: The number of shards the (Swift) object expirer will store its data on.
+  default: 127
+  services:
+  - rgw
+  with_legacy: true
+# maximum number of entries in a single operation when processing objexp data
+- name: rgw_objexp_chunk_size
+  type: uint
+  level: dev
+  default: 100
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_enable_static_website
+  type: bool
+  level: basic
+  desc: Enable static website APIs
+  long_desc: This configurable controls whether RGW handles the website control APIs.
+    RGW can serve static websites if s3website hostnames are configured, and that is
+    unrelated to this configurable.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_user_unique_email
+  type: bool
+  level: basic
+  desc: Require local RGW users to have unique email addresses
+  long_desc: Enforce builtin user accounts to have unique email addresses.  This setting
+    is historical.  In future, non-enforcement of email address uniqueness is likely
+    to become the default.
+  default: true
+  services:
+  - rgw
+- name: rgw_log_http_headers
+  type: str
+  level: basic
+  desc: List of HTTP headers to log
+  long_desc: A comma delimited list of HTTP headers to log when seen, ignores case
+    (e.g., http_x_forwarded_for).
+  fmt_desc: Comma-delimited list of HTTP headers to include with ops
+    log entries.  Header names are case insensitive, and use
+    the full header name with words separated by underscores.
+  example: http_x_forwarded_for, http_x_special_k
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_num_async_rados_threads
+  type: int
+  level: advanced
+  desc: Number of concurrent RADOS operations in multisite sync
+  long_desc: The number of concurrent RADOS IO operations that will be triggered for
+    handling multisite sync operations. This includes control related work, and not
+    the actual sync operations.
+  default: 32
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_md_notify_interval_msec
+  type: int
+  level: advanced
+  desc: Length of time to aggregate metadata changes
+  long_desc: Length of time (in milliseconds) in which the master zone aggregates
+    all the metadata changes that occurred, before sending notifications to all the
+    other zones.
+  default: 200
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_run_sync_thread
+  type: bool
+  level: advanced
+  desc: Should run sync thread
+  fmt_desc: If there are other zones in the realm to sync from, spawn threads
+    to handle the sync of data and metadata.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_lease_period
+  type: int
+  level: dev
+  default: 2_min
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_log_trim_interval
+  type: int
+  level: advanced
+  desc: Sync log trim interval
+  long_desc: Time in seconds between attempts to trim sync logs.
+  default: 20_min
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_log_trim_max_buckets
+  type: int
+  level: advanced
+  desc: Maximum number of buckets to trim per interval
+  long_desc: The maximum number of buckets to consider for bucket index log trimming
+    each trim interval, regardless of the number of bucket index shards. Priority
+    is given to buckets with the most sync activity over the last trim interval.
+  default: 16
+  services:
+  - rgw
+  see_also:
+  - rgw_sync_log_trim_interval
+  - rgw_sync_log_trim_min_cold_buckets
+  - rgw_sync_log_trim_concurrent_buckets
+- name: rgw_sync_log_trim_min_cold_buckets
+  type: int
+  level: advanced
+  desc: Minimum number of cold buckets to trim per interval
+  long_desc: Of the `rgw_sync_log_trim_max_buckets` selected for bucket index log
+    trimming each trim interval, at least this many of them must be 'cold' buckets.
+    These buckets are selected in order from the list of all bucket instances, to
+    guarantee that all buckets will be visited eventually.
+  default: 4
+  services:
+  - rgw
+  see_also:
+  - rgw_sync_log_trim_interval
+  - rgw_sync_log_trim_max_buckets
+  - rgw_sync_log_trim_concurrent_buckets
+- name: rgw_sync_log_trim_concurrent_buckets
+  type: int
+  level: advanced
+  desc: Maximum number of buckets to trim in parallel
+  default: 4
+  services:
+  - rgw
+  see_also:
+  - rgw_sync_log_trim_interval
+  - rgw_sync_log_trim_max_buckets
+  - rgw_sync_log_trim_min_cold_buckets
+- name: rgw_sync_data_inject_err_probability
+  type: float
+  level: dev
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_meta_inject_err_probability
+  type: float
+  level: dev
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_data_full_inject_err_probability
+  type: float
+  level: dev
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_trace_history_size
+  type: size
+  level: advanced
+  desc: Sync trace history size
+  long_desc: Maximum number of complete sync trace entries to keep.
+  default: 4_K
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_trace_per_node_log_size
+  type: int
+  level: advanced
+  desc: Sync trace per-node log size
+  long_desc: The number of log entries to keep per sync-trace node.
+  default: 32
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sync_trace_servicemap_update_interval
+  type: int
+  level: advanced
+  desc: Sync-trace service-map update interval
+  long_desc: Number of seconds between service-map updates of sync-trace events.
+  default: 10
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_period_push_interval
+  type: float
+  level: advanced
+  desc: Period push interval
+  long_desc: Number of seconds to wait before retrying 'period push' operation.
+  default: 2
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_period_push_interval_max
+  type: float
+  level: advanced
+  desc: Period push maximum interval
+  long_desc: The max number of seconds to wait before retrying 'period push' after
+    exponential backoff.
+  default: 30
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_safe_max_objects_per_shard
+  type: int
+  level: advanced
+  desc: Safe number of objects per shard
+  long_desc: This is the max number of objects per bucket index shard that RGW considers
+    safe. RGW will warn if it identifies a bucket where its per-shard count is higher
+    than a percentage of this number.
+  default: 102400
+  services:
+  - rgw
+  see_also:
+  - rgw_shard_warning_threshold
+  with_legacy: true
+# pct of safe max at which to warn
+- name: rgw_shard_warning_threshold
+  type: float
+  level: advanced
+  desc: Warn about max objects per shard
+  long_desc: Warn if number of objects per shard in a specific bucket passed this
+    percentage of the safe number.
+  default: 90
+  services:
+  - rgw
+  see_also:
+  - rgw_safe_max_objects_per_shard
+  with_legacy: true
+- name: rgw_swift_versioning_enabled
+  type: bool
+  level: advanced
+  desc: Enable Swift versioning
+  fmt_desc: |
+    Enables the Object Versioning of OpenStack Object Storage API.
+    This allows clients to put the ``X-Versions-Location`` attribute
+    on containers that should be versioned. The attribute specifies
+    the name of container storing archived versions. It must be owned
+    by the same user as the versioned container due to access
+    control verification - ACLs are NOT taken into consideration.
+    Those containers cannot be versioned by the S3 object versioning
+    mechanism.
+
+    A slightly different attribute, ``X-History-Location``, which is also understood by
+    `OpenStack Swift <https://docs.openstack.org/swift/latest/api/object_versioning.html>`_
+    for handling ``DELETE`` operations, is currently not supported.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_swift_custom_header
+  type: str
+  level: advanced
+  desc: Enable swift custom header
+  long_desc: If not empty, specifies a name of HTTP header that can include custom
+    data. When uploading an object, if this header is passed RGW will store this header
+    info and it will be available when listing the bucket.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_swift_need_stats
+  type: bool
+  level: advanced
+  desc: Enable stats on bucket listing in Swift
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_reshard_num_logs
+  type: uint
+  level: advanced
+  default: 16
+  services:
+  - rgw
+  - rgw
+  min: 1
+- name: rgw_reshard_bucket_lock_duration
+  type: uint
+  level: advanced
+  desc: Number of seconds the timeout on the reshard locks (bucket reshard lock and
+    reshard log lock) are set to. As a reshard proceeds these locks can be renewed/extended.
+    If too short, reshards cannot complete and will fail, causing a future reshard
+    attempt. If too long a hung or crashed reshard attempt will keep the bucket locked
+    for an extended period, not allowing RGW to detect the failed reshard attempt
+    and recover.
+  default: 360
+  tags:
+  - performance
+  services:
+  - rgw
+  - rgw
+  min: 30
+- name: rgw_debug_inject_set_olh_err
+  type: uint
+  level: dev
+  desc: Whether to inject errors between rados olh modification initialization and
+    bucket index instance linking. The value determines the error code. This exists
+    for development and testing purposes to help simulate cases where bucket index
+    entries aren't cleaned up by the request thread after an error scenario.
+  default: 0
+  with_legacy: true
+  services:
+  - rgw
+- name: rgw_debug_inject_olh_cancel_modification_err
+  type: bool
+  level: dev
+  desc: Whether to inject an error to simulate a failure to cancel olh
+    modification. This exists for development and testing purposes.
+  default: false
+  with_legacy: true
+  services:
+  - rgw
+- name: rgw_reshard_batch_size
+  type: uint
+  level: advanced
+  desc: Number of reshard entries to batch together before sending the operations
+    to the CLS back-end
+  default: 64
+  tags:
+  - performance
+  services:
+  - rgw
+  - rgw
+  min: 8
+- name: rgw_reshard_max_aio
+  type: uint
+  level: advanced
+  desc: Maximum number of outstanding asynchronous I/O operations to allow at a time
+    during resharding
+  default: 128
+  tags:
+  - performance
+  services:
+  - rgw
+  - rgw
+  min: 16
+- name: rgw_trust_forwarded_https
+  type: bool
+  level: advanced
+  desc: Trust Forwarded and X-Forwarded-Proto headers
+  long_desc: When a proxy in front of radosgw is used for ssl termination, radosgw
+    does not know whether incoming http connections are secure. Enable this option
+    to trust the Forwarded and X-Forwarded-Proto headers sent by the proxy when determining
+    whether the connection is secure. This is required for some features, such as
+    server side encryption. (Never enable this setting if you do not have a trusted
+    proxy in front of radosgw, or else malicious users will be able to set these headers
+    in any request.)
+  fmt_desc: When a proxy in front of radosgw is used for ssl termination, radosgw
+    does not know whether incoming http connections are secure. Enable
+    this option to trust the ``Forwarded`` and ``X-Forwarded-Proto`` headers
+    sent by the proxy when determining whether the connection is secure.
+    This is required for some features, such as server side encryption.
+    (Never enable this setting if you do not have a trusted proxy in front of
+    radosgw, or else malicious users will be able to set these headers in
+    any request.)
+  default: false
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_require_ssl
+  with_legacy: true
+- name: rgw_crypt_require_ssl
+  type: bool
+  level: advanced
+  desc: Requests including encryption key headers must be sent over ssl
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+# base64 encoded key for encryption of rgw objects
+- name: rgw_crypt_default_encryption_key
+  type: str
+  level: dev
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_s3_kms_backend
+  type: str
+  level: advanced
+  desc: Where the SSE-KMS encryption keys are stored. Supported KMS systems are OpenStack
+    Barbican ('barbican', the default) and HashiCorp Vault ('vault').
+  fmt_desc: Where the SSE-KMS encryption keys are stored. Supported KMS
+    systems are OpenStack Barbican (``barbican``, the default) and
+    HashiCorp Vault (``vault``).
+  default: barbican
+  services:
+  - rgw
+  enum_values:
+  - barbican
+  - vault
+  - testing
+  - kmip
+  with_legacy: true
+# extra keys that may be used for aws:kms
+# defined as map "key1=YmluCmJvb3N0CmJvb3N0LQ== key2=b3V0CnNyYwpUZXN0aW5nCg=="
+- name: rgw_crypt_s3_kms_encryption_keys
+  type: str
+  level: dev
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_vault_auth
+  type: str
+  level: advanced
+  desc: Type of authentication method to be used with Vault.
+  fmt_desc: Type of authentication method to be used. The only method
+    currently supported is ``token``.
+  default: token
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_s3_kms_backend
+  - rgw_crypt_vault_addr
+  - rgw_crypt_vault_token_file
+  enum_values:
+  - token
+  - agent
+  with_legacy: true
+- name: rgw_crypt_vault_token_file
+  type: str
+  level: advanced
+  desc: If authentication method is 'token', provide a path to the token file, which
+    for security reasons should be readable only by Rados Gateway.
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_s3_kms_backend
+  - rgw_crypt_vault_auth
+  - rgw_crypt_vault_addr
+  with_legacy: true
+- name: rgw_crypt_vault_addr
+  type: str
+  level: advanced
+  desc: Vault server base address.
+  fmt_desc: Vault server base address, e.g. ``http://vaultserver:8200``.
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_s3_kms_backend
+  - rgw_crypt_vault_auth
+  - rgw_crypt_vault_prefix
+  with_legacy: true
+# Optional URL prefix to Vault secret path
+- name: rgw_crypt_vault_prefix
+  type: str
+  level: advanced
+  desc: Vault secret URL prefix, which can be used to restrict access to a particular
+    subset of the Vault secret space.
+  fmt_desc: The Vault secret URL prefix, which can be used to restrict access
+    to a particular subset of the secret space, e.g. ``/v1/secret/data``.
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_s3_kms_backend
+  - rgw_crypt_vault_addr
+  - rgw_crypt_vault_auth
+  with_legacy: true
+# kv, transit or other supported secret engines
+- name: rgw_crypt_vault_secret_engine
+  type: str
+  level: advanced
+  desc: Vault Secret Engine to be used to retrieve encryption keys.
+  fmt_desc: |
+    Vault Secret Engine to be used to retrieve encryption keys: choose
+    between kv-v2, transit.
+  default: transit
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_s3_kms_backend
+  - rgw_crypt_vault_auth
+  - rgw_crypt_vault_addr
+  with_legacy: true
+#  Vault Namespace (only available in Vault Enterprise Version)
+- name: rgw_crypt_vault_namespace
+  type: str
+  level: advanced
+  desc: Vault Namespace to be used to select your tenant
+  fmt_desc: If set, Vault Namespace provides tenant isolation for teams and individuals
+    on the same Vault Enterprise instance, e.g. ``acme/tenant1``
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_s3_kms_backend
+  - rgw_crypt_vault_auth
+  - rgw_crypt_vault_addr
+  with_legacy: true
+# Enable TLS authentication between rgw and vault
+- name: rgw_crypt_vault_verify_ssl
+  type: bool
+  level: advanced
+  desc: Should RGW verify the vault server SSL certificate.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+# TLS certs options
+- name: rgw_crypt_vault_ssl_cacert
+  type: str
+  level: advanced
+  desc: Path for custom ca certificate for accessing vault server
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_vault_ssl_clientcert
+  type: str
+  level: advanced
+  desc: Path for custom client certificate for accessing vault server
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_vault_ssl_clientkey
+  type: str
+  level: advanced
+  desc: Path for private key required for client cert
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_kmip_addr
+  type: str
+  level: advanced
+  desc: kmip server address
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_kmip_ca_path
+  type: str
+  level: advanced
+  desc: ca for kmip servers
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_kmip_username
+  type: str
+  level: advanced
+  desc: when authenticating via username
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_kmip_password
+  type: str
+  level: advanced
+  desc: optional w/ username
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_kmip_client_cert
+  type: str
+  level: advanced
+  desc: connect using client certificate
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_kmip_client_key
+  type: str
+  level: advanced
+  desc: connect using client certificate
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_kmip_kms_key_template
+  type: str
+  level: advanced
+  desc: sse-kms; kmip key names
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_kmip_s3_key_template
+  type: str
+  level: advanced
+  desc: sse-s3; kmip key template
+  default: $keyid
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_suppress_logs
+  type: bool
+  level: advanced
+  desc: Suppress logs that might print client key
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_sse_s3_backend
+  type: str
+  level: advanced
+  desc: Where the SSE-S3 encryption keys are stored. The only valid choice here is
+    HashiCorp Vault ('vault').
+  fmt_desc: Where the SSE-S3 encryption keys are stored. The only valid
+    choice is HashiCorp Vault (``vault``).
+  default: vault
+  services:
+  - rgw
+  enum_values:
+  - vault
+  with_legacy: true
+
+- name: rgw_crypt_sse_s3_vault_secret_engine
+  type: str
+  level: advanced
+  desc: Vault Secret Engine to be used to retrieve encryption keys.
+  fmt_desc: |
+    Vault Secret Engine to be used to retrieve encryption keys.  The
+    only valid choice here is transit.
+  default: transit
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_sse_s3_backend
+  - rgw_crypt_sse_s3_vault_auth
+  - rgw_crypt_sse_s3_vault_addr
+  with_legacy: true
+- name: rgw_crypt_sse_s3_key_template
+  type: str
+  level: advanced
+  desc: template for per-bucket sse-s3 keys in vault.
+  long_desc: This is the template for per-bucket sse-s3 keys.
+    This string may include ``%bucket_id`` which will be expanded out to
+    the bucket marker, a unique uuid assigned to that bucket.
+    It could contain ``%owner_id``, which will expand out to the owner's id.
+    Any other use of % is reserved and should not be used.
+    If the template contains ``%bucket_id``, associated bucket keys
+    will be automatically removed when the bucket is removed.
+  services:
+  - rgw
+  default: "%bucket_id"
+  see_also:
+  - rgw_crypt_sse_s3_backend
+  - rgw_crypt_sse_s3_vault_auth
+  - rgw_crypt_sse_s3_vault_addr
+  with_legacy: true
+- name: rgw_crypt_sse_s3_vault_auth
+  type: str
+  level: advanced
+  desc: Type of authentication method to be used with SSE-S3 and Vault.
+  fmt_desc: Type of authentication method to be used. The only method
+    currently supported is ``token``.
+  default: token
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_sse_s3_backend
+  - rgw_crypt_sse_s3_vault_addr
+  - rgw_crypt_sse_s3_vault_token_file
+  enum_values:
+  - token
+  - agent
+  with_legacy: true
+- name: rgw_crypt_sse_s3_vault_token_file
+  type: str
+  level: advanced
+  desc: If authentication method is 'token', provide a path to the token file, which
+    for security reasons should be readable only by Rados Gateway.
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_sse_s3_backend
+  - rgw_crypt_sse_s3_vault_auth
+  - rgw_crypt_sse_s3_vault_addr
+  with_legacy: true
+- name: rgw_crypt_sse_s3_vault_addr
+  type: str
+  level: advanced
+  desc: SSE-S3 Vault server base address.
+  fmt_desc: Vault server base address, e.g. ``http://vaultserver:8200``.
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_sse_s3_backend
+  - rgw_crypt_sse_s3_vault_auth
+  - rgw_crypt_sse_s3_vault_prefix
+  with_legacy: true
+# Optional URL prefix to Vault secret path
+- name: rgw_crypt_sse_s3_vault_prefix
+  type: str
+  level: advanced
+  desc: SSE-S3 Vault secret URL prefix, which can be used to restrict access to a particular
+    subset of the Vault secret space.
+  fmt_desc: The Vault secret URL prefix, which can be used to restrict access
+    to a particular subset of the secret space, e.g. ``/v1/secret/data``.
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_sse_s3_backend
+  - rgw_crypt_sse_s3_vault_addr
+  - rgw_crypt_sse_s3_vault_auth
+  with_legacy: true
+#  Vault Namespace (only available in Vault Enterprise Version)
+- name: rgw_crypt_sse_s3_vault_namespace
+  type: str
+  level: advanced
+  desc: Vault Namespace to be used to select your tenant
+  fmt_desc: If set, Vault Namespace provides tenant isolation for teams and individuals
+    on the same Vault Enterprise instance, e.g. ``acme/tenant1``
+  services:
+  - rgw
+  see_also:
+  - rgw_crypt_sse_s3_backend
+  - rgw_crypt_sse_s3_vault_auth
+  - rgw_crypt_sse_s3_vault_addr
+  with_legacy: true
+# Enable TLS authentication between rgw and vault
+- name: rgw_crypt_sse_s3_vault_verify_ssl
+  type: bool
+  level: advanced
+  desc: Should RGW verify the vault server SSL certificate.
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+# TLS certs options
+- name: rgw_crypt_sse_s3_vault_ssl_cacert
+  type: str
+  level: advanced
+  desc: Path for custom ca certificate for accessing vault server
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_sse_s3_vault_ssl_clientcert
+  type: str
+  level: advanced
+  desc: Path for custom client certificate for accessing vault server
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_crypt_sse_s3_vault_ssl_clientkey
+  type: str
+  level: advanced
+  desc: Path for private key required for client cert
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_list_bucket_min_readahead
+  type: int
+  level: advanced
+  desc: Minimum number of entries to request from rados for bucket listing
+  default: 1000
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_rest_getusage_op_compat
+  type: bool
+  level: advanced
+  desc: REST GetUsage request backward compatibility
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+# The following are tunables for torrent data
+- name: rgw_torrent_flag
+  type: bool
+  level: advanced
+  desc: When true, uploaded objects will calculate and store a SHA256 hash of object
+    data so the object can be retrieved as a torrent file
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_torrent_tracker
+  type: str
+  level: advanced
+  desc: Torrent field announce and announce list
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_torrent_createby
+  type: str
+  level: advanced
+  desc: torrent field created by
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_torrent_comment
+  type: str
+  level: advanced
+  desc: Torrent field comment
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_torrent_encoding
+  type: str
+  level: advanced
+  desc: torrent field encoding
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_data_notify_interval_msec
+  type: int
+  level: advanced
+  desc: data changes notification interval to followers
+  long_desc: In multisite, radosgw will occasionally broadcast new entries in its
+    data changes log to peer zones, so they can prioritize sync of some
+    of the most recent changes. Can be disabled with 0.
+  default: 0
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_torrent_origin
+  type: str
+  level: advanced
+  desc: Torrent origin
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_torrent_sha_unit
+  type: size
+  level: advanced
+  default: 512_K
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_dynamic_resharding
+  type: bool
+  level: basic
+  desc: Enable dynamic resharding
+  long_desc: If true, RGW will dynamically increase the number of shards in buckets
+    that have a high number of objects per shard.
+  default: true
+  services:
+  - rgw
+  see_also:
+  - rgw_max_objs_per_shard
+  - rgw_max_dynamic_shards
+- name: rgw_max_objs_per_shard
+  type: uint
+  level: basic
+  desc: Max objects per shard for dynamic resharding
+  long_desc: This is the max number of objects per bucket index shard that RGW will
+    allow with dynamic resharding. RGW will trigger an automatic reshard operation
+    on the bucket if it exceeds this number.
+  default: 100000
+  services:
+  - rgw
+  see_also:
+  - rgw_dynamic_resharding
+  - rgw_max_dynamic_shards
+- name: rgw_max_dynamic_shards
+  type: uint
+  level: advanced
+  desc: Max shards that dynamic resharding can create
+  long_desc: This is the maximum number of bucket index shards that dynamic sharding
+    is able to create on its own. This does not limit user requested resharding. Ideally
+    this value is a prime number.
+  default: 1999
+  services:
+  - rgw
+  see_also:
+  - rgw_dynamic_resharding
+  - rgw_max_objs_per_shard
+  min: 1
+- name: rgw_reshard_thread_interval
+  type: uint
+  level: advanced
+  desc: Number of seconds between processing of reshard log entries
+  default: 600
+  services:
+  - rgw
+  min: 10
+- name: rgw_cache_expiry_interval
+  type: uint
+  level: advanced
+  desc: Number of seconds before entries in the cache are assumed stale and re-fetched.
+    Zero is never.
+  long_desc: The Rados Gateway stores metadata and objects in an internal cache. This
+    should be kept consistent by the OSDs relaying notify events between multiple
+    watching RGW processes. In the event that this notification protocol fails, bounding
+    the length of time that any data in the cache will be assumed valid will ensure
+    that any RGW instance that falls out of sync will eventually recover. This seems
+    to be an issue mostly for large numbers of RGW instances under heavy use. If you
+    would like to turn off cache expiry, set this value to zero.
+  default: 900
+  tags:
+  - performance
+  services:
+  - rgw
+  - rgw
+- name: rgw_inject_notify_timeout_probability
+  type: float
+  level: dev
+  desc: Likelihood of ignoring a notify
+  long_desc: This is the probability that the RGW cache will ignore a cache notify
+    message. It exists to help with the development and testing of cache consistency
+    and recovery improvements. Please do not set it in a production cluster, as it
+    actively causes failures. Set this to a floating point value between 0 and 1.
+  default: 0
+  tags:
+  - fault injection
+  - testing
+  services:
+  - rgw
+  - rgw
+  min: 0
+  max: 1
+- name: rgw_max_notify_retries
+  type: uint
+  level: advanced
+  desc: Number of attempts to notify peers before giving up.
+  long_desc: The number of times we will attempt to update a peer's cache in the event
+    of error before giving up. This is unlikely to be an issue unless your cluster
+    is very heavily loaded. Beware that increasing this value may cause some operations
+    to take longer in exceptional cases and thus may, rarely, cause clients to time
+    out.
+  default: 10
+  tags:
+  - error recovery
+  services:
+  - rgw
+  - rgw
+- name: rgw_sts_entry
+  type: str
+  level: advanced
+  desc: STS URL prefix
+  long_desc: URL path prefix for internal STS requests.
+  default: sts
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sts_key
+  type: str
+  level: advanced
+  desc: STS Key
+  long_desc: Key used for encrypting/decrypting session token.
+  default: sts
+  services:
+  - rgw
+  with_legacy: true
+# should we try to use sts for s3?
+- name: rgw_s3_auth_use_sts
+  type: bool
+  level: advanced
+  desc: Should S3 authentication use STS.
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sts_max_session_duration
+  type: uint
+  level: advanced
+  desc: Session token max duration
+  long_desc: Max duration in seconds for which the session token is valid.
+  default: 43200
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sts_min_session_duration
+  type: uint
+  level: advanced
+  desc: Minimum allowed duration of a session
+  default: 900
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_max_listing_results
+  type: uint
+  level: advanced
+  desc: Upper bound on results in listing operations, ListBucket max-keys
+  long_desc: This caps the maximum permitted value for listing-like operations in
+    RGW S3. Affects ListBucket(max-keys), ListBucketVersions(max-keys), ListBucketMultipartUploads(max-uploads),
+    ListMultipartUploadParts(max-parts)
+  default: 1000
+  services:
+  - rgw
+  - rgw
+  min: 1
+  max: 100000
+- name: rgw_sts_token_introspection_url
+  type: str
+  level: advanced
+  desc: STS Web Token introspection URL
+  long_desc: URL for introspecting an STS Web Token.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sts_client_id
+  type: str
+  level: advanced
+  desc: Client Id
+  long_desc: Client Id needed for introspecting a Web Token.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_sts_client_secret
+  type: str
+  level: advanced
+  desc: Client Secret
+  long_desc: Client Secret needed for introspecting a Web Token.
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_max_concurrent_requests
+  type: int
+  level: basic
+  desc: Maximum number of concurrent HTTP requests.
+  long_desc: Maximum number of concurrent HTTP requests that the beast frontend will
+    process. Tuning this can help to limit memory usage under heavy load.
+  default: 1024
+  tags:
+  - performance
+  services:
+  - rgw
+  see_also:
+  - rgw_frontends
+- name: rgw_scheduler_type
+  type: str
+  level: advanced
+  desc: Set the type of scheduler. Defaults to throttler; the other valid value
+    is dmclock, which is experimental
+  fmt_desc: |
+    The RGW scheduler to use. Valid values are ``throttler`` and
+    ``dmclock``. Currently defaults to ``throttler`` which throttles Beast
+    frontend requests. ``dmclock`` is *experimental* and requires the
+    ``dmclock`` to be included in the ``experimental_feature_enabled``
+    configuration option.
+
+    The options below tune the experimental dmclock scheduler. For
+    additional reading on dmclock, see :ref:`dmclock-qos`. `op_class` for the flags below is
+    one of ``admin``, ``auth``, ``metadata``, or ``data``.
+  default: throttler
+  services:
+  - rgw
+- name: rgw_dmclock_admin_res
+  type: float
+  level: advanced
+  desc: mclock reservation for admin requests
+  default: 100
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_admin_wgt
+  - rgw_dmclock_admin_lim
+- name: rgw_dmclock_admin_wgt
+  type: float
+  level: advanced
+  desc: mclock weight for admin requests
+  default: 100
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_admin_res
+  - rgw_dmclock_admin_lim
+- name: rgw_dmclock_admin_lim
+  type: float
+  level: advanced
+  desc: mclock limit for admin requests
+  default: 0
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_admin_res
+  - rgw_dmclock_admin_wgt
+- name: rgw_dmclock_auth_res
+  type: float
+  level: advanced
+  desc: mclock reservation for auth requests
+  default: 200
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_auth_wgt
+  - rgw_dmclock_auth_lim
+- name: rgw_dmclock_auth_wgt
+  type: float
+  level: advanced
+  desc: mclock weight for auth requests
+  default: 100
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_auth_res
+  - rgw_dmclock_auth_lim
+- name: rgw_dmclock_auth_lim
+  type: float
+  level: advanced
+  desc: mclock limit for auth requests
+  default: 0
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_auth_res
+  - rgw_dmclock_auth_wgt
+- name: rgw_dmclock_data_res
+  type: float
+  level: advanced
+  desc: mclock reservation for object data requests
+  default: 500
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_data_wgt
+  - rgw_dmclock_data_lim
+- name: rgw_dmclock_data_wgt
+  type: float
+  level: advanced
+  desc: mclock weight for object data requests
+  default: 500
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_data_res
+  - rgw_dmclock_data_lim
+- name: rgw_dmclock_data_lim
+  type: float
+  level: advanced
+  desc: mclock limit for object data requests
+  default: 0
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_data_res
+  - rgw_dmclock_data_wgt
+- name: rgw_dmclock_metadata_res
+  type: float
+  level: advanced
+  desc: mclock reservation for metadata requests
+  default: 500
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_metadata_wgt
+  - rgw_dmclock_metadata_lim
+- name: rgw_dmclock_metadata_wgt
+  type: float
+  level: advanced
+  desc: mclock weight for metadata requests
+  default: 500
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_metadata_res
+  - rgw_dmclock_metadata_lim
+- name: rgw_dmclock_metadata_lim
+  type: float
+  level: advanced
+  desc: mclock limit for metadata requests
+  default: 0
+  services:
+  - rgw
+  see_also:
+  - rgw_dmclock_metadata_res
+  - rgw_dmclock_metadata_wgt
+- name: rgw_default_data_log_backing
+  type: str
+  level: advanced
+  desc: Default backing store for the RGW data sync log
+  long_desc: Whether to use the older OMAP backing store or the high performance FIFO
+    based backing store by default. This only covers the creation of the log on startup
+    if none exists.
+  default: fifo
+  services:
+  - rgw
+  enum_values:
+  - fifo
+  - omap
+- name: rgw_d3n_l1_local_datacache_enabled
+  type: bool
+  level: advanced
+  desc: Enable datacenter-scale dataset delivery local cache
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_d3n_l1_datacache_persistent_path
+  type: str
+  level: advanced
+  desc: path for the directory for storing the local cache objects data
+  default: /tmp/rgw_datacache/
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_d3n_l1_datacache_size
+  type: size
+  level: advanced
+  desc: datacache maximum size on disk in bytes
+  default: 1_G
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_d3n_l1_evict_cache_on_start
+  type: bool
+  level: advanced
+  desc: clear the content of the persistent data cache directory on start
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_d3n_l1_fadvise
+  type: int
+  level: advanced
+  desc: posix_fadvise() flag for access pattern of cache files
+  long_desc: for example to bypass the page-cache -
+    POSIX_FADV_DONTNEED=4
+  default: 4
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_d3n_l1_eviction_policy
+  type: str
+  level: advanced
+  desc: select the d3n cache eviction policy
+  default: lru
+  services:
+  - rgw
+  enum_values:
+  - lru
+  - random
+  with_legacy: true
+- name: rgw_d3n_libaio_aio_threads
+  type: int
+  level: advanced
+  desc: specifies the maximum number of worker threads that may be used by libaio
+  default: 20
+  services:
+  - rgw
+  see_also:
+  - rgw_thread_pool_size
+  with_legacy: true
+- name: rgw_d3n_libaio_aio_num
+  type: int
+  level: advanced
+  desc: specifies the maximum number of simultaneous I/O requests that libaio expects to enqueue
+  default: 64
+  services:
+  - rgw
+  see_also:
+  - rgw_thread_pool_size
+  with_legacy: true
+- name: rgw_backend_store
+  type: str
+  level: advanced
+  desc: experimental Option to set backend store type
+  long_desc: defaults to rados. Other valid values are dbstore, motr, and daos (All experimental).
+  default: rados
+  services:
+  - rgw
+  enum_values:
+  - rados
+  - dbstore
+  - motr
+  - daos
+- name: rgw_config_store
+  type: str
+  level: advanced
+  desc: Configuration storage backend
+  default: rados
+  services:
+  - rgw
+  enum_values:
+  - rados
+  - dbstore
+  - json
+- name: rgw_filter
+  type: str
+  level: advanced
+  desc: experimental Option to set a filter
+  long_desc: defaults to none. Other valid values are base and trace (both experimental).
+  default: none
+  services:
+  - rgw
+  enum_values:
+  - none
+  - base
+  - trace
+- name: dbstore_db_dir
+  type: str
+  level: advanced
+  desc: path for the directory for storing the db backend store data
+  default: /var/lib/ceph/radosgw
+  services:
+  - rgw
+- name: dbstore_db_name_prefix
+  type: str
+  level: advanced
+  desc: prefix to the file names created by db backend store
+  default: dbstore
+  services:
+  - rgw
+- name: dbstore_config_uri
+  type: str
+  level: advanced
+  desc: 'Config database URI. URIs beginning with file: refer to local files opened with SQLite.'
+  default: file:/var/lib/ceph/radosgw/dbstore-config.db
+  see_also:
+  - rgw_config_store
+  services:
+  - rgw
+- name: rgw_json_config
+  type: str
+  level: advanced
+  desc: Path to a json file that contains the static zone and zonegroup configuration. Requires rgw_config_store=json.
+  default: /var/lib/ceph/radosgw/config.json
+  see_also:
+  - rgw_config_store
+  services:
+  - rgw
+- name: motr_profile_fid
+  type: str
+  level: advanced
+  desc: experimental Option to set Motr profile fid
+  long_desc: example value 0x7000000000000001:0x4f
+  default: 0x7000000000000001:0x0
+  services:
+  - rgw
+- name: motr_my_fid
+  type: str
+  level: advanced
+  desc: experimental Option to set my Motr fid
+  long_desc: example value 0x7200000000000001:0x29
+  default: 0x7200000000000001:0x0
+  services:
+  - rgw
+- name: motr_admin_fid
+  type: str
+  level: advanced
+  desc: Admin Tool Motr FID for admin-level access.
+  long_desc: example value 0x7200000000000001:0x2c
+  default: 0x7200000000000001:0x0
+  services:
+  - rgw
+- name: motr_admin_endpoint
+  type: str
+  level: advanced
+  desc:  experimental Option to set Admin Motr endpoint address
+  long_desc: example value 192.168.180.182@tcp:12345:4:1
+  default: 192.168.180.182@tcp:12345:4:1
+  services:
+  - rgw
+- name: motr_my_endpoint
+  type: str
+  level: advanced
+  desc: experimental Option to set my Motr endpoint address
+  long_desc: example value 192.168.180.182@tcp:12345:4:1
+  default: 192.168.180.182@tcp:12345:4:1
+  services:
+  - rgw
+- name: motr_ha_endpoint
+  type: str
+  level: advanced
+  desc: experimental Option to set Motr HA agent endpoint address
+  long_desc: example value 192.168.180.182@tcp:12345:1:1
+  default: 192.168.180.182@tcp:12345:1:1
+  services:
+  - rgw
+- name: motr_tracing_enabled
+  type: bool
+  level: advanced
+  desc: Set to true when Motr client debugging is needed
+  default: false
+  services:
+  - rgw
+- name: rgw_luarocks_location
+  type: str
+  level: advanced
+  desc: Directory where luarocks installs packages from the allowlist
+  default: @rgw_luarocks_location@
+  services:
+  - rgw
+  flags:
+  - startup
+- name: rgwlc_auto_session_clear
+  type: bool
+  level: advanced
+  desc: Automatically clear stale lifecycle sessions (i.e., after 2 idle processing cycles)
+  default: true
+  services:
+  - rgw
+  with_legacy: true
+- name: rgwlc_skip_bucket_step
+  type: bool
+  level: advanced
+  desc: Conditionally skip the processing (but not the scheduling) of bucket lifecycle
+  default: false
+  services:
+  - rgw
+  with_legacy: true
+- name: rgw_pending_bucket_index_op_expiration
+  type: uint
+  level: advanced
+  default: 120
+  desc: Number of seconds a pending operation can remain in bucket index shard.
+  long_desc: Number of seconds a pending operation can remain in bucket
+    index shard before it expires. Used for transactional bucket index
+    operations, and if the operation does not complete in this time
+    period, the operation will be dropped.
+  services:
+  - rgw
+  - osd
+  with_legacy: true
+- name: rgw_bucket_index_transaction_instrumentation
+  type: bool
+  level: dev
+  default: false
+  desc: Turns on extra instrumentation surrounding bucket index transactions.
+  services:
+  - rgw
+  - osd
+  with_legacy: true
+- name: rgw_allow_notification_secrets_in_cleartext
+  type: bool
+  level: advanced
+  desc: Allows sending secrets (e.g. passwords) over non encrypted HTTP messages.
+  long_desc: When bucket notification endpoints require secrets (e.g. passwords),
+    we allow the topic creation only over HTTPS messages.
+    This parameter can be set to "true" to bypass this check.
+    Use this only if radosgw is on a trusted private network, and the message
+    broker cannot be configured without password authentication. Otherwise, this will
+    leak the credentials of your message broker and compromise its security.
+  default: false
+  services:
+  - rgw
+  see_also:
+  - rgw_trust_forwarded_https
+- name: daos_pool
+  type: str
+  level: advanced
+  desc: DAOS Pool to use
+  default: tank
+  services:
+  - rgw
+- name: rgw_policy_reject_invalid_principals
+  type: bool
+  level: basic
+  desc: Whether to reject policies with invalid principals
+  long_desc: If true, policies with invalid principals will be
+    rejected. We don't support Canonical User identifiers or some
+    other form of policies that Amazon does, so if you are mirroring
+    policies between RGW and AWS, you may wish to set this to false.
+  default: true
+  services:
+  - rgw
diff --git a/src/common/options/validate-options.py b/src/common/options/validate-options.py
new file mode 100755
index 000000000..5bc5d4d46
--- /dev/null
+++ b/src/common/options/validate-options.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+import argparse
+import fileinput
+import sys
+import yaml
+from typing import Any, Dict
+
+
+class ValidationError(Exception):
+    pass
+
+
+OptionType = Dict[str, Any]
+
+
+def validate_see_also(opt: OptionType, opts: Dict[str, OptionType]) -> None:
+    see_also = opt.get('see_also')
+    if see_also is None:
+        return
+    for ref in see_also:
+        if ref not in opts:
+            msg = f'see_also contains "{ref}". But it is not found.'
+            raise ValidationError(msg)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('yamls', nargs='*')
+    opts = parser.parse_args()
+    options = {}
+    for yaml_file in opts.yamls:
+        with open(yaml_file) as f:
+            yml = yaml.load(f, yaml.SafeLoader)
+            options.update({opt['name']: opt for opt in yml['options']})
+    for name, opt in options.items():
+        try:
+            validate_see_also(opt, options)
+        except ValidationError as e:
+            raise Exception(f'failed to validate "{name}": {e}')
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except Exception as e:  # report on stderr, exit status 1
+        print(e, file=sys.stderr)
+        sys.exit(1)
diff --git a/src/common/options/y2c.py b/src/common/options/y2c.py
new file mode 100755
index 000000000..0b64bec58
--- /dev/null
+++ b/src/common/options/y2c.py
@@ -0,0 +1,366 @@
+#!/usr/bin/env python3
+
+import yaml
+import argparse
+import math
+import os
+import sys
+
+# flake8: noqa: E127
+
+def type_to_cxx(t):
+    return f'Option::TYPE_{t.upper()}'
+
+
+def level_to_cxx(lv):
+    return f'Option::LEVEL_{lv.upper()}'
+
+
+def eval_str(v):
+    if v == "":
+        return v
+    v = v.strip('"').replace('"', '\\"')
+    return f'"{v}"'
+
+
+def eval_value(v, typ):
+    try:
+        if typ == 'str':
+            return eval_str(v)
+        if typ == 'float':
+            return float(v)
+        if typ in ('uint', 'int', 'size', 'secs', 'millisecs'):
+            return int(v)
+        if typ == 'bool':
+            return 'true' if v else 'false'
+        else:
+            return f'"{v}"'
+    except ValueError:
+        times = dict(_min=60,
+                     _hr=60*60,
+                     _day=24*60*60,
+                     _K=1 << 10,
+                     _M=1 << 20,
+                     _G=1 << 30,
+                     _T=1 << 40)
+        for unit, m in times.items():
+            if v.endswith(unit):
+                int(v[:-len(unit)])
+                # user defined literals
+                return v
+        raise ValueError(f'unknown value: {v}')
+
+
+def set_default(default, typ):
+    if default is None:
+        return ''
+    v = eval_value(default, typ)
+    return f'.set_default({v})\n'
+
+
+def set_daemon_default(default, typ):
+    if default is None:
+        return ''
+    v = eval_value(default, typ)
+    return f'.set_daemon_default({v})\n'
+
+
+def add_tags(tags):
+    if tags is None:
+        return ''
+    cxx = ''
+    for tag in tags:
+        v = eval_str(tag)
+        cxx += f'.add_tag({v})\n'
+    return cxx
+
+
+def add_services(services):
+    if services is None:
+        return ''
+    if len(services) == 1:
+        return f'.add_service("{services[0]}")\n'
+    else:
+        param = ', '.join(f'"{s}"' for s in services)
+        return f'.add_service({{{param}}})\n'
+
+
+def add_see_also(see_also):
+    if see_also is None:
+        return ''
+    param = ', '.join(f'"{v}"' for v in see_also)
+    return f'.add_see_also({{{param}}})\n'
+
+
+def set_desc(desc):
+    if desc is None:
+        return ''
+    v = eval_str(desc)
+    return f'.set_description({v})\n'
+
+
+def set_long_desc(desc):
+    if desc is None:
+        return ''
+    v = eval_str(desc)
+    return f'.set_long_description({v})\n'
+
+
+def set_min_max(mi, ma, typ):
+    if mi is None and ma is None:
+        return ''
+    if mi is not None and ma is not None:
+        min_v = eval_value(mi, typ)
+        max_v = eval_value(ma, typ)
+        if isinstance(min_v, str) and isinstance(max_v, int):
+            return f'.set_min_max({min_v}, {max_v}ULL)\n'
+        elif isinstance(min_v, int) and isinstance(max_v, str):
+            return f'.set_min_max({min_v}ULL, {max_v})\n'
+        else:
+            return f'.set_min_max({min_v}, {max_v})\n'
+    if mi is not None:
+        min_v = eval_value(mi, typ)
+        return f'.set_min({min_v})\n'
+    raise ValueError('set_max() is not implemented')
+
+
+def set_enum_allowed(values):
+    if values is None:
+        return ''
+    param = ', '.join(f'"{v}"' for v in values)
+    return f'.set_enum_allowed({{{param}}})\n'
+
+
+def add_flags(flags):
+    if flags is None:
+        return ''
+    cxx = ''
+    for flag in flags:
+        cxx += f'.set_flag(Option::FLAG_{flag.upper()})\n'
+    return cxx
+
+
+def set_validator(validator):
+    if validator is None:
+        return ''
+    validator = validator.rstrip()
+    return f'.set_validator({validator})\n'
+
+
+def add_verbatim(verbatim):
+    if verbatim is None:
+        return ''
+    return verbatim + '\n'
+
+
+def yaml_to_cxx(opt, indent):
+    name = opt['name']
+    typ = opt['type']
+    ctyp = type_to_cxx(typ)
+    level = level_to_cxx(opt['level'])
+    cxx = f'Option("{name}", {ctyp}, {level})\n'
+    cxx += set_desc(opt.get('desc'))
+    cxx += set_long_desc(opt.get('long_desc'))
+    cxx += set_default(opt.get('default'), typ)
+    cxx += set_daemon_default(opt.get('daemon_default'), typ)
+    cxx += set_min_max(opt.get('min'), opt.get('max'), typ)
+    cxx += set_enum_allowed(opt.get('enum_values'))
+    cxx += set_validator(opt.get('validator'))
+    cxx += add_flags(opt.get('flags'))
+    cxx += add_services(opt.get('services'))
+    cxx += add_tags(opt.get('tags'))
+    cxx += add_see_also(opt.get('see_also'))
+    verbatim = add_verbatim(opt.get('verbatim'))
+    cxx += verbatim
+    if verbatim:
+        cxx += '\n'
+    else:
+        cxx = cxx.rstrip()
+    cxx += ',\n'
+    if indent > 0:
+        indented = []
+        for line in cxx.split('\n'):
+            if line:
+                indented.append(' ' * indent + line + '\n')
+        cxx = ''.join(indented)
+    return cxx
+
+
+def type_to_h(t):
+    if t == 'uint':
+        return 'OPT_U32'
+    return f'OPT_{t.upper()}'
+
+
+def yaml_to_h(opt):
+    if opt.get('with_legacy', False):
+        name = opt['name']
+        typ = opt['type']
+        htyp = type_to_h(typ)
+        return f'OPTION({name}, {htyp})'
+    else:
+        return ''
+
+
+TEMPLATE_CC = '''#include "common/options.h"
+{headers}
+
+std::vector<Option> get_{name}_options() {{
+  return std::vector<Option>({{
+@body@
+  }});
+}}
+'''  # translate() splits at @body@; doubled braces end up single
+
+
+# PyYAML doesn't check for duplicates even though the YAML spec says
+# that mapping keys must be unique and that duplicates must be treated
+# as an error.  See https://github.com/yaml/pyyaml/issues/165.
+#
+# This workaround breaks merge keys -- in "<<: *xyz", duplicate keys
+# from xyz mapping raise an error instead of being discarded.
+class UniqueKeySafeLoader(yaml.SafeLoader):
+    def construct_mapping(self, node, deep=False):
+        mapping = super().construct_mapping(node, deep)
+        keys = set()
+        for key_node, _ in node.value:
+            key = self.construct_object(key_node, deep=deep)
+            if key in keys:
+                raise yaml.constructor.ConstructorError(None, None,
+                                                        "found duplicate key",
+                                                        key_node.start_mark)
+            keys.add(key)
+        return mapping
+
+
+def translate(opts):
+    if opts.raw:
+        prelude, epilogue = '', ''
+    else:
+        prelude, epilogue = TEMPLATE_CC.split('@body@')
+
+    if opts.name:
+        name = opts.name
+    else:
+        name = os.path.split(opts.input)[-1]
+        name = name.rsplit('.', 1)[0]
+    name = name.replace('-', '_')
+    # noqa: E127
+    with open(opts.input) as infile, \
+         open(opts.output, 'w') as cc_file, \
+         open(opts.legacy, 'w') as h_file:
+        yml = yaml.load(infile, Loader=UniqueKeySafeLoader)
+        headers = yml.get('headers', '')
+        cc_file.write(prelude.format(name=name, headers=headers))
+        options = yml['options']
+        for option in options:
+            try:
+                cc_file.write(yaml_to_cxx(option, opts.indent) + '\n')
+                if option.get('with_legacy', False):
+                    h_file.write(yaml_to_h(option) + '\n')
+            except ValueError as e:
+                print(f'failed to translate option "{name}": {e}',
+                      file=sys.stderr)
+                return 1
+        cc_file.write(epilogue.replace("}}", "}"))
+
+
+def readable_size(value, typ):
+    times = dict(T=1 << 40,
+                 G=1 << 30,
+                 M=1 << 20,
+                 K=1 << 10)
+    if isinstance(value, str):
+        value = value.strip('"')
+    try:
+        v = int(value)
+        if v == 0:
+            return 0
+        for unit, m in times.items():
+            if v % m == 0:
+                v = int(v / m)
+                return f'{v}_{unit}'
+        return v
+    except ValueError:
+        return value
+
+
+def readable_duration(value, typ):
+    times = dict(day=24*60*60,
+                 hr=60*60,
+                 min=60)
+    if isinstance(value, str):
+        value = value.strip('"')
+    try:
+        v = float(value)
+        if math.floor(v) != v:
+            return v
+        v = int(v)
+        if v == 0:
+            return 0
+        for unit, m in times.items():
+            if v % m == 0:
+                v = int(v / m)
+                return f'{v}_{unit}'
+        return v
+    except ValueError:
+        return value
+
+
+def readable_millisecs(value, typ):  # no unit rewrite: just coerce to int
+    return int(value)
+
+
+def readable(opts):
+    with open(opts.input) as infile, open(opts.output, 'w') as outfile:
+        yml = yaml.load(infile, Loader=UniqueKeySafeLoader)
+        options = yml['options']
+        for option in options:
+            typ = option['type']
+            if typ in ('size', 'uint'):
+                do_readable = readable_size
+            elif typ in ('float', 'int', 'secs'):
+                do_readable = readable_duration
+            elif typ == 'millisecs':
+                do_readable = readable_millisecs
+            else:
+                continue
+            for field in ['default', 'min', 'max', 'daemon_default']:
+                v = option.get(field)
+                if v is not None:
+                    option[field] = do_readable(v, typ)
+        yml['options'] = options
+        yaml.dump(yml, outfile, sort_keys=False, indent=2)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('-i', '--input', dest='input',
+                        default='options.yaml',
+                        help='the YAML file to be processed')
+    parser.add_argument('-o', '--output', dest='output',
+                        default='options',
+                        help='the path to the generated .cc file')
+    parser.add_argument('--legacy', dest='legacy',
+                        default='legacy_options',
+                        help='the path to the generated legacy .h file')
+    parser.add_argument('--indent', type=int,
+                        default=4,
+                        help='the number of spaces added before each line')
+    parser.add_argument('--name',
+                        help='the name of the option group')
+    parser.add_argument('--raw', action='store_true',
+                        help='output the array without the full function')
+    parser.add_argument('--op', choices=('readable', 'translate'),
+                        default='translate',
+                        help='operation to perform.')
+    opts = parser.parse_args(sys.argv[1:])
+    if opts.op == 'translate':
+        translate(opts)
+    else:
+        readable(opts)
+
+
+if __name__ == '__main__':  # run only when executed as a script
+    main()
diff --git a/src/common/ostream_temp.cc b/src/common/ostream_temp.cc
new file mode 100644
index 000000000..61ae5b741
--- /dev/null
+++ b/src/common/ostream_temp.cc
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ostream_temp.h"
+
+OstreamTemp::OstreamTemp(clog_type type_, OstreamTempSink *parent_)
+  : type(type_), parent(parent_)  // ss is default-constructed, i.e. empty
+{  // only records the log type and the sink pointer
+}
+
+OstreamTemp::~OstreamTemp()
+{  // on destruction, hand the accumulated text to the sink
+  if (ss.peek() != EOF && parent)  // skip an empty stream and a null sink
+    parent->do_log(type, ss);
+}
diff --git a/src/common/ostream_temp.h b/src/common/ostream_temp.h
new file mode 100644
index 000000000..73e9e3f25
--- /dev/null
+++ b/src/common/ostream_temp.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <sstream>
+
+typedef enum {  // message classes; passed as `prio` to OstreamTempSink::do_log()
+  CLOG_DEBUG = 0,
+  CLOG_INFO = 1,
+  CLOG_SEC = 2,
+  CLOG_WARN = 3,
+  CLOG_ERROR = 4,
+  CLOG_UNKNOWN = -1,  // the only negative value
+} clog_type;
+
+class OstreamTemp  // buffers text written with << and holds the sink that will receive it
+{
+public:
+  class OstreamTempSink {  // receiver interface for the buffered text
+  public:
+    virtual void do_log(clog_type prio, std::stringstream& ss) = 0;
+    virtual ~OstreamTempSink() {}
+  };
+  OstreamTemp(clog_type type_, OstreamTempSink *parent_);
+  OstreamTemp(OstreamTemp &&rhs) = default;
+  ~OstreamTemp();
+  // Appends rhs to ss.  Returns std::ostream&, so chained << go straight to ss.
+  template<typename T>
+  std::ostream& operator<<(const T& rhs)
+  {
+    return ss << rhs;
+  }
+
+private:
+  clog_type type;           // passed to do_log() as prio
+  OstreamTempSink *parent;  // may be null; the destructor checks before use
+  std::stringstream ss;     // text collected so far
+};
+
+class LoggerSinkSet : public OstreamTemp::OstreamTempSink {  // sink with one entry point per message class
+public:
+  virtual void info(std::stringstream &s) = 0;  // overloads taking an already filled stream
+  virtual void warn(std::stringstream &s) = 0;
+  virtual void error(std::stringstream &s) = 0;
+  virtual void sec(std::stringstream &s) = 0;
+  virtual void debug(std::stringstream &s) = 0;
+  virtual OstreamTemp info() = 0;  // overloads returning an OstreamTemp to stream into
+  virtual OstreamTemp warn() = 0;
+  virtual OstreamTemp error() = 0;
+  virtual OstreamTemp sec() = 0;
+  virtual OstreamTemp debug() = 0;
+  virtual void do_log(clog_type prio, std::stringstream& ss) = 0;  // same signature as OstreamTempSink::do_log
+  virtual void do_log(clog_type prio, const std::string& ss) = 0;
+  virtual ~LoggerSinkSet() {};
+};
diff --git a/src/common/page.cc b/src/common/page.cc
new file mode 100644
index 000000000..e3dc34a43
--- /dev/null
+++ b/src/common/page.cc
@@ -0,0 +1,33 @@
+#include <unistd.h>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+namespace ceph {
+
+  // page size helpers, see page.h
+  // Number of bits needed to hold v (0 for 0); v is read as an unsigned bit
+  // pattern, because >> on a negative int may never reach 0 (endless loop).
+  int _get_bits_of(int v) {
+    int n = 0;
+    for (unsigned bits = static_cast<unsigned>(v); bits != 0; bits >>= 1)
+      n++;
+    return n;
+  }
+
+  #ifdef _WIN32
+  unsigned _get_page_size() {  // page size as reported by GetSystemInfo()
+    SYSTEM_INFO system_info;
+    GetSystemInfo(&system_info);
+    return system_info.dwPageSize;
+  }
+
+  unsigned _page_size = _get_page_size();
+  #else
+  unsigned _page_size = sysconf(_SC_PAGESIZE);
+  #endif
+  unsigned long _page_mask = ~(unsigned long)(_page_size - 1);  // all bits above (_page_size - 1)
+  unsigned _page_shift = _get_bits_of(_page_size - 1);  // bit width of (_page_size - 1)
+
+}
diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc
new file mode 100644
index 000000000..b5e361b50
--- /dev/null
+++ b/src/common/perf_counters.cc
@@ -0,0 +1,635 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/perf_counters.h"
+#include "common/perf_counters_key.h"
+#include "common/dout.h"
+#include "common/valgrind.h"
+#include "include/common_fwd.h"
+
+using std::ostringstream;
+using std::make_pair;
+using std::pair;
+
+namespace TOPNSPC::common {
+PerfCountersCollectionImpl::PerfCountersCollectionImpl()
+{  // nothing to do: m_loggers and by_path are default-constructed empty
+}
+
+PerfCountersCollectionImpl::~PerfCountersCollectionImpl()
+{
+  clear();  // deletes every PerfCounters that is still registered
+}
+
+// Registers l with the collection.  If another logger already uses the
+// same name, l is renamed first; afterwards each of its counters becomes
+// reachable through by_path.
+void PerfCountersCollectionImpl::add(PerfCounters *l)
+{
+  // m_loggers is ordered by name, so a hit in find() means the name is
+  // taken (find() compares names, not pointers).  Keep appending
+  // "-<address>" until the name is free.
+  for (auto clash = m_loggers.find(l); clash != m_loggers.end();
+       clash = m_loggers.find(l)) {
+    ostringstream renamed;
+    renamed << l->get_name() << "-" << (void*)l;
+    l->set_name(renamed.str());
+  }
+  m_loggers.insert(l);
+
+  // Index every counter under "<logger name>.<counter name>".
+  for (auto &data : l->m_data) {
+    std::string path = l->get_name();
+    path += ".";
+    path += data.name;
+    by_path[path] = {&data, l};
+  }
+}
+
+// Unregisters l: erases its "<logger name>.<counter name>" entries from
+// by_path and then takes l out of m_loggers.  The PerfCounters object
+// itself is not deleted here.
+void PerfCountersCollectionImpl::remove(PerfCounters *l)
+{
+  const std::string prefix = l->get_name() + ".";
+  for (const auto &data : l->m_data) {
+    by_path.erase(prefix + data.name);
+  }
+
+  // l must have been add()ed before; the lookup goes by name because
+  // that is how m_loggers is ordered.
+  const auto pos = m_loggers.find(l);
+  ceph_assert(pos != m_loggers.end());
+  m_loggers.erase(pos);
+}
+
+// Deletes every registered PerfCounters and leaves both m_loggers and
+// by_path empty.
+void PerfCountersCollectionImpl::clear()
+{
+  for (auto cur = m_loggers.begin(); cur != m_loggers.end(); ) {
+    delete *cur;
+    cur = m_loggers.erase(cur);  // erase() returns the following element
+  }
+
+  by_path.clear();
+}
+
+// Resets counters by logger name.
+//   name == "all": reset() every logger and return true, even when the
+//                  collection is empty.
+//   otherwise:     reset() the logger whose name equals `name`; return
+//                  whether such a logger exists.
+bool PerfCountersCollectionImpl::reset(const std::string &name)
+{
+  if (!strcmp(name.c_str(), "all")) {
+    for (PerfCounters *logger : m_loggers) {
+      logger->reset();
+    }
+    return true;
+  }
+
+  // Single-logger case: stop at the first logger whose name matches and
+  // leave all others untouched.
+  for (PerfCounters *logger : m_loggers) {
+    if (!name.compare(logger->get_name())) {
+      logger->reset();
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+/**
+ * Serialize current values of performance counters.  Optionally
+ * output the schema instead, or filter output to a particular
+ * PerfCounters or particular named counter.
+ *
+ * @param f formatter that receives the "perfcounter_collection" section
+ * @param schema if true, output schema instead of current data.
+ * @param histograms if true, dump histogram values,
+ *                   if false dump all non-histogram counters
+ * @param dump_labeled if true, group loggers into one array per key name;
+ *                     the logger and counter filters are then ignored.
+ * @param logger name of subsystem logger, e.g. "mds_cache", may be empty
+ * @param counter name of counter within subsystem, e.g. "num_strays",
+ *                may be empty.
+ */
+void PerfCountersCollectionImpl::dump_formatted_generic(
+    Formatter *f,
+    bool schema,
+    bool histograms,
+    bool dump_labeled,
+    const std::string &logger,
+    const std::string &counter) const
+{
+  f->open_object_section("perfcounter_collection");
+  if (!dump_labeled) {
+    for (const PerfCounters *pc : m_loggers) {
+      // Optionally filter on logger name, pass through counter filter
+      if (!logger.empty() && pc->get_name() != logger)
+        continue;
+      pc->dump_formatted_generic(f, schema, histograms, false, counter);
+    }
+    f->close_section();
+    return;
+  }
+
+  // Only consecutive loggers are compared, so grouping relies on loggers
+  // with the same key name being adjacent in the name-sorted m_loggers.
+  std::string open_key;  // key whose array section is currently open
+  for (const PerfCounters *pc : m_loggers) {
+    std::string_view key_name = ceph::perf_counters::key_name(pc->get_name());
+    if (key_name != open_key) {
+      if (!open_key.empty())
+        f->close_section(); // array section of the previous key
+      open_key = key_name;
+      f->open_array_section(key_name);
+    }
+    pc->dump_formatted_generic(f, schema, histograms, true, "");
+  }
+  if (!m_loggers.empty())
+    f->close_section(); // final array section
+  f->close_section();
+}
+
+void PerfCountersCollectionImpl::with_counters(std::function<void(
+      const PerfCountersCollectionImpl::CounterMap &)> fn) const
+{  // calls fn once, synchronously, with the path -> counter map; no locking here
+  fn(by_path);
+}
+
+// ---------------------------
+
+PerfCounters::~PerfCounters()
+{  // no explicit cleanup; members are destroyed by their own destructors
+}
+
+// Adds amt to the U64 counter idx; a long-running average also counts one
+// sample.  Does nothing for non-U64 counters or (non-Seastar builds) when
+// the perf config flag is off.
+void PerfCounters::inc(int idx, uint64_t amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  if (!(data.type & PERFCOUNTER_U64))
+    return;
+  const bool is_avg = (data.type & PERFCOUNTER_LONGRUNAVG) != 0;
+  // avgcount goes first and avgcount2 last: read_avg() compares the two
+  if (is_avg) data.avgcount++;
+  data.u64 += amt;
+  if (is_avg) data.avgcount2++;
+}
+
+void PerfCounters::dec(int idx, uint64_t amt)
+{  // subtracts amt from U64 counter idx; asserts that idx is not an average
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+  // valid indexes lie strictly between the bounds; m_lower_bound + 1 maps to slot 0
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  ceph_assert(!(data.type & PERFCOUNTER_LONGRUNAVG));
+  if (!(data.type & PERFCOUNTER_U64))
+    return;
+  data.u64 -= amt;
+}
+
+void PerfCounters::set(int idx, uint64_t amt)
+{  // overwrites U64 counter idx with amt; on an average this also counts one sample
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+  // valid indexes lie strictly between the bounds; m_lower_bound + 1 maps to slot 0
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  if (!(data.type & PERFCOUNTER_U64))
+    return;
+  // tell the race detector that unsynchronized writes to u64 are intended
+  ANNOTATE_BENIGN_RACE_SIZED(&data.u64, sizeof(data.u64),
+                             "perf counter atomic");
+  if (data.type & PERFCOUNTER_LONGRUNAVG) {
+    data.avgcount++;   // bumped before the value ...
+    data.u64 = amt;
+    data.avgcount2++;  // ... and after it; read_avg() compares the two counts
+  } else {
+    data.u64 = amt;
+  }
+}
+
+uint64_t PerfCounters::get(int idx) const
+{  // current value of U64 counter idx; 0 if idx is not a U64 counter
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return 0;  // perf flag off: report 0 instead of the stored value
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  const perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  if (!(data.type & PERFCOUNTER_U64))
+    return 0;
+  return data.u64;
+}
+
+// Adds the duration amt (converted to nanoseconds) to the TIME counter idx;
+// long-running averages also count the sample.  Ignored for non-TIME
+// counters and, outside Seastar builds, when the perf flag is off.
+void PerfCounters::tinc(int idx, utime_t amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  if (!(data.type & PERFCOUNTER_TIME))
+    return;
+  const bool is_avg = (data.type & PERFCOUNTER_LONGRUNAVG) != 0;
+  // avgcount goes first and avgcount2 last: read_avg() compares the two
+  if (is_avg) data.avgcount++;
+  data.u64 += amt.to_nsec();
+  if (is_avg) data.avgcount2++;
+}
+
+// Adds amt.count() ticks to the TIME counter idx (presumably nanoseconds,
+// matching the utime_t overload -- confirm against ceph::timespan); averages
+// also count the sample.  Ignored for non-TIME counters.
+void PerfCounters::tinc(int idx, ceph::timespan amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  if (!(data.type & PERFCOUNTER_TIME))
+    return;
+  const bool is_avg = (data.type & PERFCOUNTER_LONGRUNAVG) != 0;
+  // avgcount goes first and avgcount2 last: read_avg() compares the two
+  if (is_avg) data.avgcount++;
+  data.u64 += amt.count();
+  if (is_avg) data.avgcount2++;
+}
+
+void PerfCounters::tset(int idx, utime_t amt)
+{  // overwrites TIME counter idx with amt in nanoseconds; aborts on an average
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  if (!(data.type & PERFCOUNTER_TIME))
+    return;
+  data.u64 = amt.to_nsec();
+  if (data.type & PERFCOUNTER_LONGRUNAVG)
+    ceph_abort();  // note: the value above has already been stored by now
+}
+
+utime_t PerfCounters::tget(int idx) const
+{  // TIME counter idx as utime_t; a default utime_t if idx is not a TIME counter
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return utime_t();
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  const perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  if (!(data.type & PERFCOUNTER_TIME))
+    return utime_t();
+  uint64_t v = data.u64;  // tinc()/tset() store nanoseconds here
+  return utime_t(v / 1000000000ull, v % 1000000000ull);  // (seconds, nanoseconds)
+}
+
+void PerfCounters::hinc(int idx, int64_t x, int64_t y)
+{  // forwards the (x, y) pair to the histogram stored in counter idx
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  // the slot must have exactly the flags set by add_u64_counter_histogram()
+  perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  ceph_assert(data.type == (PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER | PERFCOUNTER_U64));
+  ceph_assert(data.histogram);
+
+  data.histogram->inc(x, y);
+}
+
+pair<uint64_t, uint64_t> PerfCounters::get_tavg_ns(int idx) const
+{  // returns (sample count, summed nanoseconds) of a TIME average; (0, 0) otherwise
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return make_pair(0, 0);
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  const perf_counter_data_any_d& data(m_data[idx - m_lower_bound - 1]);
+  if (!(data.type & PERFCOUNTER_TIME))
+    return make_pair(0, 0);
+  if (!(data.type & PERFCOUNTER_LONGRUNAVG))
+    return make_pair(0, 0);
+  pair<uint64_t,uint64_t> a = data.read_avg();  // read_avg() yields {sum, count}
+  return make_pair(a.second, a.first);  // swapped: count first, then sum
+}
+
+// Calls reset() on every counter slot of this logger.  Which values are
+// actually zeroed is decided per slot by perf_counter_data_any_d::reset().
+// The perf config flag is not consulted here.
+void PerfCounters::reset()
+{
+  for (auto &slot : m_data) {
+    slot.reset();
+  }
+
+}
+
+// Dumps this logger's counters (or, with schema set, their descriptions)
+// into f.  `histograms` selects histogram vs. non-histogram counters and a
+// non-empty `counter` restricts output to that one name.  A labeled
+// instance emits nothing at all unless dump_labeled is set.
+void PerfCounters::dump_formatted_generic(Formatter *f, bool schema,
+    bool histograms, bool dump_labeled, const std::string &counter) const
+{
+  if (dump_labeled) {
+    f->open_object_section(""); // should be enclosed by array
+    f->open_object_section("labels");
+    for (auto label : ceph::perf_counters::key_labels(m_name)) {
+      // don't dump labels with empty label names
+      if (!label.first.empty()) {
+        f->dump_string(label.first, label.second);
+      }
+    }
+    f->close_section(); // labels
+    f->open_object_section("counters");
+  } else {
+    auto labels = ceph::perf_counters::key_labels(m_name);
+    // do not dump counters when counter instance is labeled and dump_labeled is not set
+    if (labels.begin() != labels.end()) {
+      return;
+    }
+
+    f->open_object_section(m_name.c_str());
+  }
+  // one entry per counter that passes the name and histogram filters
+  for (perf_counter_data_vec_t::const_iterator d = m_data.begin();
+       d != m_data.end(); ++d) {
+    if (!counter.empty() && counter != d->name) {
+      // Optionally filter on counter name
+      continue;
+    }
+
+    // Switch between normal and histogram view
+    bool is_histogram = (d->type & PERFCOUNTER_HISTOGRAM) != 0;
+    if (is_histogram != histograms) {
+      continue;
+    }
+
+    if (schema) {
+      f->open_object_section(d->name);
+      // we probably should not have exposed this raw field (with bit
+      // values), but existing plugins rely on it so we're stuck with
+      // it.
+      f->dump_int("type", d->type);
+
+      if (d->type & PERFCOUNTER_COUNTER) {
+	f->dump_string("metric_type", "counter");
+      } else {
+	f->dump_string("metric_type", "gauge");
+      }
+
+      if (d->type & PERFCOUNTER_LONGRUNAVG) {
+	if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_string("value_type", "real-integer-pair");
+	} else {
+	  f->dump_string("value_type", "integer-integer-pair");
+	}
+      } else if (d->type & PERFCOUNTER_HISTOGRAM) {
+	if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_string("value_type", "real-2d-histogram");
+	} else {
+	  f->dump_string("value_type", "integer-2d-histogram");
+	}
+      } else {
+	if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_string("value_type", "real");
+	} else {
+	  f->dump_string("value_type", "integer");
+	}
+      }
+
+      f->dump_string("description", d->description ? d->description : "");
+      f->dump_string("nick", d->nick ? d->nick : "");
+      f->dump_int("priority", get_adjusted_priority(d->prio));
+
+      if (d->unit == UNIT_NONE) {
+	f->dump_string("units", "none");
+      } else if (d->unit == UNIT_BYTES) {
+	f->dump_string("units", "bytes");
+      }
+      f->close_section();
+    } else {
+      if (d->type & PERFCOUNTER_LONGRUNAVG) {
+	f->open_object_section(d->name);
+	pair<uint64_t,uint64_t> a = d->read_avg();
+	if (d->type & PERFCOUNTER_U64) {
+	  f->dump_unsigned("avgcount", a.second);
+	  f->dump_unsigned("sum", a.first);
+	} else if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_unsigned("avgcount", a.second);
+	  f->dump_format_unquoted("sum", "%" PRId64 ".%09" PRId64,
+				  a.first / 1000000000ull,
+				  a.first % 1000000000ull);
+          uint64_t count = a.second;
+          uint64_t sum_ns = a.first;
+          if (count) {
+            uint64_t avg_ns = sum_ns / count;
+            f->dump_format_unquoted("avgtime", "%" PRId64 ".%09" PRId64,
+                                    avg_ns / 1000000000ull,
+                                    avg_ns % 1000000000ull);
+          } else {  // 64-bit zeros: a bare int 0 would not match PRId64 in varargs
+            f->dump_format_unquoted("avgtime", "%" PRId64 ".%09" PRId64, int64_t{0}, int64_t{0});
+          }
+	} else {
+	  ceph_abort();
+	}
+	f->close_section();
+      } else if (d->type & PERFCOUNTER_HISTOGRAM) {
+        ceph_assert(d->type == (PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER | PERFCOUNTER_U64));
+        ceph_assert(d->histogram);
+        f->open_object_section(d->name);
+        d->histogram->dump_formatted(f);
+        f->close_section();
+      } else {
+	uint64_t v = d->u64;
+	if (d->type & PERFCOUNTER_U64) {
+	  f->dump_unsigned(d->name, v);
+	} else if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_format_unquoted(d->name, "%" PRId64 ".%09" PRId64,
+				  v / 1000000000ull,
+				  v % 1000000000ull);
+	} else {
+	  ceph_abort();
+	}
+      }
+    }
+  }
+  if (dump_labeled) {
+    f->close_section(); // counters
+  }
+  f->close_section();
+}
+
+const std::string &PerfCounters::get_name() const
+{  // returns a reference to m_name, which set_name() may replace later
+  return m_name;
+}
+
+PerfCounters::PerfCounters(CephContext *cct, const std::string &name,
+	   int lower_bound, int upper_bound)
+  : m_cct(cct),
+    m_lower_bound(lower_bound),  // both bounds are exclusive, see the asserts in inc() etc.
+    m_upper_bound(upper_bound),
+    m_name(name)
+#if !defined(WITH_SEASTAR) || defined(WITH_ALIEN)
+    ,
+    m_lock_name(std::string("PerfCounters::") + name.c_str()),
+    m_lock(ceph::make_mutex(m_lock_name))
+#endif
+{
+  m_data.resize(upper_bound - lower_bound - 1);  // one slot per index strictly between the bounds
+}
+
+PerfCountersBuilder::PerfCountersBuilder(CephContext *cct, const std::string &name,
+                  int first, int last)
+  : m_perf_counters(new PerfCounters(cct, name, first, last))  // first/last become the exclusive bounds
+{  // the builder owns the new PerfCounters until create_perf_counters() hands it out
+}
+
+// Frees the PerfCounters if create_perf_counters() never handed it out.
+PerfCountersBuilder::~PerfCountersBuilder()
+{
+  delete m_perf_counters;  // deleting a null pointer is a no-op
+  m_perf_counters = NULL;
+}
+
+void PerfCountersBuilder::add_u64_counter(
+  int idx, const char *name,
+  const char *description, const char *nick, int prio, int unit)
+{  // integer counter: U64 | COUNTER (the schema dump reports it as "counter")
+  add_impl(idx, name, description, nick, prio,
+	   PERFCOUNTER_U64 | PERFCOUNTER_COUNTER, unit);
+}
+
+void PerfCountersBuilder::add_u64(
+  int idx, const char *name,
+  const char *description, const char *nick, int prio, int unit)
+{  // integer value without the COUNTER flag (the schema dump reports it as "gauge")
+  add_impl(idx, name, description, nick, prio, PERFCOUNTER_U64, unit);
+}
+
+void PerfCountersBuilder::add_u64_avg(
+  int idx, const char *name,
+  const char *description, const char *nick, int prio, int unit)
+{  // integer long-running average: U64 | LONGRUNAVG
+  add_impl(idx, name, description, nick, prio,
+	   PERFCOUNTER_U64 | PERFCOUNTER_LONGRUNAVG, unit);
+}
+
+void PerfCountersBuilder::add_time(
+  int idx, const char *name,
+  const char *description, const char *nick, int prio)
+{  // time value: TIME flag only; unit and histogram use add_impl()'s defaults
+  add_impl(idx, name, description, nick, prio, PERFCOUNTER_TIME);
+}
+
+void PerfCountersBuilder::add_time_avg(
+  int idx, const char *name,
+  const char *description, const char *nick, int prio)
+{  // time long-running average: TIME | LONGRUNAVG (readable through get_tavg_ns())
+  add_impl(idx, name, description, nick, prio,
+	   PERFCOUNTER_TIME | PERFCOUNTER_LONGRUNAVG);
+}
+
+void PerfCountersBuilder::add_u64_counter_histogram(
+  int idx, const char *name,
+  PerfHistogramCommon::axis_config_d x_axis_config,
+  PerfHistogramCommon::axis_config_d y_axis_config,
+  const char *description, const char *nick, int prio, int unit)
+{  // 2D histogram counter; hinc() asserts exactly this U64 | HISTOGRAM | COUNTER combination
+  add_impl(idx, name, description, nick, prio,
+	   PERFCOUNTER_U64 | PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER, unit,
+           std::unique_ptr<PerfHistogram<>>{new PerfHistogram<>{x_axis_config, y_axis_config}});
+}
+
+void PerfCountersBuilder::add_impl(
+  int idx, const char *name,
+  const char *description, const char *nick, int prio, int ty, int unit,
+  std::unique_ptr<PerfHistogram<>> histogram)
+{  // fills the (still undefined) slot for idx; all public add_*() helpers end up here
+  ceph_assert(idx > m_perf_counters->m_lower_bound);
+  ceph_assert(idx < m_perf_counters->m_upper_bound);
+  PerfCounters::perf_counter_data_vec_t &vec(m_perf_counters->m_data);
+  PerfCounters::perf_counter_data_any_d
+    &data(vec[idx - m_perf_counters->m_lower_bound - 1]);
+  ceph_assert(data.type == PERFCOUNTER_NONE);  // each index may be defined only once
+  data.name = name;  // the pointers are stored as-is, the strings are not copied
+  data.description = description;
+  // nick must be <= 4 chars
+  if (nick) {
+    ceph_assert(strlen(nick) <= 4);
+  }
+  data.nick = nick;
+  data.prio = prio ? prio : prio_default;  // prio 0 means: use the builder's default
+  data.type = (enum perfcounter_type_d)ty;
+  data.unit = (enum unit_t) unit;
+  data.histogram = std::move(histogram);
+}
+
+// Asserts that every slot was defined by an add_*() call and hands the
+// finished PerfCounters to the caller.  The builder forgets the pointer,
+// so its destructor will not delete the returned object.
+PerfCounters *PerfCountersBuilder::create_perf_counters()
+{
+  for (const auto &slot : m_perf_counters->m_data) {
+    ceph_assert(slot.type != PERFCOUNTER_NONE);
+    ceph_assert(slot.type & (PERFCOUNTER_U64 | PERFCOUNTER_TIME));
+  }
+  PerfCounters *built = m_perf_counters;
+  m_perf_counters = NULL;
+  return built;
+}
+
+}
diff --git a/src/common/perf_counters.h b/src/common/perf_counters.h
new file mode 100644
index 000000000..942edf6d7
--- /dev/null
+++ b/src/common/perf_counters.h
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_COMMON_PERF_COUNTERS_H
+#define CEPH_COMMON_PERF_COUNTERS_H
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <atomic>
+#include <cstdint>
+
+#include "common/perf_histogram.h"
+#include "include/utime.h"
+#include "include/common_fwd.h"
+#include "common/ceph_mutex.h"
+#include "common/ceph_time.h"
+
+namespace TOPNSPC::common {  // forward declarations only
+  class CephContext;
+  class PerfCountersBuilder;
+  class PerfCounters;
+}
+
+enum perfcounter_type_d : uint8_t  // bit flags, OR-ed together to describe one counter
+{
+  PERFCOUNTER_NONE = 0,         // slot not defined yet (add_impl() asserts this before filling it)
+  PERFCOUNTER_TIME = 0x1,       // float (measuring seconds)
+  PERFCOUNTER_U64 = 0x2,        // integer (note: either TIME or U64 *must* be set)
+  PERFCOUNTER_LONGRUNAVG = 0x4, // paired counter + sum (time)
+  PERFCOUNTER_COUNTER = 0x8,    // counter (vs gauge)
+  PERFCOUNTER_HISTOGRAM = 0x10, // histogram (vector) of values
+};
+
+enum unit_t : uint8_t  // unit reported by the schema dump
+{
+  UNIT_BYTES,  // dumped as "bytes"
+  UNIT_NONE    // dumped as "none"; the default for new counters
+};
+
+/* Class for constructing a PerfCounters object.
+ *
+ * This class performs some validation that the parameters we have supplied are
+ * correct in create_perf_counters().
+ *
+ * In the future, we will probably get rid of the first/last arguments, since
+ * PerfCountersBuilder can deduce them itself.
+ */
+namespace TOPNSPC::common {
+class PerfCountersBuilder
+{
+public:
+  PerfCountersBuilder(CephContext *cct, const std::string &name,
+		    int first, int last);  // first/last: exclusive bounds of the valid keys
+  ~PerfCountersBuilder();
+
+  // prio values: higher is better, and higher values get included in
+  // 'ceph daemonperf' (and similar) results.
+  // Use of priorities enables us to add large numbers of counters
+  // internally without necessarily overwhelming consumers.
+  enum {
+    PRIO_CRITICAL = 10,
+    // 'interesting' is the default threshold for `daemonperf` output
+    PRIO_INTERESTING = 8,
+    // `useful` is the default threshold for transmission to ceph-mgr
+    // and inclusion in prometheus/influxdb plugin output
+    PRIO_USEFUL = 5,
+    PRIO_UNINTERESTING = 2,
+    PRIO_DEBUGONLY = 0,
+  };
+  void add_u64(int key, const char *name,
+	       const char *description=NULL, const char *nick = NULL,
+	       int prio=0, int unit=UNIT_NONE);  // prio 0 selects the builder's prio_default
+  void add_u64_counter(int key, const char *name,
+		       const char *description=NULL,
+		       const char *nick = NULL,
+		       int prio=0, int unit=UNIT_NONE);
+  void add_u64_avg(int key, const char *name,
+		   const char *description=NULL,
+		   const char *nick = NULL,
+		   int prio=0, int unit=UNIT_NONE);
+  void add_time(int key, const char *name,
+		const char *description=NULL,
+		const char *nick = NULL,
+		int prio=0);
+  void add_time_avg(int key, const char *name,
+		    const char *description=NULL,
+		    const char *nick = NULL,
+		    int prio=0);
+  void add_u64_counter_histogram(
+    int key, const char* name,
+    PerfHistogramCommon::axis_config_d x_axis_config,
+    PerfHistogramCommon::axis_config_d y_axis_config,
+    const char *description=NULL,
+    const char* nick = NULL,
+    int prio=0, int unit=UNIT_NONE);
+  // Priority used by later add_*() calls that pass prio == 0.
+  void set_prio_default(int prio_)
+  {
+    prio_default = prio_;
+  }
+  // Hands out the finished PerfCounters; the builder no longer owns it afterwards.
+  PerfCounters* create_perf_counters();
+private:
+  PerfCountersBuilder(const PerfCountersBuilder &rhs);  // private: not copyable from outside
+  PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs);
+  void add_impl(int idx, const char *name,
+                const char *description, const char *nick, int prio, int ty, int unit=UNIT_NONE,
+                std::unique_ptr<PerfHistogram<>> histogram = nullptr);
+
+  PerfCounters *m_perf_counters;  // deleted by the destructor unless handed out
+
+  int prio_default = 0;
+};
+
+/*
+ * A PerfCounters object is usually associated with a single subsystem.
+ * It contains counters which we modify to track performance and throughput
+ * over time. 
+ *
+ * PerfCounters can track several different types of values:
+ * 1) integer values & counters
+ * 2) floating-point values & counters
+ * 3) floating-point averages
+ * 4) 2D histograms of quantized value pairs
+ *
+ * The difference between values, counters and histograms is in how they are initialized
+ * and accessed. For a counter, use the inc(counter, amount) function (note
+ * that amount defaults to 1 if you don't set it). For a value, use the
+ * set(index, value) function. For a histogram, use the hinc(index, value1, value2) function.
+ * (For time, use the tinc and tset variants.)
+ *
+ * If for some reason you would like to reset your counters, you can do so using
+ * the set functions even if they are counters, and you can also
+ * increment your values if for some reason you wish to.
+ *
+ * For the time average, it returns the current value and
+ * the "avgcount" member when read off. avgcount is incremented when you call
+ * tinc. Calling tset on an average is an error and will assert out.
+ */
+class PerfCounters
+{
+public:
+  /** Represents a PerfCounters data element. */
+  struct perf_counter_data_any_d {
+    perf_counter_data_any_d()
+      : name(NULL),
+        description(NULL),
+        nick(NULL),
+	 type(PERFCOUNTER_NONE),
+	 unit(UNIT_NONE)
+    {}
+    perf_counter_data_any_d(const perf_counter_data_any_d& other)
+      : name(other.name),
+        description(other.description),
+        nick(other.nick), prio(other.prio),  // prio must travel with the copy too
+	 type(other.type),
+	 unit(other.unit),
+	 u64(other.u64.load()) {
+      auto a = other.read_avg();  // consistent {sum, count} snapshot of `other`
+      u64 = a.first;
+      avgcount = a.second;
+      avgcount2 = a.second;
+      if (other.histogram) {
+        histogram.reset(new PerfHistogram<>(*other.histogram));
+      }
+    }
+
+    const char *name;         // the three strings are stored as raw pointers, not copied
+    const char *description;
+    const char *nick;
+    uint8_t prio = 0;
+    enum perfcounter_type_d type;
+    enum unit_t unit;
+    std::atomic<uint64_t> u64 = { 0 };        // value, or sum for averages
+    std::atomic<uint64_t> avgcount = { 0 };   // bumped before u64 is updated
+    std::atomic<uint64_t> avgcount2 = { 0 };  // bumped after u64 is updated
+    std::unique_ptr<PerfHistogram<>> histogram;
+    // Zeroes the value and sample counts, except for plain U64 values (no other flag set).
+    void reset()
+    {
+      if (type != PERFCOUNTER_U64) {
+	    u64 = 0;
+	    avgcount = 0;
+	    avgcount2 = 0;
+      }
+      if (histogram) {
+        histogram->reset();
+      }
+    }
+
+    // read <sum, count> safely by making sure the post- and pre-count
+    // are identical; in other words the whole loop needs to be run
+    // without any intervening calls to inc, set, or tinc.
+    std::pair<uint64_t,uint64_t> read_avg() const {
+      uint64_t sum, count;
+      do {
+	count = avgcount2;
+	sum = u64;
+      } while (avgcount != count);
+      return { sum, count };
+    }
+  };
+
+  template <typename T>
+  struct avg_tracker {  // keeps the last two <count, sum> samples of an average
+    std::pair<uint64_t, T> last;
+    std::pair<uint64_t, T> cur;
+    avg_tracker() : last(0, 0), cur(0, 0) {}
+    T current_avg() const {  // average over the interval between the two samples
+      if (cur.first == last.first)
+        return 0;
+      return (cur.second - last.second) / (cur.first - last.first);
+    }
+    void consume_next(const std::pair<uint64_t, T> &next) {
+      last = cur;
+      cur = next;
+    }
+  };
+
+  ~PerfCounters();
+  // idx must lie strictly between the lower and upper bound in all accessors below
+  void inc(int idx, uint64_t v = 1);
+  void dec(int idx, uint64_t v = 1);
+  void set(int idx, uint64_t v);
+  uint64_t get(int idx) const;
+
+  void tset(int idx, utime_t v);
+  void tinc(int idx, utime_t v);
+  void tinc(int idx, ceph::timespan v);
+  utime_t tget(int idx) const;
+
+  void hinc(int idx, int64_t x, int64_t y);
+
+  void reset();
+  void dump_formatted(ceph::Formatter *f, bool schema, bool dump_labeled,
+                      const std::string &counter = "") const {
+    dump_formatted_generic(f, schema, false, dump_labeled, counter);
+  }
+  void dump_formatted_histograms(ceph::Formatter *f, bool schema,
+                                 const std::string &counter = "") const {
+    dump_formatted_generic(f, schema, true, false, counter);
+  }
+  std::pair<uint64_t, uint64_t> get_tavg_ns(int idx) const;
+
+  const std::string& get_name() const;
+  void set_name(std::string s) {
+    m_name = s;
+  }
+
+  /// adjust priority values by some value
+  void set_prio_adjust(int p) {
+    prio_adjust = p;
+  }
+  // p + prio_adjust, clamped to the range [0, PRIO_CRITICAL]
+  int get_adjusted_priority(int p) const {
+    return std::max(std::min(p + prio_adjust,
+                             (int)PerfCountersBuilder::PRIO_CRITICAL),
+                    0);
+  }
+
+private:
+  PerfCounters(CephContext *cct, const std::string &name,
+	     int lower_bound, int upper_bound);
+  PerfCounters(const PerfCounters &rhs);
+  PerfCounters& operator=(const PerfCounters &rhs);
+  void dump_formatted_generic(ceph::Formatter *f, bool schema, bool histograms,
+                              bool dump_labeled,
+                              const std::string &counter = "") const;
+
+  typedef std::vector<perf_counter_data_any_d> perf_counter_data_vec_t;
+
+  CephContext *m_cct;
+  int m_lower_bound;
+  int m_upper_bound;
+  std::string m_name;
+
+  int prio_adjust = 0;
+
+#if !defined(WITH_SEASTAR) || defined(WITH_ALIEN)
+  const std::string m_lock_name;
+  /** Protects m_data */
+  ceph::mutex m_lock;
+#endif
+
+  perf_counter_data_vec_t m_data;  // slot i holds the counter with index m_lower_bound + 1 + i
+
+  friend class PerfCountersBuilder;
+  friend class PerfCountersCollectionImpl;
+};
+
+class SortPerfCountersByName {  // strict-weak ordering of PerfCounters pointers by logger name
+public:
+  bool operator()(const PerfCounters* lhs, const PerfCounters* rhs) const {
+    return (lhs->get_name() < rhs->get_name());  // neither pointer may be null
+  }
+};
+
+typedef std::set <PerfCounters*, SortPerfCountersByName> perf_counters_set_t;  // ordered and kept unique by logger name
+
+/*
+ * PerfCountersCollectionImpl manages PerfCounters objects for a Ceph process.
+ */
+class PerfCountersCollectionImpl
+{
+public:
+  PerfCountersCollectionImpl();
+  ~PerfCountersCollectionImpl();
+  void add(PerfCounters *l);     // renames l if its name is already taken
+  void remove(PerfCounters *l);  // unregisters l without deleting it
+  void clear();                  // deletes all registered loggers
+  bool reset(const std::string &name);  // name of one logger, or "all"
+  // Dumps all non-histogram counters (or their schema).
+  void dump_formatted(ceph::Formatter *f, bool schema, bool dump_labeled,
+                      const std::string &logger = "",
+                      const std::string &counter = "") const {
+    dump_formatted_generic(f, schema, false, dump_labeled, logger, counter);
+  }
+  // Dumps only histogram counters; always in the unlabeled format.
+  void dump_formatted_histograms(ceph::Formatter *f, bool schema,
+                                 const std::string &logger = "",
+                                 const std::string &counter = "") const {
+    dump_formatted_generic(f, schema, true, false, logger, counter);
+  }
+
+  // A reference to a perf_counter_data_any_d, with an accompanying
+  // pointer to the enclosing PerfCounters, in order that the consumer
+  // can see the prio_adjust
+  class PerfCounterRef
+  {
+    public:
+    PerfCounters::perf_counter_data_any_d *data;
+    PerfCounters *perf_counters;
+  };
+  typedef std::map<std::string,
+          PerfCounterRef> CounterMap;
+  // Invokes the callback once with the "<logger>.<counter>" -> PerfCounterRef map.
+  void with_counters(std::function<void(const CounterMap &)>) const;
+
+private:
+  void dump_formatted_generic(ceph::Formatter *f, bool schema, bool histograms,
+                              bool dump_labeled,
+                              const std::string &logger = "",
+                              const std::string &counter = "") const;
+
+  perf_counters_set_t m_loggers;  // registered loggers, ordered by name
+
+  CounterMap by_path;  // filled by add(), pruned by remove(), emptied by clear()
+};
+
+
+class PerfGuard {  // scope timer: adds its own lifetime to a time counter when destroyed
+  const ceph::real_clock::time_point start;  // taken in the constructor
+  PerfCounters* const counters;              // dereferenced in the destructor without a null check
+  const int event;                           // index passed to tinc()
+
+public:
+  PerfGuard(PerfCounters* const counters,
+            const int event)
+  : start(ceph::real_clock::now()),
+    counters(counters),
+    event(event) {
+  }
+
+  ~PerfGuard() {
+    counters->tinc(event, ceph::real_clock::now() - start);  // elapsed time since construction
+  }
+};
+
+}
+#endif
diff --git a/src/common/perf_counters_collection.cc b/src/common/perf_counters_collection.cc
new file mode 100644
index 000000000..f03980eba
--- /dev/null
+++ b/src/common/perf_counters_collection.cc
@@ -0,0 +1,63 @@
+#include "common/perf_counters_collection.h"
+#include "common/ceph_mutex.h"
+#include "common/ceph_context.h"
+
+namespace ceph::common {
+/* PerfCountersCollection holds the lock for PerfCountersCollectionImpl */
+PerfCountersCollection::PerfCountersCollection(CephContext *cct)
+  : m_cct(cct),
+    m_lock(ceph::make_mutex("PerfCountersCollection"))
+{
+}
+PerfCountersCollection::~PerfCountersCollection()
+{
+  clear();
+}
+void PerfCountersCollection::add(PerfCounters *l)
+{
+  std::lock_guard lck(m_lock);
+  perf_impl.add(l);
+}
+void PerfCountersCollection::remove(PerfCounters *l)
+{
+  std::lock_guard lck(m_lock);
+  perf_impl.remove(l);
+}
+void PerfCountersCollection::clear()
+{
+  std::lock_guard lck(m_lock);
+  perf_impl.clear();
+}
+bool PerfCountersCollection::reset(const std::string &name)
+{
+  std::lock_guard lck(m_lock);
+  return perf_impl.reset(name);
+}
+void PerfCountersCollection::dump_formatted(ceph::Formatter *f, bool schema,
+                      bool dump_labeled,
+                      const std::string &logger,
+                      const std::string &counter)
+{
+  std::lock_guard lck(m_lock);
+  perf_impl.dump_formatted(f, schema, dump_labeled, logger, counter);
+}
+void PerfCountersCollection::dump_formatted_histograms(ceph::Formatter *f, bool schema,
+                                 const std::string &logger,
+                                 const std::string &counter)
+{
+  std::lock_guard lck(m_lock);
+  perf_impl.dump_formatted_histograms(f,schema,logger,counter);
+}
+void PerfCountersCollection::with_counters(std::function<void(const PerfCountersCollectionImpl::CounterMap &)> fn) const
+{
+  std::lock_guard lck(m_lock);
+  perf_impl.with_counters(fn);
+}
+void PerfCountersDeleter::operator()(PerfCounters* p) noexcept
+{ // unregister p from its context's collection (if any), then free it
+  if (cct) // a default-constructed deleter has cct == nullptr and skips this step
+    cct->get_perfcounters_collection()->remove(p);
+  delete p;
+}
+
+}
diff --git a/src/common/perf_counters_collection.h b/src/common/perf_counters_collection.h
new file mode 100644
index 000000000..4608a8243
--- /dev/null
+++ b/src/common/perf_counters_collection.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "common/perf_counters.h"
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+
+namespace ceph::common {
+class PerfCountersCollection
+{
+  CephContext *m_cct;
+
+  /** Protects perf_impl and its m_loggers set */
+  mutable ceph::mutex m_lock;
+  PerfCountersCollectionImpl perf_impl;
+public:
+  PerfCountersCollection(CephContext *cct);
+  ~PerfCountersCollection();
+  void add(PerfCounters *l);
+  void remove(PerfCounters *l);
+  void clear();
+  bool reset(const std::string &name);
+
+  void dump_formatted(ceph::Formatter *f, bool schema, bool dump_labeled,
+                      const std::string &logger = "",
+                      const std::string &counter = "");
+  void dump_formatted_histograms(ceph::Formatter *f, bool schema,
+                                 const std::string &logger = "",
+                                 const std::string &counter = "");
+
+  void with_counters(std::function<void(const PerfCountersCollectionImpl::CounterMap &)>) const;
+
+  friend class PerfCountersCollectionTest;
+};
+
+class PerfCountersDeleter {
+  CephContext* cct;
+
+public:
+  PerfCountersDeleter() noexcept : cct(nullptr) {}
+  PerfCountersDeleter(CephContext* cct) noexcept : cct(cct) {}
+  void operator()(PerfCounters* p) noexcept;
+};
+}
+using PerfCountersRef = std::unique_ptr<ceph::common::PerfCounters, ceph::common::PerfCountersDeleter>;
diff --git a/src/common/perf_counters_key.cc b/src/common/perf_counters_key.cc
new file mode 100644
index 000000000..eaa3886bd
--- /dev/null
+++ b/src/common/perf_counters_key.cc
@@ -0,0 +1,224 @@
+#include "common/perf_counters_key.h"
+
+#include <algorithm>
+#include <iterator>
+#include <numeric>
+
+namespace ceph::perf_counters {
+namespace detail {
+
+// use a null character to delimit strings
+constexpr char DELIMITER = '\0';
+
+
+// write a delimited string to the output
+auto write(std::string_view str, std::output_iterator<char> auto out)
+{
+  out = std::copy(str.begin(), str.end(), out);
+  *(out++) = DELIMITER;
+  return out;
+}
+
+// return the encoded size of a label
+inline std::size_t label_size(const label_pair& l)
+{
+  return l.first.size() + sizeof(DELIMITER)
+      + l.second.size() + sizeof(DELIMITER);
+}
+
+// an output iterator that writes label_pairs to a flat buffer
+template <std::contiguous_iterator Iterator>
+class label_insert_iterator {
+  using base_iterator = Iterator;
+
+  struct label_writer {
+    base_iterator pos; // write position
+
+    label_writer& operator=(const label_pair& l) {
+      pos = write(l.first, pos);
+      pos = write(l.second, pos);
+      return *this;
+    }
+  };
+  label_writer label;
+
+ public:
+  using difference_type = std::ptrdiff_t;
+  using value_type = label_writer;
+  using reference = value_type&;
+
+  label_insert_iterator() = default;
+  label_insert_iterator(base_iterator begin) : label{begin} {
+    static_assert(std::output_iterator<label_insert_iterator, label_pair>);
+  }
+
+  // increments are noops
+  label_insert_iterator& operator++() { return *this; }
+  label_insert_iterator operator++(int) { return *this; }
+
+  // can only dereference to assign
+  reference operator*() { return label; }
+
+  // return the wrapped iterator position
+  base_iterator base() { return label.pos; }
+};
+
+// compare label_pairs by their key only
+bool label_key_less(const label_pair& lhs, const label_pair& rhs)
+{
+  return lhs.first < rhs.first;
+}
+bool label_key_equal(const label_pair& lhs, const label_pair& rhs)
+{
+  return lhs.first == rhs.first;
+}
+
+std::string create(std::string_view counter_name,
+                   label_pair* begin, label_pair* end)
+{
+  // sort the input labels and remove duplicate keys
+  std::sort(begin, end, label_key_less);
+  end = std::unique(begin, end, label_key_equal);
+
+  // calculate the total size and preallocate the buffer
+  auto size = std::accumulate(begin, end,
+                              counter_name.size() + sizeof(DELIMITER),
+                              [] (std::size_t sum, const label_pair& l) {
+                                return sum + label_size(l);
+                              });
+  std::string result;
+  result.resize(size);
+
+  // copy out the counter name and labels
+  auto out = result.begin();
+  out = write(counter_name, out);
+  std::copy(begin, end, label_insert_iterator{out});
+
+  return result;
+}
+
+std::string insert(const char* begin1, const char* end1,
+                   label_pair* begin2, label_pair* end2)
+{
+  // sort the input labels and remove duplicate keys
+  std::sort(begin2, end2, label_key_less);
+  end2 = std::unique(begin2, end2, label_key_equal);
+
+  // find the first delimiter that marks the end of the counter name
+  auto pos = std::find(begin1, end1, DELIMITER);
+
+  // calculate the total size and preallocate the buffer
+  auto size = std::distance(begin1, end1);
+  if (pos == end1) { // add a delimiter if the key doesn't have one
+    size += sizeof(DELIMITER);
+  }
+  size = std::accumulate(begin2, end2, size,
+                         [] (std::size_t sum, const label_pair& l) {
+                           return sum + label_size(l);
+                         });
+  std::string result;
+  result.resize(size);
+
+  // copy the counter name without the delimiter
+  auto out = std::copy(begin1, pos, result.begin());
+  if (pos != end1) {
+    ++pos; // advance past the delimiter
+  }
+  *(out++) = DELIMITER;
+
+  // merge the two sorted input ranges, drop any duplicate keys, and write
+  // them to output. the begin2 range is first so that new input labels can
+  // replace existing duplicates
+  auto end = std::set_union(begin2, end2,
+                            label_iterator{pos, end1},
+                            label_iterator{end1, end1},
+                            label_insert_iterator{out},
+                            label_key_less);
+  // fix up the size in case set_union() removed any duplicates
+  result.resize(std::distance(result.begin(), end.base()));
+
+  return result;
+}
+
+std::string_view name(const char* begin, const char* end)
+{
+  auto pos = std::find(begin, end, DELIMITER);
+  return {begin, pos};
+}
+
+std::string_view labels(const char* begin, const char* end)
+{
+  auto pos = std::find(begin, end, DELIMITER);
+  if (pos == end) {
+    return {};
+  }
+  return {std::next(pos), end};
+}
+
+} // namespace detail
+
+
+std::string key_create(std::string_view counter_name)
+{
+  label_pair* end = nullptr;
+  return detail::create(counter_name, end, end);
+}
+
+std::string_view key_name(std::string_view key)
+{ // pass pointers: string_view::begin() is not guaranteed to be a const char*
+  return detail::name(key.data(), key.data() + key.size());
+}
+
+label_range key_labels(std::string_view key)
+{ // pass pointers: string_view::begin() is not guaranteed to be a const char*
+  return detail::labels(key.data(), key.data() + key.size());
+}
+
+
+label_iterator::label_iterator(base_iterator begin, base_iterator end)
+    : state(make_state(begin, end))
+{
+  static_assert(std::forward_iterator<label_iterator>);
+}
+
+void label_iterator::advance(std::optional<iterator_state>& s)
+{
+  auto d = std::find(s->pos, s->end, detail::DELIMITER); // end of the label key
+  if (d == s->end) { // no delimiter for label key
+    s = std::nullopt; // an empty state is the past-the-end iterator
+    return;
+  }
+  s->label.first = std::string_view{s->pos, d};
+  s->pos = std::next(d); // step over the delimiter
+
+  d = std::find(s->pos, s->end, detail::DELIMITER); // end of the label value
+  if (d == s->end) { // no delimiter for label value
+    s = std::nullopt;
+    return;
+  }
+  s->label.second = std::string_view{s->pos, d};
+  s->pos = std::next(d); // next call starts at the following label key
+}
+
+auto label_iterator::make_state(base_iterator begin, base_iterator end)
+    -> std::optional<iterator_state>
+{
+  std::optional state = iterator_state{begin, end};
+  advance(state);
+  return state;
+}
+
+label_iterator& label_iterator::operator++()
+{
+  advance(state);
+  return *this;
+}
+
+label_iterator label_iterator::operator++(int)
+{
+  label_iterator tmp = *this;
+  advance(state);
+  return tmp;
+}
+
+} // namespace ceph::perf_counters
diff --git a/src/common/perf_counters_key.h b/src/common/perf_counters_key.h
new file mode 100644
index 000000000..a476369b1
--- /dev/null
+++ b/src/common/perf_counters_key.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include <optional>
+#include <string>
+#include <utility>
+
+namespace ceph::perf_counters {
+
+/// A key/value pair representing a perf counter label
+using label_pair = std::pair<std::string_view, std::string_view>;
+
+
+/// \brief Construct a key for a perf counter and set of labels.
+///
+/// Returns a string of the form "counter_name\0key1\0val1\0key2\0val2\0",
+/// where label pairs are sorted by key with duplicates removed.
+///
+/// This string representation avoids extra memory allocations associated
+/// with map<string, string>. It also supports the hashing and comparison
+/// operators required for use as a key in unordered and ordered containers.
+///
+/// Example:
+/// \code
+///   std::string key = key_create("counter_name", {
+///         {"key1", "val1"}, {"key2", "val2"}
+///       });
+/// \endcode
+template <std::size_t Count>
+std::string key_create(std::string_view counter_name,
+                       label_pair (&&labels)[Count]);
+
+/// \brief Construct a key for a perf counter without labels.
+/// \overload
+std::string key_create(std::string_view counter_name);
+
+/// \brief Insert additional labels into an existing key.
+///
+/// This returns a new string without modifying the input. The returned
+/// string has labels in sorted order and no duplicate keys.
+template <std::size_t Count>
+std::string key_insert(std::string_view key,
+                       label_pair (&&labels)[Count]);
+
+/// \brief Return the counter name for a given key.
+std::string_view key_name(std::string_view key);
+
+
+/// A forward iterator over label_pairs encoded in a key
+class label_iterator {
+ public:
+  using base_iterator = const char*;
+  using difference_type = std::ptrdiff_t;
+  using value_type = label_pair;
+  using pointer = const value_type*;
+  using reference = const value_type&;
+
+  label_iterator() = default;
+  label_iterator(base_iterator begin, base_iterator end);
+
+  label_iterator& operator++();
+  label_iterator operator++(int);
+
+  reference operator*() const { return state->label; }
+  pointer operator->() const { return &state->label; }
+
+  auto operator<=>(const label_iterator& rhs) const = default;
+
+ private:
+  struct iterator_state {
+    base_iterator pos; // end of current label
+    base_iterator end; // end of buffer
+    label_pair label; // current label
+
+    auto operator<=>(const iterator_state& rhs) const = default;
+  };
+  // an empty state represents a past-the-end iterator
+  std::optional<iterator_state> state;
+
+  // find the next two delimiters and construct the label string views
+  static void advance(std::optional<iterator_state>& s);
+
+  // try to parse the first label pair
+  static auto make_state(base_iterator begin, base_iterator end)
+      -> std::optional<iterator_state>;
+};
+
+/// A sorted range of label_pairs
+class label_range {
+  std::string_view buffer; // encoded labels; a view, so the key must outlive the range
+ public:
+  using iterator = label_iterator;
+  using const_iterator = label_iterator;
+
+  label_range(std::string_view buffer) : buffer(buffer) {}
+  // label_iterator takes const char*; string_view iterators need not be pointers
+  const_iterator begin() const { return {buffer.data(), buffer.data() + buffer.size()}; }
+  const_iterator cbegin() const { return begin(); }
+  // a default-constructed label_iterator is the past-the-end position
+  const_iterator end() const { return {}; }
+  const_iterator cend() const { return {}; }
+};
+
+/// \brief Return the sorted range of label_pairs for a given key.
+///
+/// Example:
+/// \code
+///   for (label_pair label : key_labels(key)) {
+///     std::cout << label.first << ":" << label.second << std::endl;
+///   }
+/// \endcode
+label_range key_labels(std::string_view key);
+
+
+namespace detail {
+
+std::string create(std::string_view counter_name,
+                   label_pair* begin, label_pair* end);
+
+std::string insert(const char* begin1, const char* end1,
+                   label_pair* begin2, label_pair* end2);
+
+} // namespace detail
+
+template <std::size_t Count>
+std::string key_create(std::string_view counter_name,
+                       label_pair (&&labels)[Count])
+{
+  return detail::create(counter_name, std::begin(labels), std::end(labels));
+}
+
+template <std::size_t Count>
+std::string key_insert(std::string_view key,
+                       label_pair (&&labels)[Count])
+{ // pass pointers: string_view::begin() is not guaranteed to be a const char*
+  return detail::insert(key.data(), key.data() + key.size(),
+                        std::begin(labels), std::end(labels));
+}
+
+} // namespace ceph::perf_counters
diff --git a/src/common/perf_histogram.cc b/src/common/perf_histogram.cc
new file mode 100644
index 000000000..13528764a
--- /dev/null
+++ b/src/common/perf_histogram.cc
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/perf_histogram.h"
+
+#include <limits>
+
+void PerfHistogramCommon::dump_formatted_axis(
+    ceph::Formatter *f, const PerfHistogramCommon::axis_config_d &ac) {
+  f->open_object_section("axis");
+
+  // Dump axis configuration
+  f->dump_string("name", ac.m_name);
+  f->dump_int("min", ac.m_min);
+  f->dump_int("quant_size", ac.m_quant_size);
+  f->dump_int("buckets", ac.m_buckets);
+  switch (ac.m_scale_type) {
+    case SCALE_LINEAR:
+      f->dump_string("scale_type", "linear");
+      break;
+    case SCALE_LOG2:
+      f->dump_string("scale_type", "log2");
+      break;
+    default:
+      ceph_assert(false && "Invalid scale type");
+  }
+
+  {
+    // Dump concrete ranges for axis buckets
+    f->open_array_section("ranges");
+    auto ranges = get_axis_bucket_ranges(ac);
+    for (int i = 0; i < ac.m_buckets; ++i) {
+      f->open_object_section("bucket");
+      if (i > 0) {
+        f->dump_int("min", ranges[i].first);
+      }
+      if (i < ac.m_buckets - 1) {
+        f->dump_int("max", ranges[i].second);
+      }
+      f->close_section();
+    }
+    f->close_section();
+  }
+
+  f->close_section();
+}
+
+int64_t get_quants(int64_t i, PerfHistogramCommon::scale_type_d st) { // bound of bucket i, in quant units
+  switch (st) {
+    case PerfHistogramCommon::SCALE_LINEAR:
+      return i;
+    case PerfHistogramCommon::SCALE_LOG2:
+      return int64_t(1) << (i - 1); // 2^(i-1); both callers in this file start at i == 1
+  }
+  ceph_assert(false && "Invalid scale type"); // reached only for a value outside the enum
+}
+
+int64_t PerfHistogramCommon::get_bucket_for_axis(
+    int64_t value, const PerfHistogramCommon::axis_config_d &ac) {
+  if (value < ac.m_min) {
+    return 0; // bucket 0 collects everything below the axis minimum
+  }
+  // express the value as a whole number of quantization units above m_min
+  value -= ac.m_min;
+  value /= ac.m_quant_size;
+
+  switch (ac.m_scale_type) {
+    case SCALE_LINEAR:
+      return std::min<int64_t>(value + 1, ac.m_buckets - 1); // clamp to the last bucket
+
+    case SCALE_LOG2:
+      for (int64_t i = 1; i < ac.m_buckets; ++i) {
+        if (value < get_quants(i, SCALE_LOG2)) {
+          return i; // first bucket whose exclusive bound exceeds the value
+        }
+      }
+      return ac.m_buckets - 1; // at or beyond every bound: last bucket
+  }
+  ceph_assert(false && "Invalid scale type");
+}
+
+std::vector<std::pair<int64_t, int64_t>>
+PerfHistogramCommon::get_axis_bucket_ranges(
+    const PerfHistogramCommon::axis_config_d &ac) {
+  std::vector<std::pair<int64_t, int64_t>> ret;
+  ret.resize(ac.m_buckets);
+
+  // First bucket is for value < min
+  int64_t min = ac.m_min;
+  for (int64_t i = 1; i < ac.m_buckets - 1; i++) {
+    int64_t max_exclusive =
+        ac.m_min + get_quants(i, ac.m_scale_type) * ac.m_quant_size;
+
+    // Dump bucket range
+    ret[i].first = min;
+    ret[i].second = max_exclusive - 1;
+
+    // Shift min to next bucket
+    min = max_exclusive;
+  }
+
+  // Fill up first and last element, note that in case m_buckets == 1
+  // those will point to the same element, the order is important here
+  ret.front().second = ac.m_min - 1;
+  ret.back().first = min;
+
+  ret.front().first = std::numeric_limits<int64_t>::min();
+  ret.back().second = std::numeric_limits<int64_t>::max();
+  return ret;
+}
diff --git a/src/common/perf_histogram.h b/src/common/perf_histogram.h
new file mode 100644
index 000000000..3052106be
--- /dev/null
+++ b/src/common/perf_histogram.h
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_PERF_HISTOGRAM_H
+#define CEPH_COMMON_PERF_HISTOGRAM_H
+
+#include <array>
+#include <atomic>
+#include <memory>
+
+#include "common/Formatter.h"
+#include "include/int_types.h"
+#include "include/ceph_assert.h"
+
+class PerfHistogramCommon {
+public:
+  enum scale_type_d : uint8_t {
+    SCALE_LINEAR = 1,
+    SCALE_LOG2 = 2,
+  };
+
+  struct axis_config_d {
+    const char *m_name = nullptr;
+    scale_type_d m_scale_type = SCALE_LINEAR;
+    int64_t m_min = 0;
+    int64_t m_quant_size = 0;
+    int32_t m_buckets = 0;
+    axis_config_d() = default;
+    axis_config_d(const char* name,
+		  scale_type_d scale_type,
+		  int64_t min,
+		  int64_t quant_size,
+		  int32_t buckets)
+      : m_name(name),
+	m_scale_type(scale_type),
+	m_min(min),
+	m_quant_size(quant_size),
+	m_buckets(buckets)
+    {}
+  };
+
+protected:
+  /// Dump configuration of one axis to a formatter
+  static void dump_formatted_axis(ceph::Formatter *f, const axis_config_d &ac);
+
+  /// Quantize given value and convert to bucket number on given axis
+  static int64_t get_bucket_for_axis(int64_t value, const axis_config_d &ac);
+
+  /// Calculate inclusive ranges of axis values for each bucket on that axis
+  static std::vector<std::pair<int64_t, int64_t>> get_axis_bucket_ranges(
+      const axis_config_d &ac);
+};
+
+/// PerfHistogram does trace a histogram of input values. It's an extended
+/// version of a standard histogram which does trace characteristics of a single
+/// one value only. In this implementation, values can be traced in multiple
+/// dimensions - i.e. we can create a histogram of input request size (first
+/// dimension) and processing latency (second dimension). Creating standard
+/// histogram out of such multidimensional one is trivial and requires summing
+/// values across dimensions we're not interested in.
+template <int DIM = 2>
+class PerfHistogram : public PerfHistogramCommon {
+public:
+  /// Initialize new histogram object
+  PerfHistogram(std::initializer_list<axis_config_d> axes_config) {
+    ceph_assert(axes_config.size() == DIM &&
+		"Invalid number of axis configuration objects");
+
+    int i = 0;
+    for (const auto &ac : axes_config) {
+      ceph_assertf(ac.m_buckets > 0, "Must have at least one bucket on axis");
+      ceph_assertf(ac.m_quant_size > 0,
+             "Quantization unit must be non-zero positive integer value");
+
+      m_axes_config[i++] = ac;
+    }
+
+    m_rawData.reset(new std::atomic<uint64_t>[get_raw_size()] {});
+  }
+
+  /// Copy from other histogram object
+  PerfHistogram(const PerfHistogram &other)
+      : m_axes_config(other.m_axes_config) {
+    int64_t size = get_raw_size();
+    m_rawData.reset(new std::atomic<uint64_t>[size] {});
+    for (int64_t i = 0; i < size; i++) {
+      m_rawData[i] = other.m_rawData[i].load();
+    }
+  }
+
+  /// Set all histogram values to 0
+  void reset() {
+    auto size = get_raw_size();
+    for (auto i = size; --i >= 0;) {
+      m_rawData[i] = 0;
+    }
+  }
+
+  /// Increase counter for given axis values by one
+  template <typename... T>
+  void inc(T... axis) {
+    auto index = get_raw_index_for_value(axis...);
+    m_rawData[index]++;
+  }
+
+  /// Increase counter for given axis buckets by one
+  template <typename... T>
+  void inc_bucket(T... bucket) {
+    auto index = get_raw_index_for_bucket(bucket...);
+    m_rawData[index]++;
+  }
+
+  /// Read value from given bucket
+  template <typename... T>
+  uint64_t read_bucket(T... bucket) const {
+    auto index = get_raw_index_for_bucket(bucket...);
+    return m_rawData[index];
+  }
+
+  /// Dump data to a Formatter object
+  void dump_formatted(ceph::Formatter *f) const {
+    // Dump axes configuration
+    f->open_array_section("axes");
+    for (auto &ac : m_axes_config) {
+      dump_formatted_axis(f, ac);
+    }
+    f->close_section();
+
+    // Dump histogram values
+    dump_formatted_values(f);
+  }
+
+protected:
+  /// Raw data stored as linear space, internal indexes are calculated on
+  /// demand.
+  std::unique_ptr<std::atomic<uint64_t>[]> m_rawData;
+
+  /// Configuration of axes
+  std::array<axis_config_d, DIM> m_axes_config;
+
+  /// Dump histogram counters to a formatter
+  void dump_formatted_values(ceph::Formatter *f) const {
+    visit_values([f](int) { f->open_array_section("values"); },
+                 [f](int64_t value) { f->dump_unsigned("value", value); },
+                 [f](int) { f->close_section(); });
+  }
+
+  /// Get number of all histogram counters (product of the bucket counts of all axes)
+  int64_t get_raw_size() const {
+    int64_t ret = 1;
+    for (const auto &ac : m_axes_config) {
+      ret *= ac.m_buckets;
+    }
+    return ret;
+  }
+
+  /// Calculate m_rawData index from axis values
+  template <typename... T>
+  int64_t get_raw_index_for_value(T... axes) const {
+    static_assert(sizeof...(T) == DIM, "Incorrect number of arguments");
+    return get_raw_index_internal<0>(get_bucket_for_axis, 0, axes...);
+  }
+
+  /// Calculate m_rawData index from axis bucket numbers
+  template <typename... T>
+  int64_t get_raw_index_for_bucket(T... buckets) const {
+    static_assert(sizeof...(T) == DIM, "Incorrect number of arguments");
+    return get_raw_index_internal<0>(
+        [](int64_t bucket, const axis_config_d &ac) {
+          ceph_assertf(bucket >= 0, "Bucket index can not be negative");
+          ceph_assertf(bucket < ac.m_buckets, "Bucket index too large");
+          return bucket;
+        },
+        0, buckets...);
+  }
+
+  template <int level = 0, typename F, typename... T>
+  int64_t get_raw_index_internal(F bucket_evaluator, int64_t startIndex,
+                                 int64_t value, T... tail) const {
+    static_assert(level + 1 + sizeof...(T) == DIM,
+                  "Internal consistency check");
+    auto &ac = m_axes_config[level];
+    auto bucket = bucket_evaluator(value, ac);
+    return get_raw_index_internal<level + 1>(
+        bucket_evaluator, ac.m_buckets * startIndex + bucket, tail...);
+  }
+
+  template <int level, typename F>
+  int64_t get_raw_index_internal(F, int64_t startIndex) const {
+    static_assert(level == DIM, "Internal consistency check");
+    return startIndex;
+  }
+
+  /// Visit all histogram counters, call onDimensionEnter / onDimensionLeave
+  /// when starting / finishing traversal
+  /// on given axis, call onValue when dumping raw histogram counter value.
+  template <typename FDE, typename FV, typename FDL>
+  void visit_values(FDE onDimensionEnter, FV onValue, FDL onDimensionLeave,
+                    int level = 0, int64_t startIndex = 0) const {
+    if (level == DIM) {
+      onValue(m_rawData[startIndex]);
+      return;
+    }
+    // startIndex is the flattened index prefix; 64-bit like get_raw_size()
+    onDimensionEnter(level);
+    auto &ac = m_axes_config[level];
+    startIndex *= ac.m_buckets;
+    for (int32_t i = 0; i < ac.m_buckets; ++i, ++startIndex) {
+      visit_values(onDimensionEnter, onValue, onDimensionLeave, level + 1,
+                   startIndex);
+    }
+    onDimensionLeave(level);
+  }
+};
+
+#endif
diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc
new file mode 100644
index 000000000..b1a36e072
--- /dev/null
+++ b/src/common/pick_address.cc
@@ -0,0 +1,633 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/pick_address.h"
+
+#include <bitset>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <string>
+#include <string.h>
+#include <vector>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <fmt/format.h>
+
+#include "include/ipaddr.h"
+#include "include/str_list.h"
+#include "common/ceph_context.h"
+#ifndef WITH_SEASTAR
+#include "common/config.h"
+#include "common/config_obs.h"
+#endif
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/numa.h"
+
+#ifndef HAVE_IN_ADDR_T
+typedef uint32_t in_addr_t;
+#endif
+
+#ifndef IN_LOOPBACKNET
+#define IN_LOOPBACKNET 127
+#endif
+
+#define dout_subsys ceph_subsys_
+
+using std::string;
+using std::vector;
+
+namespace {
+
+bool matches_with_name(const ifaddrs& ifa, const std::string& if_name)
+{
+  return if_name.compare(ifa.ifa_name) == 0;
+}
+
+static int is_loopback_addr(sockaddr* addr) // 1 = loopback, 0 = not loopback, -1 = other family
+{
+  if (addr->sa_family == AF_INET) {
+    const sockaddr_in* sin = (struct sockaddr_in *)(addr);
+    const in_addr_t net = ntohl(sin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT; // class-A network number
+    return net == IN_LOOPBACKNET ? 1 : 0;
+  } else if (addr->sa_family == AF_INET6) {
+    sockaddr_in6* sin6 = (struct sockaddr_in6 *)(addr);
+    return IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) ? 1 : 0;
+  } else {
+    return -1; // neither AF_INET nor AF_INET6
+  }
+}
+
+static int grade_addr(const ifaddrs& ifa) // higher is better; -1 means unusable
+{
+  if (ifa.ifa_addr == nullptr) {
+    return -1; // entry carries no address
+  }
+  int score = 0;
+  if (ifa.ifa_flags & IFF_UP) {
+    score += 4; // interface is up
+  }
+  switch (is_loopback_addr(ifa.ifa_addr)) {
+  case 0:
+    // prefer non-loopback addresses
+    score += 2;
+    break;
+  case 1:
+    score += 0; // loopback: no bonus
+    break;
+  default:
+    score = -1; // unsupported address family: discard any bonus gathered above
+    break;
+  }
+  return score;
+}
+
+bool matches_with_net(const ifaddrs& ifa,
+                      const sockaddr* net,
+                      unsigned int prefix_len,
+                      unsigned ipv)
+{
+  switch (net->sa_family) {
+  case AF_INET:
+    if (ipv & CEPH_PICK_ADDRESS_IPV4) {
+      return matches_ipv4_in_subnet(ifa, (struct sockaddr_in*)net, prefix_len);
+    }
+    break;
+  case AF_INET6:
+    if (ipv & CEPH_PICK_ADDRESS_IPV6) {
+      return matches_ipv6_in_subnet(ifa, (struct sockaddr_in6*)net, prefix_len);
+    }
+    break;
+  }
+  return false;
+}
+
+bool matches_with_net(CephContext *cct,
+                      const ifaddrs& ifa,
+                      const std::string& s, // network spec handed to parse_network()
+                      unsigned ipv)
+{
+  struct sockaddr_storage net;
+  unsigned int prefix_len;
+  if (!parse_network(s.c_str(), &net, &prefix_len)) {
+    lderr(cct) << "unable to parse network: " << s << dendl;
+    exit(1); // an unparsable network spec terminates the process
+  }
+  return matches_with_net(ifa, (sockaddr*)&net, prefix_len, ipv); // defer to the sockaddr overload
+}
+
+int grade_with_numa_node(const ifaddrs& ifa, int numa_node)
+{
+#if defined(WITH_SEASTAR) || defined(_WIN32)
+  return 0;
+#else
+  if (numa_node < 0) {
+    return 0;
+  }
+  int if_node = -1;
+  int r = get_iface_numa_node(ifa.ifa_name, &if_node);
+  if (r < 0) {
+    return 0;
+  }
+  return if_node == numa_node ? 1 : 0;
+#endif
+}
+}
+
+const struct sockaddr *find_ip_in_subnet_list(
+  CephContext *cct,
+  const struct ifaddrs *ifa,
+  unsigned ipv,
+  const std::string &networks,
+  const std::string &interfaces,
+  int numa_node)
+{
+  const auto ifs = get_str_list(interfaces);
+  const auto nets = get_str_list(networks);
+  if (!ifs.empty() && nets.empty()) {
+      lderr(cct) << "interface names specified but not network names" << dendl;
+      exit(1); // interfaces without networks is treated as a fatal configuration error
+  }
+  // strict '>' below: a candidate must grade above 0 to be returned
+  int best_score = 0;
+  const sockaddr* best_addr = nullptr;
+  for (const auto* addr = ifa; addr != nullptr; addr = addr->ifa_next) {
+    if (!ifs.empty() &&
+	std::none_of(std::begin(ifs), std::end(ifs),
+                     [&](const auto& if_name) {
+                       return matches_with_name(*addr, if_name);
+                     })) {
+      continue; // not one of the requested interfaces
+    }
+    if (!nets.empty() &&
+	std::none_of(std::begin(nets), std::end(nets),
+                     [&](const auto& net) {
+                       return matches_with_net(cct, *addr, net, ipv);
+                     })) {
+      continue; // not inside any requested network
+    }
+    int score = grade_addr(*addr);
+    if (score < 0) {
+      continue; // no address, or an unsupported address family
+    }
+    score += grade_with_numa_node(*addr, numa_node); // +1 when the interface is on numa_node
+    if (score > best_score) {
+      best_score = score;
+      best_addr = addr->ifa_addr; // points into the caller's ifaddrs list
+    }
+  }
+  return best_addr; // nullptr when no candidate qualified
+}
+
+#ifndef WITH_SEASTAR
+// observe this change
+struct Observer : public md_config_obs_t {
+  const char *keys[2];
+  explicit Observer(const char *c) {
+    keys[0] = c;
+    keys[1] = NULL;
+  }
+
+  const char** get_tracked_conf_keys() const override {
+    return (const char **)keys;
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+			  const std::set <std::string> &changed) override {
+    // do nothing.
+  }
+};
+
+// Pick one local address (IPv4 or IPv6) that matches `networks` and
+// `interfaces`, render it as a numeric host string and store it in the
+// config option named by `conf_var`.
+//
+// The process exits with status 1 when no address is found or when the
+// chosen address cannot be converted to a string.
+static void fill_in_one_address(CephContext *cct,
+				const struct ifaddrs *ifa,
+				const string &networks,
+				const string &interfaces,
+				const char *conf_var,
+				int numa_node = -1)
+{
+  const unsigned any_family = CEPH_PICK_ADDRESS_IPV4|CEPH_PICK_ADDRESS_IPV6;
+  const struct sockaddr *picked =
+    find_ip_in_subnet_list(cct, ifa, any_family, networks, interfaces,
+                           numa_node);
+  if (picked == nullptr) {
+    lderr(cct) << "unable to find any IP address in networks '" << networks
+	       << "' interfaces '" << interfaces << "'" << dendl;
+    exit(1);
+  }
+
+  // getnameinfo() wants the size of the concrete sockaddr variant
+  const size_t picked_len = (picked->sa_family == AF_INET)
+    ? sizeof(struct sockaddr_in)
+    : sizeof(struct sockaddr_in6);
+
+  char text[INET6_ADDRSTRLEN];
+  const int gai = getnameinfo(picked, picked_len,
+                              text, sizeof(text),
+                              nullptr, 0,
+                              NI_NUMERICHOST);
+  if (gai != 0) {
+    lderr(cct) << "unable to convert chosen address to string: " << gai_strerror(gai) << dendl;
+    exit(1);
+  }
+
+  // keep an observer for conf_var registered while the value is set and
+  // the change is applied
+  Observer watcher(conf_var);
+  cct->_conf.add_observer(&watcher);
+  cct->_conf.set_val_or_die(conf_var, text);
+  cct->_conf.apply_changes(nullptr);
+  cct->_conf.remove_observer(&watcher);
+}
+
+// Fill in public_addr and/or cluster_addr from the configured networks.
+//
+// `needs` is a bitmask of CEPH_PICK_ADDRESS_PUBLIC and
+// CEPH_PICK_ADDRESS_CLUSTER. An address is only picked while the matching
+// *_addr option is still blank. When no cluster network is configured the
+// cluster address is picked from the public network instead.
+// Exits the process if the interface list cannot be fetched.
+void pick_addresses(CephContext *cct, int needs)
+{
+  auto pub_addr = cct->_conf.get_val<entity_addr_t>("public_addr");
+  auto pub_net = cct->_conf.get_val<std::string>("public_network");
+  auto pub_iface =
+    cct->_conf.get_val<std::string>("public_network_interface");
+  auto clu_addr = cct->_conf.get_val<entity_addr_t>("cluster_addr");
+  auto clu_net = cct->_conf.get_val<std::string>("cluster_network");
+  auto clu_iface =
+    cct->_conf.get_val<std::string>("cluster_network_interface");
+
+  struct ifaddrs *ifa;
+  if (getifaddrs(&ifa) < 0) {
+    string err = cpp_strerror(errno);
+    lderr(cct) << "unable to fetch interfaces and addresses: " << err << dendl;
+    exit(1);
+  }
+  // released on every return path below
+  auto free_ifa = make_scope_guard([ifa] { freeifaddrs(ifa); });
+
+  const bool want_public = needs & CEPH_PICK_ADDRESS_PUBLIC;
+  if (want_public && pub_addr.is_blank_ip() && !pub_net.empty()) {
+    fill_in_one_address(cct, ifa, pub_net, pub_iface, "public_addr");
+  }
+
+  const bool want_cluster = needs & CEPH_PICK_ADDRESS_CLUSTER;
+  if (!want_cluster || !clu_addr.is_blank_ip()) {
+    return;
+  }
+  if (!clu_net.empty()) {
+    fill_in_one_address(cct, ifa, clu_net, clu_iface, "cluster_addr");
+  } else if (!pub_net.empty()) {
+    lderr(cct) << "Public network was set, but cluster network was not set " << dendl;
+    lderr(cct) << "    Using public network also for cluster network" << dendl;
+    fill_in_one_address(cct, ifa, pub_net, pub_iface, "cluster_addr");
+  }
+}
+#endif	// !WITH_SEASTAR
+
+// Find one local address of the families selected by `ipv` that matches
+// `networks`/`interfaces` and return it parsed into an entity_addr_t.
+//
+// Returns an empty optional (after logging, for the first two cases) when
+// no address matches, when it cannot be rendered as a numeric string, or
+// when the rendered string fails to parse.
+static std::optional<entity_addr_t> get_one_address(
+  CephContext *cct,
+  const struct ifaddrs *ifa,
+  unsigned ipv,
+  const string &networks,
+  const string &interfaces,
+  int numa_node = -1)
+{
+  const struct sockaddr *picked =
+    find_ip_in_subnet_list(cct, ifa, ipv, networks, interfaces, numa_node);
+  if (picked == nullptr) {
+    // describe the requested families for the log message
+    const bool want_v4 = ipv & CEPH_PICK_ADDRESS_IPV4;
+    const bool want_v6 = ipv & CEPH_PICK_ADDRESS_IPV6;
+    std::string_view ip_type{"IPv6"};
+    if (want_v4) {
+      ip_type = want_v6 ? std::string_view{"IPv4 or IPv6"}
+                        : std::string_view{"IPv4"};
+    }
+    lderr(cct) << "unable to find any " << ip_type << " address in networks '"
+               << networks << "' interfaces '" << interfaces << "'" << dendl;
+    return std::nullopt;
+  }
+
+  // getnameinfo() wants the size of the concrete sockaddr variant
+  const size_t picked_len = (picked->sa_family == AF_INET)
+    ? sizeof(struct sockaddr_in)
+    : sizeof(struct sockaddr_in6);
+
+  char text[INET6_ADDRSTRLEN];
+  const int gai = getnameinfo(picked, picked_len,
+                              text, sizeof(text),
+                              nullptr, 0,
+                              NI_NUMERICHOST);
+  if (gai != 0) {
+    lderr(cct) << "unable to convert chosen address to string: " << gai_strerror(gai) << dendl;
+    return std::nullopt;
+  }
+
+  entity_addr_t parsed;
+  if (!parsed.parse(text)) {
+    return std::nullopt;
+  }
+  return parsed;
+}
+
+// Compute the address vector for the public, public-bind or cluster
+// address class selected by `flags`.
+//
+// flags: CEPH_PICK_ADDRESS_* bitmask. Exactly one of PUBLIC, PUBLIC_BIND
+//        or CLUSTER must be set. If neither MSGR1 nor MSGR2 is given the
+//        ms_bind_msgr1/ms_bind_msgr2 options decide; if neither IPV4 nor
+//        IPV6 is given the ms_bind_ipv4/ms_bind_ipv6 options decide and
+//        PREFER_IPV4 is taken from ms_bind_prefer_ipv4.
+// ifa:   interface list that is searched for matching addresses.
+// addrs: [out] cleared first, then filled with the result.
+// preferred_numa_node: forwarded to the address lookup.
+//
+// Returns 0 on success; -EINVAL for an invalid mode selection or when no
+// messenger version / IP family is enabled; -ENOENT when PUBLIC_BIND is
+// selected but public_bind_addr is blank; -1 when an enabled IP family
+// has no matching local address.
+int pick_addresses(
+  CephContext *cct,
+  unsigned flags,
+  struct ifaddrs *ifa,
+  entity_addrvec_t *addrs,
+  int preferred_numa_node)
+{
+  addrs->v.clear();
+
+  unsigned addrt = (flags & (CEPH_PICK_ADDRESS_PUBLIC |
+			     CEPH_PICK_ADDRESS_PUBLIC_BIND |
+			     CEPH_PICK_ADDRESS_CLUSTER));
+  // TODO: move to std::popcount when it's available for all release lines
+  // we are interested in (quincy was a blocker at the time of writing)
+  if (std::bitset<sizeof(addrt)*CHAR_BIT>(addrt).count() != 1) {
+    // these flags are mutually exclusive and one of them must be
+    // always set (in other words: it's mode selection).
+    return -EINVAL;
+  }
+  // messenger protocol versions: explicit flags win, otherwise use config
+  unsigned msgrv = flags & (CEPH_PICK_ADDRESS_MSGR1 |
+			    CEPH_PICK_ADDRESS_MSGR2);
+  if (msgrv == 0) {
+    if (cct->_conf.get_val<bool>("ms_bind_msgr1")) {
+      msgrv |= CEPH_PICK_ADDRESS_MSGR1;
+    }
+    if (cct->_conf.get_val<bool>("ms_bind_msgr2")) {
+      msgrv |= CEPH_PICK_ADDRESS_MSGR2;
+    }
+    if (msgrv == 0) {
+      return -EINVAL;
+    }
+  }
+  // IP families: explicit flags win, otherwise use config. Note that the
+  // PREFER_IPV4 flag is only overridden from config in the latter case.
+  unsigned ipv = flags & (CEPH_PICK_ADDRESS_IPV4 |
+			  CEPH_PICK_ADDRESS_IPV6);
+  if (ipv == 0) {
+    if (cct->_conf.get_val<bool>("ms_bind_ipv4")) {
+      ipv |= CEPH_PICK_ADDRESS_IPV4;
+    }
+    if (cct->_conf.get_val<bool>("ms_bind_ipv6")) {
+      ipv |= CEPH_PICK_ADDRESS_IPV6;
+    }
+    if (ipv == 0) {
+      return -EINVAL;
+    }
+    if (cct->_conf.get_val<bool>("ms_bind_prefer_ipv4")) {
+      flags |= CEPH_PICK_ADDRESS_PREFER_IPV4;
+    } else {
+      flags &= ~CEPH_PICK_ADDRESS_PREFER_IPV4;
+    }
+  }
+
+  // load the explicit address and the network/interface filters for the
+  // selected mode
+  entity_addr_t addr;
+  string networks;
+  string interfaces;
+  if (addrt & CEPH_PICK_ADDRESS_PUBLIC) {
+    addr = cct->_conf.get_val<entity_addr_t>("public_addr");
+    networks = cct->_conf.get_val<std::string>("public_network");
+    interfaces =
+      cct->_conf.get_val<std::string>("public_network_interface");
+  } else if (addrt & CEPH_PICK_ADDRESS_PUBLIC_BIND) {
+    addr = cct->_conf.get_val<entity_addr_t>("public_bind_addr");
+    // XXX: we don't support _network nor _network_interface for
+    // the public_bind addrs yet.
+    if (addr.is_blank_ip()) {
+      return -ENOENT;
+    }
+  } else {
+    addr = cct->_conf.get_val<entity_addr_t>("cluster_addr");
+    networks = cct->_conf.get_val<std::string>("cluster_network");
+    interfaces =
+      cct->_conf.get_val<std::string>("cluster_network_interface");
+    if (networks.empty()) {
+      lderr(cct) << "Falling back to public interface" << dendl;
+      // fall back to public_ network and interface if cluster is not set
+      networks = cct->_conf.get_val<std::string>("public_network");
+      interfaces =
+	cct->_conf.get_val<std::string>("public_network_interface");
+    }
+  }
+  // no explicit address configured: look one up per enabled IP family
+  if (addr.is_blank_ip() &&
+      !networks.empty()) {
+    // note: pass in ipv to filter the matching addresses
+    for (auto pick_mask :  {CEPH_PICK_ADDRESS_IPV4, CEPH_PICK_ADDRESS_IPV6}) {
+      if (ipv & pick_mask) {
+        auto ip_addr = get_one_address(cct, ifa, pick_mask,
+                                       networks, interfaces,
+                                       preferred_numa_node);
+        if (ip_addr) {
+          addrs->v.push_back(*ip_addr);
+        } else {
+          // picked but not found
+          return -1;
+        }
+      }
+    }
+  }
+
+  // note: we may have a blank addr here
+
+  // ipv4 and/or ipv6?
+  // nothing was looked up above: emit the configured (possibly blank)
+  // address once per enabled family
+  if (addrs->v.empty()) {
+    addr.set_type(entity_addr_t::TYPE_MSGR2);
+    for (auto pick_mask : {CEPH_PICK_ADDRESS_IPV4, CEPH_PICK_ADDRESS_IPV6}) {
+      if (ipv & pick_mask) {
+        addr.set_family(pick_mask == CEPH_PICK_ADDRESS_IPV4 ? AF_INET : AF_INET6);
+        addrs->v.push_back(addr);
+      }
+    }
+  }
+
+  // order the preferred family first
+  std::sort(addrs->v.begin(), addrs->v.end(),
+	    [flags] (entity_addr_t& lhs, entity_addr_t& rhs) {
+	      if (flags & CEPH_PICK_ADDRESS_PREFER_IPV4) {
+		return lhs.is_ipv4() && rhs.is_ipv6();
+	      } else {
+		return lhs.is_ipv6() && rhs.is_ipv4();
+	      }
+	    });
+
+  // msgr2 or legacy or both?
+  if (msgrv == (CEPH_PICK_ADDRESS_MSGR1 | CEPH_PICK_ADDRESS_MSGR2)) {
+    // both: every address is emitted twice, msgr2 first and legacy second;
+    // the default mon ports are applied only in this branch
+    vector<entity_addr_t> v;
+    v.swap(addrs->v);
+    for (auto a : v) {
+      a.set_type(entity_addr_t::TYPE_MSGR2);
+      if (flags & CEPH_PICK_ADDRESS_DEFAULT_MON_PORTS) {
+	a.set_port(CEPH_MON_PORT_IANA);
+      }
+      addrs->v.push_back(a);
+      a.set_type(entity_addr_t::TYPE_LEGACY);
+      if (flags & CEPH_PICK_ADDRESS_DEFAULT_MON_PORTS) {
+	a.set_port(CEPH_MON_PORT_LEGACY);
+      }
+      addrs->v.push_back(a);
+    }
+  } else if (msgrv == CEPH_PICK_ADDRESS_MSGR1) {
+    for (auto& a : addrs->v) {
+      a.set_type(entity_addr_t::TYPE_LEGACY);
+    }
+  } else {
+    for (auto& a : addrs->v) {
+      a.set_type(entity_addr_t::TYPE_MSGR2);
+    }
+  }
+
+  return 0;
+}
+
+// Convenience overload: enumerate the local interfaces with getifaddrs(),
+// delegate to the ifaddrs-taking pick_addresses() and free the list again.
+//
+// Returns -errno when getifaddrs() fails, otherwise whatever the
+// delegate returns.
+int pick_addresses(
+  CephContext *cct,
+  unsigned flags,
+  entity_addrvec_t *addrs,
+  int preferred_numa_node)
+{
+  struct ifaddrs *ifa;
+  int r = getifaddrs(&ifa);
+  if (r < 0) {
+    // capture errno before anything else can overwrite it
+    r = -errno;
+    lderr(cct) << "unable to fetch interfaces and addresses: "
+	       <<  cpp_strerror(r) << dendl;
+    return r;
+  }
+  r = pick_addresses(cct, flags, ifa, addrs, preferred_numa_node);
+  freeifaddrs(ifa);
+  return r;
+}
+
+// Return the name of the first local interface whose address matches
+// `network`, or an empty string if none matches or the interface list
+// cannot be fetched.
+//
+// The comparison uses the widest possible prefix length (the bit width of
+// the larger of the IPv4/IPv6 address fields).
+// NOTE(review): how matches_with_net() treats that length for IPv4 is not
+// visible here - confirm.
+std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network)
+{
+  struct ifaddrs *ifa;
+  int r = getifaddrs(&ifa);
+  if (r < 0) {
+    string err = cpp_strerror(errno);
+    lderr(cct) << "unable to fetch interfaces and addresses: " << err << dendl;
+    return {};
+  }
+  auto free_ifa = make_scope_guard([ifa] { freeifaddrs(ifa); });
+  const unsigned int prefix_len = std::max(sizeof(in_addr::s_addr), sizeof(in6_addr::s6_addr)) * CHAR_BIT;
+  for (auto addr = ifa; addr != nullptr; addr = addr->ifa_next) {
+    // entries without an address cannot match any network
+    if (addr->ifa_addr == nullptr) {
+      continue;
+    }
+    // test the entry being visited, not the head of the list
+    if (matches_with_net(*addr, (const struct sockaddr *) &network, prefix_len,
+			 CEPH_PICK_ADDRESS_IPV4 | CEPH_PICK_ADDRESS_IPV6)) {
+      return addr->ifa_name;
+    }
+  }
+  return {};
+}
+
+
+// Check whether any address in `ls` belongs to this host.
+//
+// Walks the local interface addresses in order; for the first one that is
+// the same host as an entry of `ls`, that entry is copied to *match and
+// true is returned. Returns false when nothing matches.
+// Exits the process if the interface list cannot be fetched.
+bool have_local_addr(CephContext *cct, const std::list<entity_addr_t>& ls, entity_addr_t *match)
+{
+  struct ifaddrs *ifa;
+  if (getifaddrs(&ifa) < 0) {
+    lderr(cct) << "unable to fetch interfaces and addresses: " << cpp_strerror(errno) << dendl;
+    exit(1);
+  }
+  auto free_ifa = make_scope_guard([ifa] { freeifaddrs(ifa); });
+
+  for (struct ifaddrs *cur = ifa; cur != nullptr; cur = cur->ifa_next) {
+    if (cur->ifa_addr == nullptr) {
+      continue;
+    }
+    entity_addr_t local;
+    local.set_sockaddr(cur->ifa_addr);
+    auto hit = std::find_if(ls.begin(), ls.end(),
+                            [&local](const entity_addr_t& candidate) {
+                              return local.is_same_host(candidate);
+                            });
+    if (hit != ls.end()) {
+      *match = *hit;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Look up the NUMA node of a network interface through sysfs.
+//
+// Anything from the first ':' on (an alias suffix) is stripped from
+// `iface`. For a physical port the node is parsed from
+// /sys/class/net/<if>/device/numa_node. If that file cannot be opened the
+// interface is treated as a bond and every entry of
+// /sys/class/net/<if>/bonding/slaves is resolved recursively: *node is
+// the node shared by all slaves, -2 if they disagree, or -1 if there are
+// none.
+//
+// Returns 0 on success or a negative error code: -errno when neither file
+// can be opened, the read error, -EINVAL when numa_node does not hold a
+// plain integer, or the error of a failed slave lookup.
+int get_iface_numa_node(
+  const std::string& iface,
+  int *node)
+{
+  enum class iface_t {
+    PHY_PORT,
+    BOND_PORT
+  } ifatype = iface_t::PHY_PORT;
+  std::string_view ifa{iface};
+  if (auto pos = ifa.find(":"); pos != ifa.npos) {
+    ifa.remove_suffix(ifa.size() - pos);
+  }
+  string fn = fmt::format("/sys/class/net/{}/device/numa_node", ifa);
+  int fd = ::open(fn.c_str(), O_RDONLY);
+  if (fd < 0) {
+    fn = fmt::format("/sys/class/net/{}/bonding/slaves", ifa);
+    fd = ::open(fn.c_str(), O_RDONLY);
+    if (fd < 0) {
+      return -errno;
+    }
+    ifatype = iface_t::BOND_PORT;
+  }
+
+  int r = 0;
+  char buf[1024];
+  char *endptr = 0;
+  // read at most sizeof(buf) - 1 bytes so the terminator written below
+  // always stays inside the buffer
+  r = safe_read(fd, &buf, sizeof(buf) - 1);
+  if (r < 0) {
+    goto out;
+  }
+  buf[r] = 0;
+  // strip trailing whitespace (sysfs values end in a newline)
+  while (r > 0 && ::isspace(static_cast<unsigned char>(buf[--r]))) {
+    buf[r] = 0;
+  }
+
+  switch (ifatype) {
+  case iface_t::PHY_PORT:
+    *node = strtoll(buf, &endptr, 10);
+    // the whole string must have been consumed by the conversion
+    if (endptr != buf + strlen(buf)) {
+      r = -EINVAL;
+      goto out;
+    }
+    r = 0;
+    break;
+  case iface_t::BOND_PORT:
+    {
+      int bond_node = -1;
+      std::vector<std::string> sv;
+      std::string ifacestr = buf;
+      get_str_vec(ifacestr, " ", sv);
+      for (auto& iter : sv) {
+        int bn = -1;
+        r = get_iface_numa_node(iter, &bn);
+        if (r < 0) {
+          goto out;
+        }
+        if (bond_node == -1 || bn == bond_node) {
+          bond_node = bn;
+        } else {
+          // slaves sit on different NUMA nodes
+          *node = -2;
+          goto out;
+        }
+      }
+      *node = bond_node;
+    }
+    break;
+  }
+
+  out:
+  ::close(fd);
+  return r;
+}
+
diff --git a/src/common/pick_address.h b/src/common/pick_address.h
new file mode 100644
index 000000000..4fd77e546
--- /dev/null
+++ b/src/common/pick_address.h
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_PICK_ADDRESS_H
+#define CEPH_PICK_ADDRESS_H
+
+#include <string>
+#include <list>
+
+#include "include/common_fwd.h"
+
+struct entity_addr_t;
+class entity_addrvec_t;
+
+
+// Flag bits for pick_addresses().
+//  - PUBLIC, CLUSTER, PUBLIC_BIND: which address to pick; the
+//    entity_addrvec_t overloads require exactly one of them.
+//  - MSGR1, MSGR2: messenger protocol version(s) to generate addresses for.
+//  - IPV4, IPV6: IP families to consider.
+//  - PREFER_IPV4: order IPv4 addresses before IPv6 ones in the result.
+//  - DEFAULT_MON_PORTS: set the default monitor ports on the result when
+//    both MSGR1 and MSGR2 addresses are generated.
+#define CEPH_PICK_ADDRESS_PUBLIC      0x01
+#define CEPH_PICK_ADDRESS_CLUSTER     0x02
+#define CEPH_PICK_ADDRESS_MSGR1       0x04
+#define CEPH_PICK_ADDRESS_MSGR2       0x08
+#define CEPH_PICK_ADDRESS_IPV4        0x10
+#define CEPH_PICK_ADDRESS_IPV6        0x20
+#define CEPH_PICK_ADDRESS_PREFER_IPV4 0x40
+#define CEPH_PICK_ADDRESS_DEFAULT_MON_PORTS  0x80
+#define CEPH_PICK_ADDRESS_PUBLIC_BIND 0x100
+
+#ifndef WITH_SEASTAR
+/*
+  Pick addresses based on subnets if needed.
+
+  If an address is not explicitly given, and a list of subnets is
+  given, find an assigned IP address in the subnets and set that.
+
+  cluster_addr is set based on cluster_network, public_addr is set
+  based on public_network.
+
+  cluster_network and public_network are a list of ip/prefix pairs.
+
+  All IP addresses assigned to all local network interfaces are
+  potential matches.
+
+  If multiple IP addresses match the subnet, one of them will be
+  picked, effectively randomly.
+
+  This function will exit on error.
+
+  @param cct context whose configuration is read and updated
+  @param needs bitmask of CEPH_PICK_ADDRESS_PUBLIC and
+         CEPH_PICK_ADDRESS_CLUSTER selecting which addresses to fill in
+ */
+void pick_addresses(CephContext *cct, int needs);
+
+#endif	// !WITH_SEASTAR
+
+/**
+ * Compute the address vector for one address class.
+ *
+ * The first overload fetches the local interface list itself; the second
+ * one searches the caller supplied list @a ifa.
+ *
+ * @param cct context supplying the configuration
+ * @param flags bitmask of CEPH_PICK_ADDRESS_* flags; exactly one of
+ *        PUBLIC, PUBLIC_BIND and CLUSTER must be set
+ * @param ifa list of network interface addresses to search
+ * @param addrs [out] resulting addresses; cleared first
+ * @param preferred_numa_node NUMA node passed on to the address lookup
+ * @return 0 on success, a negative value on error
+ */
+int pick_addresses(CephContext *cct, unsigned flags, entity_addrvec_t *addrs,
+		   int preferred_numa_node = -1);
+int pick_addresses(CephContext *cct, unsigned flags, struct ifaddrs *ifa,
+		   entity_addrvec_t *addrs,
+		   int preferred_numa_node = -1);
+
+/**
+ * Find a network interface whose address matches the address/netmask
+ * in `network`.
+ *
+ * @param cct context (used for logging)
+ * @param network address to look for
+ * @return the interface name, or an empty string if no interface matches
+ *         or the interface list cannot be fetched
+ */
+std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network);
+
+/**
+ * check for a locally configured address
+ *
+ * check if any of the listed addresses is configured on the local host.
+ * The process exits if the local interface list cannot be fetched.
+ *
+ * @param cct context
+ * @param ls list of addresses
+ * @param match [out] pointer to match, if an item in @a ls is found configured locally.
+ * @return true if a match was found (and stored in @a match), false otherwise
+ */
+bool have_local_addr(CephContext *cct, const std::list<entity_addr_t>& ls, entity_addr_t *match);
+
+/**
+ * filter the addresses in @c ifa with specified interfaces, networks and IPv
+ * and return the best scoring one
+ *
+ * @param cct
+ * @param ifa a list of network interface addresses to be filtered
+ * @param ipv bitmask of CEPH_PICK_ADDRESS_IPV4 and CEPH_PICK_ADDRESS_IPV6.
+ *        it is used to filter the @c networks
+ * @param networks a comma separated list of networks as the allow list. only
+ *        the addresses in the specified networks are allowed. all addresses
+ *        are accepted if it is empty.
+ * @param interfaces a comma separated list of interfaces for the allow list.
+ *        all addresses are accepted if it is empty
+ * @param numa_node NUMA node used when scoring the candidate addresses
+ * @return the chosen address; it points into @c ifa and is therefore only
+ *         valid as long as that list is.
+ *         NOTE(review): presumably nullptr when nothing qualifies - confirm
+ *         against the implementation.
+ */
+const struct sockaddr *find_ip_in_subnet_list(
+  CephContext *cct,
+  const struct ifaddrs *ifa,
+  unsigned ipv,
+  const std::string &networks,
+  const std::string &interfaces,
+  int numa_node=-1);
+
+/**
+ * look up the NUMA node of a network interface via /sys/class/net
+ *
+ * @param iface interface name; anything from the first ':' on is ignored
+ * @param node [out] NUMA node. for a bonded interface this is the node
+ *        shared by all slaves, -2 if the slaves are on different nodes
+ * @return 0 on success, a negative error code otherwise
+ */
+int get_iface_numa_node(
+  const std::string& iface,
+  int *node);
+
+#endif
diff --git a/src/common/ppc-asm.h b/src/common/ppc-asm.h
new file mode 100644
index 000000000..be34cf8fe
--- /dev/null
+++ b/src/common/ppc-asm.h
@@ -0,0 +1,381 @@
+/* PowerPC asm definitions for GNU C.
+
+Copyright (C) 2002-2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Under winnt, 1) gas supports the following as names and 2) in particular
+   defining "toc" breaks the FUNC_START macro as ".toc" becomes ".2" */
+
+/* General-purpose register names mapped to their numbers.  Registers 1
+   and 2 are exposed as "sp" and "toc" rather than r1/r2.  */
+#define r0	0
+#define sp	1
+#define toc	2
+#define r3	3
+#define r4	4
+#define r5	5
+#define r6	6
+#define r7	7
+#define r8	8
+#define r9	9
+#define r10	10
+#define r11	11
+#define r12	12
+#define r13	13
+#define r14	14
+#define r15	15
+#define r16	16
+#define r17	17
+#define r18	18
+#define r19     19
+#define r20	20
+#define r21	21
+#define r22	22
+#define r23	23
+#define r24	24
+#define r25	25
+#define r26	26
+#define r27	27
+#define r28	28
+#define r29	29
+#define r30	30
+#define r31	31
+
+/* Condition register field names cr0..cr7 mapped to their numbers.  */
+#define cr0	0
+#define cr1	1
+#define cr2	2
+#define cr3	3
+#define cr4	4
+#define cr5	5
+#define cr6	6
+#define cr7	7
+
+/* Floating-point register names f0..f31 mapped to their numbers.  */
+#define f0	0
+#define f1	1
+#define f2	2
+#define f3	3
+#define f4	4
+#define f5	5
+#define f6	6
+#define f7	7
+#define f8	8
+#define f9	9
+#define f10	10
+#define f11	11
+#define f12	12
+#define f13	13
+#define f14	14
+#define f15	15
+#define f16	16
+#define f17	17
+#define f18	18
+#define f19	19
+#define f20	20
+#define f21	21
+#define f22	22
+#define f23	23
+#define f24	24
+#define f25	25
+#define f26	26
+#define f27	27
+#define f28	28
+#define f29	29
+#define f30	30
+#define f31	31
+
+/* With VSX the f-names continue past 31: f32..f63 map to the numbers
+   32..63.  Every name maps to its own number.  */
+#ifdef __VSX__
+#define f32	32
+#define f33	33
+#define f34	34
+#define f35	35
+#define f36	36
+#define f37	37
+#define f38	38
+#define f39	39
+#define f40	40
+#define f41	41
+#define f42	42
+#define f43	43
+#define f44	44
+#define f45	45
+#define f46	46
+#define f47	47
+#define f48	48
+#define f49	49
+#define f50	50
+#define f51	51
+#define f52	52
+#define f53	53
+#define f54	54
+#define f55	55
+#define f56	56
+#define f57	57
+#define f58	58
+#define f59	59
+#define f60	60
+#define f61	61
+#define f62	62
+#define f63	63
+#endif
+
+/* Vector register names v0..v31 mapped to their numbers; only defined
+   when __ALTIVEC__ is.  */
+#ifdef __ALTIVEC__
+#define v0	0
+#define v1	1
+#define v2	2
+#define v3	3
+#define v4	4
+#define v5	5
+#define v6	6
+#define v7	7
+#define v8	8
+#define v9	9
+#define v10	10
+#define v11	11
+#define v12	12
+#define v13	13
+#define v14	14
+#define v15	15
+#define v16	16
+#define v17	17
+#define v18	18
+#define v19	19
+#define v20	20
+#define v21	21
+#define v22	22
+#define v23	23
+#define v24	24
+#define v25	25
+#define v26	26
+#define v27	27
+#define v28	28
+#define v29	29
+#define v30	30
+#define v31	31
+#endif
+
+/* VSX register names vs0..vs63 mapped to their numbers; only defined
+   when __VSX__ is.  Every name maps to its own number.  */
+#ifdef __VSX__
+#define vs0	0
+#define vs1	1
+#define vs2	2
+#define vs3	3
+#define vs4	4
+#define vs5	5
+#define vs6	6
+#define vs7	7
+#define vs8	8
+#define vs9	9
+#define vs10	10
+#define vs11	11
+#define vs12	12
+#define vs13	13
+#define vs14	14
+#define vs15	15
+#define vs16	16
+#define vs17	17
+#define vs18	18
+#define vs19	19
+#define vs20	20
+#define vs21	21
+#define vs22	22
+#define vs23	23
+#define vs24	24
+#define vs25	25
+#define vs26	26
+#define vs27	27
+#define vs28	28
+#define vs29	29
+#define vs30	30
+#define vs31	31
+#define vs32	32
+#define vs33	33
+#define vs34	34
+#define vs35	35
+#define vs36	36
+#define vs37	37
+#define vs38	38
+#define vs39	39
+#define vs40	40
+#define vs41	41
+#define vs42	42
+#define vs43	43
+#define vs44	44
+#define vs45	45
+#define vs46	46
+#define vs47	47
+#define vs48	48
+#define vs49	49
+#define vs50	50
+#define vs51	51
+#define vs52	52
+#define vs53	53
+#define vs54	54
+#define vs55	55
+#define vs56	56
+#define vs57	57
+#define vs58	58
+#define vs59	59
+#define vs60	60
+#define vs61	61
+#define vs62	62
+#define vs63	63
+#endif
+
+/*
+ * Macros to glue together two tokens.
+ *
+ * XGLUE pastes its arguments exactly as written (with ## under __STDC__,
+ * otherwise with the empty-comment trick).  GLUE goes through XGLUE so
+ * that macro arguments are expanded before they are pasted.
+ */
+
+#ifdef __STDC__
+#define XGLUE(a,b) a##b
+#else
+#define XGLUE(a,b) a/**/b
+#endif
+
+#define GLUE(a,b) XGLUE(a,b)
+
+/*
+ * Macros to begin and end a function written in assembler.  If -mcall-aixdesc
+ * or -mcall-nt, create a function descriptor with the given name, and create
+ * the real function with one or two leading periods respectively.
+ *
+ * Every ABI branch below defines the same set of macros:
+ *   FUNC_NAME(name)    symbol of the function's code
+ *   JUMP_TARGET(name)  symbol to use when branching to the function
+ *   FUNC_START(name)   directives and labels that open the function
+ *   HIDDEN_FUNC(name)  FUNC_START plus .hidden for its symbol(s)
+ *   FUNC_END(name)     .size directive that closes the function
+ */
+
+/* 64-bit with _CALL_ELF == 2: no descriptor; FUNC_START loads r2 from
+   r12 and .TOC. and then marks the local entry point.  */
+#if defined(__powerpc64__) && _CALL_ELF == 2
+
+/* Defining "toc" above breaks @toc in assembler code.  */
+#undef toc
+
+#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name)
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+	.type FUNC_NAME(name),@function; \
+	.globl FUNC_NAME(name); \
+FUNC_NAME(name): \
+0:	addis 2,12,(.TOC.-0b)@ha; \
+	addi 2,2,(.TOC.-0b)@l; \
+	.localentry FUNC_NAME(name),.-FUNC_NAME(name)
+
+#define HIDDEN_FUNC(name) \
+  FUNC_START(name) \
+  .hidden FUNC_NAME(name);
+
+#define FUNC_END(name) \
+	.size FUNC_NAME(name),.-FUNC_NAME(name)
+
+/* Other 64-bit targets: "name" labels a three-quadword descriptor in
+   .opd (code address, TOC base, 0); the code itself is ".name".  */
+#elif defined (__powerpc64__)
+
+#define FUNC_NAME(name) GLUE(.,name)
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+	.section ".opd","aw"; \
+name: \
+	.quad GLUE(.,name); \
+	.quad .TOC.@tocbase; \
+	.quad 0; \
+	.previous; \
+	.type GLUE(.,name),@function; \
+	.globl name; \
+	.globl GLUE(.,name); \
+GLUE(.,name):
+
+#define HIDDEN_FUNC(name) \
+  FUNC_START(name) \
+  .hidden name;	\
+  .hidden GLUE(.,name);
+
+#define FUNC_END(name) \
+GLUE(.L,name): \
+	.size GLUE(.,name),GLUE(.L,name)-GLUE(.,name)
+
+/* _CALL_AIXDESC: same layout with 32-bit words; the descriptor goes to
+   .got2 when _RELOCATABLE is defined and to .got1 otherwise.  */
+#elif defined(_CALL_AIXDESC)
+
+#ifdef _RELOCATABLE
+#define DESC_SECTION ".got2"
+#else
+#define DESC_SECTION ".got1"
+#endif
+
+#define FUNC_NAME(name) GLUE(.,name)
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#define FUNC_START(name) \
+	.section DESC_SECTION,"aw"; \
+name: \
+	.long GLUE(.,name); \
+	.long _GLOBAL_OFFSET_TABLE_; \
+	.long 0; \
+	.previous; \
+	.type GLUE(.,name),@function; \
+	.globl name; \
+	.globl GLUE(.,name); \
+GLUE(.,name):
+
+#define HIDDEN_FUNC(name) \
+  FUNC_START(name) \
+  .hidden name; \
+  .hidden GLUE(.,name);
+
+#define FUNC_END(name) \
+GLUE(.L,name): \
+	.size GLUE(.,name),GLUE(.L,name)-GLUE(.,name)
+
+/* Everything else: a plain global label; position independent code
+   branches through name@plt.  */
+#else
+
+#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name)
+#if defined __PIC__ || defined __pic__
+#define JUMP_TARGET(name) FUNC_NAME(name@plt)
+#else
+#define JUMP_TARGET(name) FUNC_NAME(name)
+#endif
+#define FUNC_START(name) \
+	.type FUNC_NAME(name),@function; \
+	.globl FUNC_NAME(name); \
+FUNC_NAME(name):
+
+#define HIDDEN_FUNC(name) \
+  FUNC_START(name) \
+  .hidden FUNC_NAME(name);
+
+#define FUNC_END(name) \
+GLUE(.L,name): \
+	.size FUNC_NAME(name),GLUE(.L,name)-FUNC_NAME(name)
+#endif
+
+/* CFI_* wrappers, only defined when IN_GCC is: they expand to the
+   matching .cfi_* directive if HAVE_GAS_CFI_DIRECTIVE is defined and to
+   nothing otherwise.  */
+#ifdef IN_GCC
+/* For HAVE_GAS_CFI_DIRECTIVE.  */
+#include "auto-host.h"
+
+#ifdef HAVE_GAS_CFI_DIRECTIVE
+# define CFI_STARTPROC			.cfi_startproc
+# define CFI_ENDPROC			.cfi_endproc
+# define CFI_OFFSET(reg, off)		.cfi_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg)	.cfi_def_cfa_register reg
+# define CFI_RESTORE(reg)		.cfi_restore reg
+#else
+# define CFI_STARTPROC
+# define CFI_ENDPROC
+# define CFI_OFFSET(reg, off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_RESTORE(reg)
+#endif
+#endif
+
+/* On Linux, emit an empty .note.GNU-stack section and switch back to the
+   previous section.  The note conventionally marks the object as not
+   needing an executable stack.  */
+#if defined __linux__
+	.section .note.GNU-stack
+	.previous
+#endif
diff --git a/src/common/ppc-opcode.h b/src/common/ppc-opcode.h
new file mode 100644
index 000000000..16d5b2e37
--- /dev/null
+++ b/src/common/ppc-opcode.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ *  a) the GNU General Public License as published by the Free Software
+ *     Foundation; either version 2 of the License, or (at your option)
+ *     any later version, or
+ *  b) the Apache License, Version 2.0
+ */
+#ifndef __OPCODES_H
+#define __OPCODES_H
+
+/* Field encoders: place an operand number into its position inside the
+   32-bit instruction word.
+   __PPC_RA / __PPC_RB take a 5-bit number (shifted to bit 16 / bit 11).
+   __PPC_XA / __PPC_XB / __PPC_XS take a 6-bit number: the low five bits
+   go to bit 16 / 11 / 21 and bit 5 (0x20) is moved down to bit 2 / 1 / 0
+   of the word.  */
+#define __PPC_RA(a)		(((a) & 0x1f) << 16)
+#define __PPC_RB(b)		(((b) & 0x1f) << 11)
+#define __PPC_XA(a)		((((a) & 0x1f) << 16) | (((a) & 0x20) >> 3))
+#define __PPC_XB(b)		((((b) & 0x1f) << 11) | (((b) & 0x20) >> 4))
+#define __PPC_XS(s)		((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5))
+#define __PPC_XT(s)		__PPC_XS(s)
+/* Operand fields for the two instruction forms used below.  */
+#define VSX_XX3(t, a, b)	(__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b))
+#define VSX_XX1(s, a, b)	(__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b))
+
+/* Base opcodes with all operand fields zero.  */
+#define PPC_INST_VPMSUMW	0x10000488
+#define PPC_INST_VPMSUMD	0x100004c8
+#define PPC_INST_MFVSRD		0x7c000066
+#define PPC_INST_MTVSRD		0x7c000166
+
+/* Emit the instruction as a raw .long: base opcode OR'ed with its operand
+   fields.  MFVRD/MTVRD add 32 to the register number t before encoding
+   it (presumably selecting the upper half of the 64 VSX registers; verify
+   against the ISA).  */
+#define VPMSUMW(t, a, b)	.long PPC_INST_VPMSUMW | VSX_XX3((t), a, b)
+#define VPMSUMD(t, a, b)	.long PPC_INST_VPMSUMD | VSX_XX3((t), a, b)
+#define MFVRD(a, t)		.long PPC_INST_MFVSRD | VSX_XX1((t)+32, a, 0)
+#define MTVRD(t, a)		.long PPC_INST_MTVSRD | VSX_XX1((t)+32, a, 0)
+
+#endif
+/* Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+/* NOTE(review): this section uses the same __OPCODES_H guard as the
+   section earlier in this file and repeats the same definitions, so the
+   preprocessor skips it whenever the earlier section has been seen.  */
+#ifndef __OPCODES_H
+#define __OPCODES_H
+
+/* Field encoders: place an operand number into its position inside the
+   32-bit instruction word (see the bit masks and shifts below).  */
+#define __PPC_RA(a)		(((a) & 0x1f) << 16)
+#define __PPC_RB(b)		(((b) & 0x1f) << 11)
+#define __PPC_XA(a)		((((a) & 0x1f) << 16) | (((a) & 0x20) >> 3))
+#define __PPC_XB(b)		((((b) & 0x1f) << 11) | (((b) & 0x20) >> 4))
+#define __PPC_XS(s)		((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5))
+#define __PPC_XT(s)		__PPC_XS(s)
+#define VSX_XX3(t, a, b)	(__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b))
+#define VSX_XX1(s, a, b)	(__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b))
+
+/* Base opcodes with all operand fields zero.  */
+#define PPC_INST_VPMSUMW	0x10000488
+#define PPC_INST_VPMSUMD	0x100004c8
+#define PPC_INST_MFVSRD		0x7c000066
+#define PPC_INST_MTVSRD		0x7c000166
+
+/* Emit the instruction as a raw .long: base opcode OR'ed with its operand
+   fields.  */
+#define VPMSUMW(t, a, b)	.long PPC_INST_VPMSUMW | VSX_XX3((t), a, b)
+#define VPMSUMD(t, a, b)	.long PPC_INST_VPMSUMD | VSX_XX3((t), a, b)
+#define MFVRD(a, t)		.long PPC_INST_MFVSRD | VSX_XX1((t)+32, a, 0)
+#define MTVRD(t, a)		.long PPC_INST_MTVSRD | VSX_XX1((t)+32, a, 0)
+
+#endif
diff --git a/src/common/pretty_binary.cc b/src/common/pretty_binary.cc
new file mode 100644
index 000000000..d58b0fb6e
--- /dev/null
+++ b/src/common/pretty_binary.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "pretty_binary.h"
+#include <stdexcept>
+#include <sstream>
+
+// Decode the "pretty" notation back into raw bytes.
+//
+// The input alternates between quoted text ('...', where '' stands for a
+// literal quote) and hex runs (pairs of hex digits). It must start with
+// either a quote or "0x"; a quoted section that is followed by more than
+// one character must be followed by "0x" and at least one more character.
+// An empty input decodes to an empty string.
+//
+// Throws std::invalid_argument naming the offending position when the
+// input is malformed, including an unterminated quoted section.
+std::string pretty_binary_string_reverse(const std::string& pretty)
+{
+  const size_t len = pretty.length();
+  size_t pos = 0;
+
+  auto fail = [&pretty](size_t where) {
+    std::ostringstream msg;
+    msg << "invalid char at pos " << where << " of " << pretty;
+    throw std::invalid_argument(msg.str());
+  };
+  // value of a hex digit, or -1 if ch is not one
+  auto nibble = [](unsigned char ch) -> int32_t {
+    if ('0' <= ch && ch <= '9') return ch - '0';
+    if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;
+    if ('A' <= ch && ch <= 'F') return ch - 'A' + 10;
+    return -1;
+  };
+  // consume exactly the character `want` at the current position
+  auto expect = [&](unsigned char want) {
+    if (pos >= len || pretty[pos] != want) {
+      fail(pos);
+    }
+    ++pos;
+  };
+
+  std::string bin;
+  if (pretty.empty()) {
+    return bin;
+  }
+  bin.reserve(len);
+
+  // prologue: a quote opens text mode, "0x" opens hex mode
+  bool in_text = false;
+  if (pretty[0] == '\'') {
+    pos = 1;
+    in_text = true;
+  } else if (pretty[0] == '0') {
+    pos = 1;
+    expect('x');
+    if (pos == len) {
+      fail(pos);
+    }
+  } else {
+    fail(0);
+  }
+
+  while (pos < len) {
+    const char ch = pretty[pos];
+    if (in_text) {
+      if (ch != '\'') {
+        bin.push_back(ch);
+        ++pos;
+        continue;
+      }
+      if (pos + 1 < len && pretty[pos + 1] == '\'') {
+        // doubled quote: one literal quote
+        bin.push_back('\'');
+        pos += 2;
+        continue;
+      }
+      // closing quote: what follows (if more than one char) is a hex run
+      ++pos;
+      in_text = false;
+      if (pos + 1 < len) {
+        expect('0');
+        expect('x');
+        if (pos == len) {
+          fail(pos);
+        }
+      }
+      continue;
+    }
+
+    // hex mode
+    if (ch == '\'') {
+      in_text = true;
+      ++pos;
+      continue;
+    }
+    const int32_t high = nibble(ch);
+    if (high < 0) {
+      fail(pos);
+    }
+    ++pos;
+    if (pos >= len) {
+      fail(pos);
+    }
+    const int32_t low = nibble(pretty[pos]);
+    if (low < 0) {
+      fail(pos);
+    }
+    bin.push_back(high * 0x10 + low);
+    ++pos;
+  }
+
+  // input ended inside a quoted section
+  if (in_text) {
+    fail(pos);
+  }
+  return bin;
+}
diff --git a/src/common/pretty_binary.h b/src/common/pretty_binary.h
new file mode 100644
index 000000000..5f1829747
--- /dev/null
+++ b/src/common/pretty_binary.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+
+// Render arbitrary bytes in a human readable form.
+//
+// Printable ASCII (codes 32..126) is emitted inside single quotes, with an
+// embedded quote doubled; all other bytes are emitted as pairs of
+// upper-case hex digits introduced by "0x". While in hex mode the output
+// only switches to quoted text when the current byte and the two after it
+// are all printable. An empty input yields an empty string.
+template<typename S>
+static std::string pretty_binary_string(const S& bin)
+{
+  std::string pretty;
+  if (bin.empty())
+    return pretty;
+  pretty.reserve(bin.length() * 3);
+
+  auto is_text = [](unsigned char ch) -> bool {
+    return ch >= 32 && ch <= 126;
+  };
+  auto put_hex = [&pretty](unsigned char ch) {
+    static const char digits[] = "0123456789ABCDEF";
+    pretty.push_back(digits[ch >> 4]);
+    pretty.push_back(digits[ch & 0x0f]);
+  };
+
+  // prologue: the first byte decides the initial mode
+  bool quoted = is_text(bin[0]);
+  if (quoted) {
+    pretty.push_back('\'');
+  } else {
+    pretty.append("0x");
+  }
+
+  const size_t len = bin.length();
+  for (size_t idx = 0; idx < len; ++idx) {
+    if (quoted && !is_text(bin[idx])) {
+      // text -> hex: close the quote and open a hex run
+      pretty.append("'0x");
+      quoted = false;
+    } else if (!quoted &&
+               idx + 2 < len &&
+               is_text(bin[idx]) &&
+               is_text(bin[idx + 1]) &&
+               is_text(bin[idx + 2])) {
+      // hex -> text: only worth it for three printable bytes in a row
+      pretty.push_back('\'');
+      quoted = true;
+    }
+
+    if (!quoted) {
+      put_hex(bin[idx]);
+      continue;
+    }
+    if (bin[idx] == '\'')
+      pretty.push_back('\'');
+    pretty.push_back(bin[idx]);
+  }
+
+  // epilogue: close a pending quote
+  if (quoted) {
+    pretty.push_back('\'');
+  }
+  return pretty;
+}
+
+// Parse the quoted-text / "0x" hex notation in `pretty` back into a
+// string of raw bytes.
+std::string pretty_binary_string_reverse(const std::string& pretty);
diff --git a/src/common/ptr_wrapper.h b/src/common/ptr_wrapper.h
new file mode 100644
index 000000000..2780a64bf
--- /dev/null
+++ b/src/common/ptr_wrapper.h
@@ -0,0 +1,45 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+// Thin wrapper around a raw T*. The `id` parameter is not used by the
+// wrapper itself; it only makes wrappers with different ids distinct
+// types. The wrapper has no destructor, so it never frees the pointee.
+template <typename T, int id>
+struct ptr_wrapper
+{
+  T *_p;
+
+  ptr_wrapper() : _p{nullptr} {}
+  ptr_wrapper(T *v) : _p{v} {}
+
+  // rebinding yields the raw pointer, not the wrapper
+  T *operator=(T *p) {
+    return _p = p;
+  }
+
+  T *get() {
+    return _p;
+  }
+
+  T* operator->() {
+    return get();
+  }
+
+  T& operator*() {
+    return *get();
+  }
+};
+
+
diff --git a/src/common/random_string.cc b/src/common/random_string.cc
new file mode 100644
index 000000000..c72895618
--- /dev/null
+++ b/src/common/random_string.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string_view>
+#include "auth/Crypto.h"
+#include "common/armor.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "random_string.h"
+
+int gen_rand_base64(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  // Heap-backed scratch buffers: variable-length arrays are not standard C++.
+  std::string buf(size, '\0');
+  std::string tmp_dest(size + 4, '\0'); /* so that there's space for the extra '=' characters, and some */
+
+  cct->random()->get_bytes(buf.data(), buf.size());
+  // Only as many random bytes as are needed for size-1 output characters
+  // are armored (presumably 3 input bytes per 4 characters, as in base64).
+  const size_t src_len = ((size - 1) * 3 + 4 - 1) / 4;
+  const int ret = ceph_armor(tmp_dest.data(), tmp_dest.data() + tmp_dest.size(),
+			     buf.data(), buf.data() + src_len);
+  if (ret < 0) {
+    lderr(cct) << "ceph_armor failed" << dendl;
+    return ret;  // negative code reported by ceph_armor()
+  }
+  memcpy(dest, tmp_dest.data(), size);  // tmp_dest is zero-filled past ret
+  dest[size-1] = '\0';
+  return 0;
+}
+
+// Overwrite dest[0..size) with characters of 'table' (must be non-empty),
+// one random byte per character; slight modulo bias if 256 % length != 0.
+static void choose_from(CryptoRandom* random, std::string_view table,
+                        char *dest, size_t size)
+{
+  random->get_bytes(dest, size);
+  for (size_t i = 0; i < size; i++) {
+    // go through unsigned char so a negative char is not sign-extended
+    dest[i] = table[static_cast<unsigned char>(dest[i]) % table.size()];
+  }
+}
+
+
+void gen_rand_alphanumeric(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  // URL-friendly take on the base64 alphabet: '-' and '_' are the extras
+  static constexpr char charset[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+  dest[size - 1] = '\0';  // terminator sits right after the random part
+  choose_from(cct->random(), charset, dest, size - 1);
+}
+
+std::string gen_rand_alphanumeric(CephContext *cct, size_t size)
+{
+  // one spare byte for the NUL that the char* overload writes at the end
+  std::string result(size + 1, '\0');
+  gen_rand_alphanumeric(cct, result.data(), result.size());
+  result.resize(size); // drop that terminator again
+  return result;
+}
+
+void gen_rand_alphanumeric_lower(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  static constexpr char charset[] = "0123456789abcdefghijklmnopqrstuvwxyz";  // digits + lower case
+  dest[size - 1] = '\0';  // terminator sits right after the random part
+  choose_from(cct->random(), charset, dest, size - 1);
+}
+
+std::string gen_rand_alphanumeric_lower(CephContext *cct, size_t size)
+{
+  // one spare byte for the NUL that the char* overload writes at the end
+  std::string result(size + 1, '\0');
+  gen_rand_alphanumeric_lower(cct, result.data(), result.size());
+  result.resize(size); // drop that terminator again
+  return result;
+}
+
+
+void gen_rand_alphanumeric_upper(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  static constexpr char charset[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";  // digits + upper case
+  dest[size - 1] = '\0';  // terminator sits right after the random part
+  choose_from(cct->random(), charset, dest, size - 1);
+}
+
+std::string gen_rand_alphanumeric_upper(CephContext *cct, size_t size)
+{
+  // one spare byte for the NUL that the char* overload writes at the end
+  std::string result(size + 1, '\0');
+  gen_rand_alphanumeric_upper(cct, result.data(), result.size());
+  result.resize(size); // drop that terminator again
+  return result;
+}
+
+
+void gen_rand_alphanumeric_no_underscore(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  static constexpr char charset[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-.";  // '.' instead of '_'
+  dest[size - 1] = '\0';  // terminator sits right after the random part
+  choose_from(cct->random(), charset, dest, size - 1);
+}
+
+std::string gen_rand_alphanumeric_no_underscore(CephContext *cct, size_t size)
+{
+  // one spare byte for the NUL that the char* overload writes at the end
+  std::string result(size + 1, '\0');
+  gen_rand_alphanumeric_no_underscore(cct, result.data(), result.size());
+  result.resize(size); // drop that terminator again
+  return result;
+}
+
+
+void gen_rand_alphanumeric_plain(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  static constexpr char charset[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";  // letters and digits only
+  dest[size - 1] = '\0';  // terminator sits right after the random part
+  choose_from(cct->random(), charset, dest, size - 1);
+}
+
+std::string gen_rand_alphanumeric_plain(CephContext *cct, size_t size)
+{
+  // one spare byte for the NUL that the char* overload writes at the end
+  std::string result(size + 1, '\0');
+  gen_rand_alphanumeric_plain(cct, result.data(), result.size());
+  result.resize(size); // drop that terminator again
+  return result;
+}
diff --git a/src/common/random_string.h b/src/common/random_string.h
new file mode 100644
index 000000000..b5dd9825e
--- /dev/null
+++ b/src/common/random_string.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2015 Yehuda Sadeh <yehuda@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <string>
+#include "include/common_fwd.h"
+
+/* size should be the required string size + 1 */
+int gen_rand_base64(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric_lower(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric_upper(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric_no_underscore(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric_plain(CephContext *cct, char *dest, size_t size);
+
+// returns a std::string with 'size' random characters
+std::string gen_rand_alphanumeric(CephContext *cct, size_t size);
+std::string gen_rand_alphanumeric_lower(CephContext *cct, size_t size);
+std::string gen_rand_alphanumeric_upper(CephContext *cct, size_t size);
+std::string gen_rand_alphanumeric_no_underscore(CephContext *cct, size_t size);
+std::string gen_rand_alphanumeric_plain(CephContext *cct, size_t size);
diff --git a/src/common/ref.h b/src/common/ref.h
new file mode 100644
index 000000000..055c9a07a
--- /dev/null
+++ b/src/common/ref.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef COMMON_REF_H
+#define COMMON_REF_H
+
+#include <boost/intrusive_ptr.hpp>
+
+namespace ceph {
+template<typename T> using ref_t = boost::intrusive_ptr<T>;  // counted reference to a mutable T
+template<typename T> using cref_t = boost::intrusive_ptr<const T>;  // counted reference to a const T
+template<class T, class U>  // copying cast: r keeps its reference, the result holds its own
+ref_t<T> ref_cast(const ref_t<U>& r) noexcept {
+  return static_cast<T*>(r.get());  // unchecked static_cast of the pointee
+}
+template<class T, class U>  // moving cast: the result takes over r's reference
+ref_t<T> ref_cast(ref_t<U>&& r) noexcept {
+  return {static_cast<T*>(r.detach()), false};  // false: do not add another reference
+}
+template<class T, class U>  // copying cast for references to const objects
+cref_t<T> ref_cast(const cref_t<U>& r) noexcept {
+  return static_cast<const T*>(r.get());
+}
+template<class T, typename... Args>  // heap-allocates a T from args and wraps it
+ceph::ref_t<T> make_ref(Args&&... args) {
+  return {new T(std::forward<Args>(args)...), false};  // adopts without add_ref; presumably T starts with one reference -- confirm
+}
+}
+
+// Friends cannot be partial specializations (https://en.cppreference.com/w/cpp/language/friend), so this befriends every ceph::make_ref<T, Args...>; the macro argument C is not used.
+#define FRIEND_MAKE_REF(C) \
+template<class T, typename... Args> friend ceph::ref_t<T> ceph::make_ref(Args&&... args)
+
+#endif
diff --git a/src/common/reverse.c b/src/common/reverse.c
new file mode 100644
index 000000000..f65540d54
--- /dev/null
+++ b/src/common/reverse.c
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "reverse.h"
+
+/* Return v with the order of its 32 bits reversed (bit 0 <-> bit 31). */
+uint32_t reverse_bits(uint32_t v) {
+  /* Each round swaps adjacent groups of 'width' bits; doubling the
+   * group width every round reverses the whole word in five rounds. */
+  static const uint32_t low_groups[] = {
+    0x55555555, /* width 1: odd and even bits */
+    0x33333333, /* width 2: consecutive pairs */
+    0x0F0F0F0F, /* width 4: nibbles */
+    0x00FF00FF, /* width 8: bytes */
+    0x0000FFFF, /* width 16: 2-byte long pairs */
+  };
+  unsigned round;
+  for (round = 0; v != 0 && round < 5; ++round) {
+    const unsigned width = 1u << round;
+    v = ((v >> width) & low_groups[round]) | ((v & low_groups[round]) << width);
+  }
+  return v;
+}
+
+uint32_t reverse_nibbles(uint32_t retval) {
+  /* Reverse the order of the eight 4-bit nibbles: swap the nibbles within
+   * each byte, then the bytes within each half, then the two halves. */
+  const uint32_t n = ((retval << 4) & 0xf0f0f0f0) | ((retval >> 4) & 0x0f0f0f0f);
+  const uint32_t b = ((n << 8) & 0xff00ff00) | ((n >> 8) & 0x00ff00ff);
+  return (b << 16) | (b >> 16);
+}
diff --git a/src/common/reverse.h b/src/common/reverse.h
new file mode 100644
index 000000000..9a199a847
--- /dev/null
+++ b/src/common/reverse.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __CEPH_OS_REVERSE_H
+#define __CEPH_OS_REVERSE_H
+
+#include "include/int_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t reverse_bits(uint32_t v);
+extern uint32_t reverse_nibbles(uint32_t retval);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif    
diff --git a/src/common/run_cmd.cc b/src/common/run_cmd.cc
new file mode 100644
index 000000000..855b6e537
--- /dev/null
+++ b/src/common/run_cmd.cc
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+
+#ifndef _WIN32
+#include <sstream>
+#include <stdarg.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <vector>
+#else
+#include "common/SubProcess.h"
+#endif /* _WIN32 */
+
+using std::ostringstream;
+
+#ifndef _WIN32
+// Fork and exec 'cmd' with the NULL-terminated variadic argument list, then
+// wait for the child. Returns "" when the child exits with status 0 and an
+// error description otherwise.
+std::string run_cmd(const char *cmd, ...)
+{
+  ostringstream msg;
+  msg << "run_cmd(" << cmd << "): ";  // shared prefix of every error text
+  // argv[0] is the command itself, followed by the caller's arguments
+  std::vector <const char *> argv;
+  va_list args;
+  va_start(args, cmd);
+  const char *arg = cmd;
+  do {
+    argv.push_back(arg);
+    arg = va_arg(args, const char*);
+  } while (arg != NULL);
+  va_end(args);
+  argv.push_back(NULL);
+
+  const int child = fork();
+  if (child == -1) {
+    const int err = errno;
+    msg << "unable to fork(): " << cpp_strerror(err);
+    return msg.str();
+  }
+  if (child == 0) {
+    // child: drop the inherited standard streams before exec'ing
+    close(STDIN_FILENO);
+    close(STDOUT_FILENO);
+    close(STDERR_FILENO);
+    // execvp doesn't modify its arguments, so the const-cast here is safe.
+    execvp(cmd, (char * const*)&argv[0]);
+    _exit(127);  // only reached if execvp() failed
+  }
+
+  int status;
+  while (waitpid(child, &status, 0) == -1) {
+    const int err = errno;
+    if (err != EINTR) {
+      msg << "waitpid error: " << cpp_strerror(err);
+      return msg.str();
+    }
+  }
+
+  if (WIFEXITED(status)) {
+    const int code = WEXITSTATUS(status);
+    if (code == 0)
+      return "";
+    msg << "exited with status " << code;
+  } else if (WIFSIGNALED(status)) {
+    msg << "terminated by signal";
+  } else {
+    msg << "terminated by unknown mechanism";
+  }
+  return msg.str();
+}
+#else
+// Windows flavour: process handling is delegated to SubProcess.
+std::string run_cmd(const char *cmd, ...)
+{
+  SubProcess proc(cmd, SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE);
+
+  // forward every variadic argument up to the terminating NULL
+  va_list args;
+  va_start(args, cmd);
+  for (const char *arg = va_arg(args, const char*);
+       arg != NULL;
+       arg = va_arg(args, const char*)) {
+    proc.add_cmd_arg(arg);
+  }
+  va_end(args);
+
+  // join() is only attempted when spawn() reported success (0)
+  if (proc.spawn() == 0)
+    proc.join();
+  return proc.err();  // error text as reported by SubProcess::err()
+}
+#endif /* _WIN32 */
diff --git a/src/common/run_cmd.h b/src/common/run_cmd.h
new file mode 100644
index 000000000..9d82a6499
--- /dev/null
+++ b/src/common/run_cmd.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_RUN_CMD_H
+#define CEPH_COMMON_RUN_CMD_H
+
+#include <string>
+
+//
+// Fork a command and run it. The shell will not be invoked and shell
+// expansions will not be done.
+// This function takes a variable number of arguments. The last argument must
+// be NULL.
+//
+// Example:
+//   run_cmd("rm", "-rf", "foo", NULL)
+//
+// Returns an empty string on success, and an error string otherwise.
+//
+std::string run_cmd(const char *cmd, ...);
+
+#endif
diff --git a/src/common/safe_io.c b/src/common/safe_io.c
new file mode 100644
index 000000000..80921d0f2
--- /dev/null
+++ b/src/common/safe_io.c
@@ -0,0 +1,297 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/safe_io.h"
+#include "include/compat.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/socket.h>
+
+/* Read up to count bytes from fd into buf, retrying on EINTR and short
+ * reads. Returns the bytes read (less than count only at EOF) or -errno. */
+ssize_t safe_read(int fd, void *buf, size_t count)
+{
+	char *dst = (char *)buf;
+	size_t done = 0;
+
+	while (done < count) {
+		ssize_t r = read(fd, dst + done, count - done);
+		if (r > 0) {
+			done += r;
+			continue;
+		}
+		if (r == 0)
+			break;	/* EOF */
+		if (errno != EINTR)
+			return -errno;
+	}
+	return done;
+}
+
+#ifdef _WIN32
+// "read" doesn't work with Windows sockets, hence the recv() based loop.
+// Returns the bytes received (fewer than count only at EOF) or -error.
+ssize_t safe_recv(int fd, void *buf, size_t count)
+{
+  char *dst = (char *)buf;
+  size_t done = 0;
+
+  while (done < count) {
+    ssize_t r = recv(fd, (SOCKOPT_VAL_TYPE)(dst + done), count - done, 0);
+    if (r > 0) {
+      done += r;
+      continue;
+    }
+    if (r == 0)
+      break;  // EOF
+    int err = ceph_sock_errno();
+    if (err != EAGAIN && err != EINTR) {
+      return -err;
+    }
+    // would block or interrupted: try again
+  }
+  return done;
+}
+#else
+ssize_t safe_recv(int fd, void *buf, size_t count)
+{
+  // We'll use "safe_read" so that this can work with any type of
+  // file descriptor.
+  return safe_read(fd, buf, count);  // result is passed through unchanged
+}
+#endif /* _WIN32 */
+
+/* Like safe_read(), but a short read (EOF before count bytes) is -EDOM. */
+ssize_t safe_read_exact(int fd, void *buf, size_t count)
+{
+	const ssize_t got = safe_read(fd, buf, count);
+
+	if (got < 0)
+		return got;	/* propagate -errno */
+	return ((size_t)got == count) ? 0 : -EDOM;
+}
+
+// Like safe_recv(), but receiving fewer than count bytes yields -EDOM.
+ssize_t safe_recv_exact(int fd, void *buf, size_t count)
+{
+  const ssize_t got = safe_recv(fd, buf, count);
+
+  if (got < 0)
+    return got;  // propagate the error
+  return ((size_t)got == count) ? 0 : -EDOM;
+}
+
+ssize_t safe_write(int fd, const void *buf, size_t count)
+{
+	/* Loop until every byte is written; returns 0, or -errno on error. */
+	while (count > 0) {
+		ssize_t r = write(fd, buf, count);
+		if (r >= 0) {
+			buf = (const char *)buf + r;	/* short write: advance */
+			count -= r;
+		} else if (errno != EINTR) {
+			return -errno;
+		}
+	}
+	return 0;
+}
+
+#ifdef _WIN32
+ssize_t safe_send(int fd, const void *buf, size_t count)
+{
+  // Send everything; EINTR and EAGAIN are retried. Returns 0 or -error.
+  while (count > 0) {
+    ssize_t r = send(fd, (SOCKOPT_VAL_TYPE)buf, count, 0);
+    if (r >= 0) {
+      buf = (const char *)buf + r;
+      count -= r;
+    } else {
+      int err = ceph_sock_errno();
+      if (err != EINTR && err != EAGAIN)
+        return -err;
+    }
+  }
+  return 0;
+}
+#else
+ssize_t safe_send(int fd, const void *buf, size_t count)
+{
+  return safe_write(fd, buf, count);  // non-Windows build: plain write() loop, result passed through
+}
+#endif /* _WIN32 */
+
+/* Read up to count bytes starting at offset, retrying on EINTR and short
+ * reads. Returns the bytes read (less than count only at EOF) or -errno. */
+ssize_t safe_pread(int fd, void *buf, size_t count, off_t offset)
+{
+	char *dst = (char *)buf;
+	size_t done = 0;
+
+	while (done < count) {
+		ssize_t r = pread(fd, dst + done, count - done, offset + done);
+		if (r > 0) {
+			done += r;
+			continue;
+		}
+		if (r == 0)
+			break;	/* EOF */
+		if (errno != EINTR)
+			return -errno;
+		/* EINTR: retry the same range */
+	}
+	return done;
+}
+
+/* Like safe_pread(), but a short read (EOF before count bytes) is -EDOM. */
+ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset)
+{
+	const ssize_t got = safe_pread(fd, buf, count, offset);
+
+	if (got < 0)
+		return got;	/* propagate -errno */
+	return ((size_t)got == count) ? 0 : -EDOM;
+}
+
+ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+	/* Loop until every byte is written; returns 0, or -errno on error. */
+	while (count > 0) {
+		ssize_t r = pwrite(fd, buf, count, offset);
+		if (r >= 0) {
+			buf = (const char *)buf + r;	/* short write: advance */
+			offset += r;
+			count -= r;
+		} else if (errno != EINTR) {
+			return -errno;
+		}
+	}
+	return 0;
+}
+
+#ifdef CEPH_HAVE_SPLICE
+// Splice up to len bytes from fd_in to fd_out, retrying on EINTR.
+// Stops early at EOF or on EAGAIN.
+// Returns the number of bytes moved, or -errno on any other error.
+ssize_t safe_splice(int fd_in, off_t *off_in, int fd_out, off_t *off_out,
+		    size_t len, unsigned int flags)
+{
+  size_t moved = 0;
+
+  while (moved < len) {
+    ssize_t r = splice(fd_in, off_in, fd_out, off_out, len - moved, flags);
+    if (r > 0) {
+      moved += r;
+      continue;
+    }
+    if (r < 0 && errno == EINTR)
+      continue;
+    if (r < 0 && errno != EAGAIN)
+      return -errno;
+    break;  // EOF (r == 0) or EAGAIN: report what was moved so far
+  }
+  return moved;
+}
+
+// Like safe_splice(), but moving fewer than len bytes yields -EDOM.
+ssize_t safe_splice_exact(int fd_in, off_t *off_in, int fd_out,
+			  off_t *off_out, size_t len, unsigned int flags)
+{
+  const ssize_t moved = safe_splice(fd_in, off_in, fd_out, off_out, len, flags);
+
+  if (moved < 0)
+    return moved;  // propagate -errno
+  return ((size_t)moved == len) ? 0 : -EDOM;
+}
+#endif
+
+/*
+ * Replace <base>/<file> with val: write <file>.tmp, fsync it, rename it
+ * over the target and finally fsync the directory. Nothing is written if
+ * the file already holds exactly val. Returns 0 or a negative errno.
+ */
+int safe_write_file(const char *base, const char *file,
+		    const char *val, size_t vallen,
+		    unsigned mode)
+{
+  int ret, fd;
+  char fn[PATH_MAX];
+  char tmp[PATH_MAX];
+  char oldval[80];
+
+  // Does the file already have correct content? Only decidable when val is
+  // shorter than oldval: a full buffer may be the prefix of a longer file.
+  if (vallen < sizeof(oldval)) {
+    ret = safe_read_file(base, file, oldval, sizeof(oldval));
+    if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
+      return 0;  // yes.
+  }
+
+  // refuse to work on a silently truncated path
+  ret = snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base, file);
+  if (ret < 0 || (size_t)ret >= sizeof(tmp))
+    return -ENAMETOOLONG;
+  snprintf(fn, sizeof(fn), "%s/%s", base, file);  // shorter than tmp: fits
+
+  fd = open(tmp, O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, mode);
+  if (fd < 0)
+    return -errno;
+
+  ret = safe_write(fd, val, vallen);
+  if (ret == 0 && fsync(fd) < 0)
+    ret = -errno;
+  VOID_TEMP_FAILURE_RETRY(close(fd));
+  if (ret == 0 && rename(tmp, fn) < 0)
+    ret = -errno;
+  if (ret != 0) {
+    unlink(tmp);  // also after a failed write: leave no stale temporary
+    return ret;
+  }
+
+  // sync the containing directory, presumably to make the rename durable
+  fd = open(base, O_RDONLY|O_BINARY);
+  if (fd < 0)
+    return -errno;
+  ret = fsync(fd);
+  if (ret < 0) ret = -errno;
+  VOID_TEMP_FAILURE_RETRY(close(fd));
+
+  return ret;
+}
+
+/* Read at most vallen bytes from the start of <base>/<file> into val.
+ * Returns the number of bytes read, or a negative errno if the file could
+ * not be opened or read. val is not NUL-terminated by this function. */
+int safe_read_file(const char *base, const char *file,
+		   char *val, size_t vallen)
+{
+  char path[PATH_MAX];
+  int fd, nread;
+
+  snprintf(path, sizeof(path), "%s/%s", base, file);
+  fd = open(path, O_RDONLY|O_BINARY);
+  if (fd < 0)
+    return -errno;
+
+  // nread is either the byte count or -errno; the descriptor is closed
+  // in both cases and the value handed back unchanged
+  nread = safe_read(fd, val, vallen);
+  // close sometimes returns errors, but only after write()
+  VOID_TEMP_FAILURE_RETRY(close(fd));
+  return nread;
+}
diff --git a/src/common/safe_io.h b/src/common/safe_io.h
new file mode 100644
index 000000000..6eb25d52c
--- /dev/null
+++ b/src/common/safe_io.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SAFE_IO
+#define CEPH_SAFE_IO
+
+#include "common/compiler_extensions.h"
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /*
+   * Safe functions wrapping the raw read() and write() libc functions.
+   * These retry on EINTR and, on error, return -errno (instead of returning
+   * -1 and setting errno).
+   *
+   * On Windows, only recv/send work with sockets.
+   */
+  ssize_t safe_read(int fd, void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_write(int fd, const void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_recv(int fd, void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_send(int fd, const void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_pread(int fd, void *buf, size_t count, off_t offset)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset)
+      WARN_UNUSED_RESULT;
+#ifdef CEPH_HAVE_SPLICE
+  /*
+   * Similar to the above (non-exact version) and below (exact version).
+   * See splice(2) for parameter descriptions.
+   */
+  ssize_t safe_splice(int fd_in, off_t *off_in, int fd_out, off_t *off_out,
+		      size_t len, unsigned int flags)
+    WARN_UNUSED_RESULT;
+  ssize_t safe_splice_exact(int fd_in, off_t *off_in, int fd_out,
+			    off_t *off_out, size_t len, unsigned int flags)
+    WARN_UNUSED_RESULT;
+#endif
+
+  /*
+   * Same as the above functions, but return -EDOM unless exactly the requested
+   * number of bytes can be read.
+   */
+  ssize_t safe_read_exact(int fd, void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_recv_exact(int fd, void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset)
+      WARN_UNUSED_RESULT;
+
+
+  /*
+   * Safe functions to read and write an entire file.
+   */
+  int safe_write_file(const char *base, const char *file,
+		      const char *val, size_t vallen,
+		      unsigned mode);
+  int safe_read_file(const char *base, const char *file,
+		     char *val, size_t vallen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/scrub_types.cc b/src/common/scrub_types.cc
new file mode 100644
index 000000000..9168ee0a2
--- /dev/null
+++ b/src/common/scrub_types.cc
@@ -0,0 +1,292 @@
+#include "scrub_types.h"
+
+using std::map;
+
+using namespace librados;
+
+void object_id_wrapper::encode(bufferlist& bl) const
+{  // appends name, nspace, locator and snap to bl, in that order
+  ENCODE_START(1, 1, bl);
+  encode(name, bl);
+  encode(nspace, bl);
+  encode(locator, bl);
+  encode(snap, bl);
+  ENCODE_FINISH(bl);
+}
+
+void object_id_wrapper::decode(bufferlist::const_iterator& bp)
+{  // reads the fields back in the order encode() wrote them
+  DECODE_START(1, bp);
+  decode(name, bp);
+  decode(nspace, bp);
+  decode(locator, bp);
+  decode(snap, bp);
+  DECODE_FINISH(bp);
+}
+
+namespace librados {
+static void encode(const object_id_t& obj, bufferlist& bl)  // file-local overload for plain object_id_t values
+{
+  reinterpret_cast<const object_id_wrapper&>(obj).encode(bl);  // view obj as the wrapper to reuse its encode()
+}
+}
+
+void osd_shard_wrapper::encode(bufferlist& bl) const
+{  // appends osd, then shard, to bl
+  ENCODE_START(1, 1, bl);
+  encode(osd, bl);
+  encode(shard, bl);
+  ENCODE_FINISH(bl);
+}
+
+void osd_shard_wrapper::decode(bufferlist::const_iterator& bp)
+{  // reads osd, then shard, matching encode()
+  DECODE_START(1, bp);
+  decode(osd, bp);
+  decode(shard, bp);
+  DECODE_FINISH(bp);
+}
+
+namespace librados {
+static void encode(const osd_shard_t& shard, bufferlist& bl) {  // file-local overload for plain osd_shard_t values
+  reinterpret_cast<const osd_shard_wrapper&>(shard).encode(bl);  // view shard as the wrapper to reuse its encode()
+}
+}
+
+void shard_info_wrapper::set_object(const ScrubMap::object& object)
+{  // copies attrs, size and the digests flagged present from a scrub-map object
+  for (const auto& [name, value] : object.attrs) {  // by reference: no per-attribute pair copy
+    bufferlist bl;
+    bl.push_back(value);
+    attrs.emplace(name, std::move(bl));  // like insert(): an existing key is kept
+  }
+  size = object.size;
+  if (object.omap_digest_present) {
+    omap_digest_present = true;
+    omap_digest = object.omap_digest;
+  }
+  if (object.digest_present) {
+    data_digest_present = true;
+    data_digest = object.digest;
+  }
+}
+
+void shard_info_wrapper::encode(bufferlist& bl) const
+{  // errors and primary are always written; the rest only for a present shard
+  ENCODE_START(3, 3, bl);
+  encode(errors, bl);
+  encode(primary, bl);
+  if (!has_shard_missing()) {  // nothing else is encoded for a missing shard
+    encode(attrs, bl);
+    encode(size, bl);
+    encode(omap_digest_present, bl);
+    encode(omap_digest, bl);
+    encode(data_digest_present, bl);
+    encode(data_digest, bl);
+    encode(selected_oi, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+void shard_info_wrapper::decode(bufferlist::const_iterator& bp)
+{  // mirrors encode(): the optional part is read only for a present shard
+  DECODE_START(3, bp);
+  decode(errors, bp);
+  decode(primary, bp);
+  if (!has_shard_missing()) {  // evaluated after errors has been decoded just above
+    decode(attrs, bp);
+    decode(size, bp);
+    decode(omap_digest_present, bp);
+    decode(omap_digest, bp);
+    decode(data_digest_present, bp);
+    decode(data_digest, bp);
+    decode(selected_oi, bp);
+  }
+  DECODE_FINISH(bp);
+}
+
+inconsistent_obj_wrapper::inconsistent_obj_wrapper(const hobject_t& hoid)  // identify the object by name, namespace, key and snap of hoid
+  : inconsistent_obj_t{librados::object_id_t{hoid.oid.name,
+                                 hoid.nspace,
+                                 hoid.get_key(), hoid.snap}}
+{}
+
+void inconsistent_obj_wrapper::add_shard(const pg_shard_t& pgs,
+                                         const shard_info_wrapper& shard)
+{
+  union_shards.errors |= shard.errors;  // accumulate this shard's error bits into the union
+  shards.emplace(osd_shard_t{pgs.osd, int8_t(pgs.shard)}, shard);  // emplace: an existing entry for the same key is kept
+}
+
+void
+inconsistent_obj_wrapper::set_auth_missing(const hobject_t& hoid,
+                                           const map<pg_shard_t, ScrubMap>& maps,
+					   map<pg_shard_t, shard_info_wrapper> &shard_map,
+					   int &shallow_errors, int &deep_errors,
+					   const pg_shard_t &primary)
+{  // records, per shard in maps, whether hoid is present, and tallies the error counters
+  for (const auto& [pgs, smap] : maps) {  // by reference: by value this deep-copied a whole ScrubMap per shard
+    shard_info_wrapper& info = shard_map[pgs];  // single lookup; inserts a default entry if absent
+    info.primary = (pgs == primary);
+    if (auto found = smap.objects.find(hoid); found != smap.objects.end())
+      info.set_object(found->second);
+    else
+      info.set_missing();
+    if (info.has_deep_errors())
+      ++deep_errors;
+    else if (info.has_shallow_errors())
+      ++shallow_errors;
+    union_shards.errors |= info.errors;
+    shards.emplace(osd_shard_t{pgs.osd, pgs.shard}, info);
+  }
+}
+
+namespace librados {
+  static void encode(const shard_info_t& shard, bufferlist& bl)  // file-local overload for plain shard_info_t values
+  {
+    reinterpret_cast<const shard_info_wrapper&>(shard).encode(bl);  // view shard as the wrapper to reuse its encode()
+  }
+}
+
+void inconsistent_obj_wrapper::encode(bufferlist& bl) const
+{  // appends errors, object, version, shards and the union of shard errors, in that order
+  ENCODE_START(2, 2, bl);
+  encode(errors, bl);
+  encode(object, bl);
+  encode(version, bl);
+  encode(shards, bl);
+  encode(union_shards.errors, bl);  // only the error bits of union_shards are stored
+  ENCODE_FINISH(bl);
+}
+
+void inconsistent_obj_wrapper::decode(bufferlist::const_iterator& bp)
+{  // reads the fields back in the order encode() wrote them
+  DECODE_START(2, bp);
+  DECODE_OLDEST(2);  // presumably rejects encodings older than version 2 -- confirm against the macro
+  decode(errors, bp);
+  decode(object, bp);
+  decode(version, bp);
+  decode(shards, bp);
+  decode(union_shards.errors, bp);
+  DECODE_FINISH(bp);
+}
+
+inconsistent_snapset_wrapper::inconsistent_snapset_wrapper(const hobject_t& hoid)  // identify the object by name, namespace, key and snap of hoid
+  : inconsistent_snapset_t{object_id_t{hoid.oid.name,
+                                       hoid.nspace,
+                                       hoid.get_key(),
+                                       hoid.snap}}
+{}
+
+using inc_snapset_t = inconsistent_snapset_t;
+
+void inconsistent_snapset_wrapper::set_headless()
+{
+  errors |= inc_snapset_t::HEADLESS_CLONE;  // set the flag; other error bits are left untouched
+}
+
+void inconsistent_snapset_wrapper::set_snapset_missing()
+{
+  errors |= inc_snapset_t::SNAPSET_MISSING;  // set the flag; other error bits are left untouched
+}
+
+void inconsistent_snapset_wrapper::set_info_missing()
+{
+  errors |= inc_snapset_t::INFO_MISSING;  // set the flag; other error bits are left untouched
+}
+
+void inconsistent_snapset_wrapper::set_snapset_corrupted()
+{
+  errors |= inc_snapset_t::SNAPSET_CORRUPTED;  // set the flag; other error bits are left untouched
+}
+
+void inconsistent_snapset_wrapper::set_info_corrupted()
+{
+  errors |= inc_snapset_t::INFO_CORRUPTED;  // set the flag; other error bits are left untouched
+}
+
+void inconsistent_snapset_wrapper::set_clone_missing(snapid_t snap)
+{
+  errors |= inc_snapset_t::CLONE_MISSING;  // flag the error ...
+  missing.push_back(snap);  // ... and remember which snap it concerns (no de-duplication)
+}
+
+void inconsistent_snapset_wrapper::set_clone(snapid_t snap)
+{
+  errors |= inc_snapset_t::EXTRA_CLONES;  // flag the error ...
+  clones.push_back(snap);  // ... and remember which snap it concerns (no de-duplication)
+}
+
+void inconsistent_snapset_wrapper::set_snapset_error()
+{
+  errors |= inc_snapset_t::SNAP_ERROR;  // set the flag; other error bits are left untouched
+}
+
+void inconsistent_snapset_wrapper::set_size_mismatch()
+{
+  errors |= inc_snapset_t::SIZE_MISMATCH;  // set the flag; other error bits are left untouched
+}
+
+void inconsistent_snapset_wrapper::encode(bufferlist& bl) const
+{  // appends errors, object, clones, missing and ss_bl, in that order
+  ENCODE_START(2, 1, bl);
+  encode(errors, bl);
+  encode(object, bl);
+  encode(clones, bl);
+  encode(missing, bl);
+  encode(ss_bl, bl);  // decode() reads this field only when struct_v >= 2
+  ENCODE_FINISH(bl);
+}
+
+void inconsistent_snapset_wrapper::decode(bufferlist::const_iterator& bp)
+{  // reads the fields back in the order encode() wrote them
+  DECODE_START(2, bp);
+  decode(errors, bp);
+  decode(object, bp);
+  decode(clones, bp);
+  decode(missing, bp);
+  if (struct_v >= 2) {  // ss_bl is left untouched for encodings with a lower struct_v
+    decode(ss_bl, bp);
+  }
+  DECODE_FINISH(bp);
+}
+
+void scrub_ls_arg_t::encode(bufferlist& bl) const
+{  // appends interval, get_snapsets, three start_after fields and max_return
+  ENCODE_START(1, 1, bl);
+  encode(interval, bl);
+  encode(get_snapsets, bl);
+  encode(start_after.name, bl);  // NOTE(review): only name, nspace and snap of start_after are encoded -- confirm no other field is needed
+  encode(start_after.nspace, bl);
+  encode(start_after.snap, bl);
+  encode(max_return, bl);
+  ENCODE_FINISH(bl);
+}
+
+void scrub_ls_arg_t::decode(bufferlist::const_iterator& bp)
+{  // reads the fields back in the order encode() wrote them
+  DECODE_START(1, bp);
+  decode(interval, bp);
+  decode(get_snapsets, bp);
+  decode(start_after.name, bp);  // start_after is filled member by member, as in encode()
+  decode(start_after.nspace, bp);
+  decode(start_after.snap, bp);
+  decode(max_return, bp);
+  DECODE_FINISH(bp);
+}
+
+void scrub_ls_result_t::encode(bufferlist& bl) const
+{  // appends interval, then vals, to bl
+  ENCODE_START(1, 1, bl);
+  encode(interval, bl);
+  encode(vals, bl);
+  ENCODE_FINISH(bl);
+}
+
+void scrub_ls_result_t::decode(bufferlist::const_iterator& bp)
+{  // reads interval, then vals, matching encode()
+  DECODE_START(1, bp);
+  decode(interval, bp);
+  decode(vals, bp);
+  DECODE_FINISH(bp);
+}
diff --git a/src/common/scrub_types.h b/src/common/scrub_types.h
new file mode 100644
index 000000000..0394eddd7
--- /dev/null
+++ b/src/common/scrub_types.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_SCRUB_TYPES_H
+#define CEPH_SCRUB_TYPES_H
+
+#include "osd/osd_types.h"
+
+// wrappers around scrub types to offer the necessary bits other than
+// the minimal set that librados requires
+struct object_id_wrapper : public librados::object_id_t {  // object_id_t plus encode/decode; declares no data members
+  explicit object_id_wrapper(const hobject_t& hoid)  // fills the base from hoid's name, nspace, key and snap
+    : object_id_t{hoid.oid.name, hoid.nspace, hoid.get_key(), hoid.snap}
+  {}
+  void encode(ceph::buffer::list& bl) const;  // declared only; body is outside this header
+  void decode(ceph::buffer::list::const_iterator& bl);  // declared only; body is outside this header
+};
+
+WRITE_CLASS_ENCODER(object_id_wrapper)
+
+namespace librados {  // overload is placed in the namespace of object_id_t
+inline void decode(object_id_t& obj, ceph::buffer::list::const_iterator& bp) {  // decode a plain object_id_t in place
+  reinterpret_cast<object_id_wrapper&>(obj).decode(bp);  // reuses the wrapper's decode(); the wrapper declares no data members of its own
+}
+}
+
+struct osd_shard_wrapper : public librados::osd_shard_t {  // osd_shard_t plus encode/decode; declares no data members
+  void encode(ceph::buffer::list& bl) const;  // declared only; body is outside this header
+  void decode(ceph::buffer::list::const_iterator& bp);  // declared only; body is outside this header
+};
+
+WRITE_CLASS_ENCODER(osd_shard_wrapper)
+
+namespace librados {  // overload is placed in the namespace of osd_shard_t
+  inline void decode(librados::osd_shard_t& shard, ceph::buffer::list::const_iterator& bp) {  // decode a plain osd_shard_t in place
+    reinterpret_cast<osd_shard_wrapper&>(shard).decode(bp);  // reuses the wrapper's decode(); the wrapper declares no data members of its own
+  }
+}
+
+struct shard_info_wrapper : public librados::shard_info_t {  // shard_info_t plus error-flag helpers and encode/decode
+public:
+  shard_info_wrapper() = default;
+  explicit shard_info_wrapper(const ScrubMap::object& object) {
+    set_object(object);  // the constructor body does nothing else
+  }
+  void set_object(const ScrubMap::object& object);  // declared only; body is outside this header
+  void set_missing() {  // every set_*() below ORs exactly one err_t bit into errors
+    errors |= err_t::SHARD_MISSING;
+  }
+  void set_omap_digest_mismatch_info() {
+    errors |= err_t::OMAP_DIGEST_MISMATCH_INFO;
+  }
+  void set_size_mismatch_info() {
+    errors |= err_t::SIZE_MISMATCH_INFO;
+  }
+  void set_data_digest_mismatch_info() {
+    errors |= err_t::DATA_DIGEST_MISMATCH_INFO;
+  }
+  void set_read_error() {
+    errors |= err_t::SHARD_READ_ERR;
+  }
+  void set_stat_error() {
+    errors |= err_t::SHARD_STAT_ERR;
+  }
+  void set_ec_hash_mismatch() {
+    errors |= err_t::SHARD_EC_HASH_MISMATCH;
+  }
+  void set_ec_size_mismatch() {
+    errors |= err_t::SHARD_EC_SIZE_MISMATCH;
+  }
+  void set_info_missing() {
+    errors |= err_t::INFO_MISSING;
+  }
+  void set_info_corrupted() {
+    errors |= err_t::INFO_CORRUPTED;
+  }
+  void set_snapset_missing() {
+    errors |= err_t::SNAPSET_MISSING;
+  }
+  void set_snapset_corrupted() {
+    errors |= err_t::SNAPSET_CORRUPTED;
+  }
+  void set_obj_size_info_mismatch() {
+    errors |= err_t::OBJ_SIZE_INFO_MISMATCH;
+  }
+  void set_hinfo_missing() {
+    errors |= err_t::HINFO_MISSING;
+  }
+  void set_hinfo_corrupted() {
+    errors |= err_t::HINFO_CORRUPTED;
+  }
+  bool only_data_digest_mismatch_info() const {  // true only when that is the sole bit set
+    return errors == err_t::DATA_DIGEST_MISMATCH_INFO;  // equality, not a bit test: any other bit makes this false
+  }
+  void clear_data_digest_mismatch_info() {  // the only helper here that clears a bit
+    errors &= ~err_t::DATA_DIGEST_MISMATCH_INFO;  // masks out just this bit; others are kept
+  }
+  void encode(ceph::buffer::list& bl) const;  // declared only; body is outside this header
+  void decode(ceph::buffer::list::const_iterator& bp);  // declared only; body is outside this header
+};
+
+WRITE_CLASS_ENCODER(shard_info_wrapper)
+
+namespace librados {  // overload is placed in the namespace of shard_info_t
+  inline void decode(librados::shard_info_t& shard,  // decode a plain shard_info_t in place
+		     ceph::buffer::list::const_iterator& bp) {
+    reinterpret_cast<shard_info_wrapper&>(shard).decode(bp);  // reuses the wrapper's decode(); the wrapper declares no data members of its own
+  }
+}
+
+struct inconsistent_obj_wrapper : librados::inconsistent_obj_t {  // inconsistent_obj_t plus error-flag helpers and encode/decode
+  explicit inconsistent_obj_wrapper(const hobject_t& hoid);  // declared only; body is outside this header
+
+  void set_object_info_inconsistency() {  // every inline set_*() below ORs exactly one obj_err_t bit into errors
+    errors |= obj_err_t::OBJECT_INFO_INCONSISTENCY;
+  }
+  void set_omap_digest_mismatch() {
+    errors |= obj_err_t::OMAP_DIGEST_MISMATCH;
+  }
+  void set_data_digest_mismatch() {
+    errors |= obj_err_t::DATA_DIGEST_MISMATCH;
+  }
+  void set_size_mismatch() {
+    errors |= obj_err_t::SIZE_MISMATCH;
+  }
+  void set_attr_value_mismatch() {
+    errors |= obj_err_t::ATTR_VALUE_MISMATCH;
+  }
+  void set_attr_name_mismatch() {
+    errors |= obj_err_t::ATTR_NAME_MISMATCH;
+  }
+  void set_snapset_inconsistency() {
+    errors |= obj_err_t::SNAPSET_INCONSISTENCY;
+  }
+  void set_hinfo_inconsistency() {
+    errors |= obj_err_t::HINFO_INCONSISTENCY;
+  }
+  void set_size_too_large() {
+    errors |= obj_err_t::SIZE_TOO_LARGE;
+  }
+  void add_shard(const pg_shard_t& pgs, const shard_info_wrapper& shard);  // declared only; body is outside this header
+  void set_auth_missing(const hobject_t& hoid,  // declared only; the two int& counters are in/out parameters
+                        const std::map<pg_shard_t, ScrubMap>&,
+			std::map<pg_shard_t, shard_info_wrapper>&,
+			int &shallow_errors, int &deep_errors,
+			const pg_shard_t &primary);
+  void set_version(uint64_t ver) { version = ver; }  // plain assignment to the inherited version member
+  void encode(ceph::buffer::list& bl) const;  // declared only; body is outside this header
+  void decode(ceph::buffer::list::const_iterator& bp);  // declared only; body is outside this header
+};
+
+WRITE_CLASS_ENCODER(inconsistent_obj_wrapper)
+
+inline void decode(librados::inconsistent_obj_t& obj,  // NOTE(review): global scope, unlike the sibling overloads in namespace librados -- confirm intentional
+		   ceph::buffer::list::const_iterator& bp) {
+  reinterpret_cast<inconsistent_obj_wrapper&>(obj).decode(bp);  // reuses the wrapper's decode(); the wrapper declares no data members of its own
+}
+
+struct inconsistent_snapset_wrapper : public librados::inconsistent_snapset_t {  // inconsistent_snapset_t plus error-flag helpers and encode/decode
+  inconsistent_snapset_wrapper() = default;
+  explicit inconsistent_snapset_wrapper(const hobject_t& head);  // declared only; body is outside this header
+  void set_headless();
+  // soid claims that it is a head or a snapdir, but its SS_ATTR
+  // is missing.
+  void set_snapset_missing();
+  void set_info_missing();
+  void set_snapset_corrupted();
+  void set_info_corrupted();
+  // the snapset refers to a clone that is missing
+  void set_clone_missing(snapid_t);
+  // a clone that is present; flags EXTRA_CLONES and stores the snap id in clones
+  void set_clone(snapid_t);
+  // the snapset is not consistent with itself (flags SNAP_ERROR)
+  void set_snapset_error();
+  void set_size_mismatch();  // flags SIZE_MISMATCH
+
+  void encode(ceph::buffer::list& bl) const;  // writes errors, object, clones, missing, ss_bl
+  void decode(ceph::buffer::list::const_iterator& bp);  // ss_bl is read only from encodings with struct_v >= 2
+};
+
+WRITE_CLASS_ENCODER(inconsistent_snapset_wrapper)
+
+namespace librados {  // overload is placed in the namespace of inconsistent_snapset_t
+  inline void decode(librados::inconsistent_snapset_t& snapset,  // decode a plain inconsistent_snapset_t in place
+		     ceph::buffer::list::const_iterator& bp) {
+    reinterpret_cast<inconsistent_snapset_wrapper&>(snapset).decode(bp);  // reuses the wrapper's decode(); the wrapper declares no data members of its own
+  }
+}
+
+struct scrub_ls_arg_t {  // encodable argument record; scalars now default to 0 so a default-constructed object never encodes indeterminate values
+  uint32_t interval = 0;  // written first by encode()
+  uint32_t get_snapsets = 0;  // NOTE(review): presumably a boolean selector -- confirm against callers
+  librados::object_id_t start_after;  // encode() sends only its name, nspace and snap members
+  uint64_t max_return = 0;  // written last by encode()
+  void encode(ceph::buffer::list& bl) const;  // declared only; body is outside this header
+  void decode(ceph::buffer::list::const_iterator& bl);  // reads the fields in encode() order
+};
+
+WRITE_CLASS_ENCODER(scrub_ls_arg_t);
+
+struct scrub_ls_result_t {  // encodable result record: an interval plus a list of opaque buffers
+  epoch_t interval{};  // value-initialized so a default-constructed result never encodes an indeterminate value
+  std::vector<ceph::buffer::list> vals;  // encoded after interval
+  void encode(ceph::buffer::list& bl) const;  // declared only; body is outside this header
+  void decode(ceph::buffer::list::const_iterator& bl);  // reads interval, then vals
+};
+
+WRITE_CLASS_ENCODER(scrub_ls_result_t);
+
+#endif
diff --git a/src/common/sctp_crc32.c b/src/common/sctp_crc32.c
new file mode 100644
index 000000000..964216d7f
--- /dev/null
+++ b/src/common/sctp_crc32.c
@@ -0,0 +1,789 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_crc32.c,v 1.12 2005/03/06 16:04:17 itojun Exp $	 */
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/cdefs.h>
+#endif
+#if 0
+__FBSDID("$FreeBSD: src/sys/netinet/sctp_crc32.c,v 1.8 2007/05/08 17:01:10 rrs Exp $");
+
+
+#include <netinet/sctp_os.h>
+#include <netinet/sctp_crc32.h>
+#endif
+
+#include <stdint.h>
+
+#include "acconfig.h"
+
+#ifndef SCTP_USE_ADLER32
+
+
+/**
+ *
+ * Routine Description:
+ *
+ * Computes the CRC32c checksum for the specified buffer using the slicing by 8
+ * algorithm over 64 bit quantities.
+ *
+ * Arguments:
+ *
+ *		p_running_crc - pointer to the initial or final remainder value
+ *				used in CRC computations. It should be set to
+ *				non-NULL if the mode argument is equal to CONT or END
+ *		p_buf - the packet buffer where crc computations are being performed
+ *		length - the length of p_buf in bytes
+ *		init_bytes - the number of initial bytes that need to be processed before
+ *					 aligning p_buf to multiples of 4 bytes
+ *		mode - can be any of the following: BEGIN, CONT, END, BODY, ALIGN
+ *
+ * Return value:
+ *
+ *		The computed CRC32c value
+ */
+
+
+/*
+ * Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved
+ *
+ *
+ * This software program is licensed subject to the BSD License, available at
+ * http://www.opensource.org/licenses/bsd-license.html.
+ *
+ * Abstract:
+ *
+ * Tables for software CRC generation
+ */
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = 8 slices; Slice Lengths = 8 8 8 8 8 8 8 8
+ * Directory Name = .\ ; File Name = 8x256_tables.c
+ */
+
+uint32_t sctp_crc_tableil8_o32[256] =
+{
+	0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+	0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+	0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+	0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+	0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+	0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+	0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+	0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+	0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+	0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+	0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+	0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+	0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+	0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+	0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+	0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+	0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+	0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+	0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+	0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+	0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+	0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+	0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+	0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+	0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+	0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+	0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+	0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+	0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+	0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+	0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+	0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351
+};
+
+/*
+ * end of the CRC lookup table sctp_crc_tableil8_o32
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = 8 slices; Slice Lengths = 8 8 8 8 8 8 8 8
+ * Directory Name = .\ ; File Name = 8x256_tables.c
+ */
+
+uint32_t sctp_crc_tableil8_o40[256] =
+{
+	0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945,
+	0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD,
+	0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, 0x714F905D, 0x62ED082A, 0x560AA0B3, 0x45A838C4,
+	0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C,
+	0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B, 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47,
+	0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823, 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF,
+	0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6,
+	0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, 0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E,
+	0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D, 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41,
+	0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9,
+	0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0,
+	0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4, 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78,
+	0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F, 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43,
+	0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB,
+	0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2,
+	0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A,
+	0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC,
+	0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004,
+	0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1, 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D,
+	0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059, 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185,
+	0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162, 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE,
+	0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306,
+	0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F,
+	0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287,
+	0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8,
+	0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC, 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600,
+	0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5, 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439,
+	0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D, 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781,
+	0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA,
+	0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502,
+	0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B,
+	0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483
+};
+
+/*
+ * end of the CRC lookup table sctp_crc_tableil8_o40
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = 8 slices; Slice Lengths = 8 8 8 8 8 8 8 8
+ * Directory Name = .\ ; File Name = 8x256_tables.c
+ */
+
+uint32_t sctp_crc_tableil8_o48[256] =
+{
+	0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469,
+	0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC,
+	0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3,
+	0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726,
+	0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67, 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D,
+	0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2, 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8,
+	0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED, 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7,
+	0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32,
+	0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0,
+	0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75,
+	0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A,
+	0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5, 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF,
+	0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE, 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4,
+	0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B, 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161,
+	0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E,
+	0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB,
+	0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A,
+	0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF,
+	0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA, 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0,
+	0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F, 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065,
+	0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24, 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E,
+	0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB,
+	0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4,
+	0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71,
+	0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3,
+	0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C, 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36,
+	0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63, 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79,
+	0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC,
+	0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 0xD9F668E7,
+	0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622,
+	0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D,
+	0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8
+};
+
+/*
+ * end of the CRC lookup table sctp_crc_tableil8_o48
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = 8 slices; Slice Lengths = 8 8 8 8 8 8 8 8
+ * Directory Name = .\ ; File Name = 8x256_tables.c
+ */
+
+uint32_t sctp_crc_tableil8_o56[256] =
+{
+	0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA,
+	0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C,
+	0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7,
+	0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11,
+	0xD725148B, 0x0A60BE33, 0x6842370A, 0xB5079DB2, 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41,
+	0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54, 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7,
+	0x3E41A5B6, 0xE3040F0E, 0x81268637, 0x5C632C8F, 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C,
+	0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A,
+	0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D,
+	0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538, 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB,
+	0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610,
+	0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405, 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6,
+	0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255, 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6,
+	0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3, 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040,
+	0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B,
+	0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D,
+	0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5,
+	0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213,
+	0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B, 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8,
+	0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD, 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E,
+	0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E,
+	0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698,
+	0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443,
+	0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5,
+	0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12,
+	0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07, 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4,
+	0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC, 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F,
+	0xE6264403, 0x3B63EEBB, 0x59416782, 0x8404CD3A, 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9,
+	0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A, 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99,
+	0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F,
+	0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4,
+	0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842
+};
+
+/*
+ * end of the CRC lookup table sctp_crc_tableil8_o56
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = 8 slices; Slice Lengths = 8 8 8 8 8 8 8 8
+ * Directory Name = .\ ; File Name = 8x256_tables.c
+ */
+
+uint32_t sctp_crc_tableil8_o64[256] =
+{
+	0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, 0xE045BEB0, 0xD854D11C, 0x906761E8, 0xA8760E44,
+	0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, 0x2522B521, 0x1D33DA8D, 0x55006A79, 0x6D1105D5,
+	0x8F2261D3, 0xB7330E7F, 0xFF00BE8B, 0xC711D127, 0x6F67DF63, 0x5776B0CF, 0x1F45003B, 0x27546F97,
+	0x4A456A42, 0x725405EE, 0x3A67B51A, 0x0276DAB6, 0xAA00D4F2, 0x9211BB5E, 0xDA220BAA, 0xE2336406,
+	0x1BA8B557, 0x23B9DAFB, 0x6B8A6A0F, 0x539B05A3, 0xFBED0BE7, 0xC3FC644B, 0x8BCFD4BF, 0xB3DEBB13,
+	0xDECFBEC6, 0xE6DED16A, 0xAEED619E, 0x96FC0E32, 0x3E8A0076, 0x069B6FDA, 0x4EA8DF2E, 0x76B9B082,
+	0x948AD484, 0xAC9BBB28, 0xE4A80BDC, 0xDCB96470, 0x74CF6A34, 0x4CDE0598, 0x04EDB56C, 0x3CFCDAC0,
+	0x51EDDF15, 0x69FCB0B9, 0x21CF004D, 0x19DE6FE1, 0xB1A861A5, 0x89B90E09, 0xC18ABEFD, 0xF99BD151,
+	0x37516AAE, 0x0F400502, 0x4773B5F6, 0x7F62DA5A, 0xD714D41E, 0xEF05BBB2, 0xA7360B46, 0x9F2764EA,
+	0xF236613F, 0xCA270E93, 0x8214BE67, 0xBA05D1CB, 0x1273DF8F, 0x2A62B023, 0x625100D7, 0x5A406F7B,
+	0xB8730B7D, 0x806264D1, 0xC851D425, 0xF040BB89, 0x5836B5CD, 0x6027DA61, 0x28146A95, 0x10050539,
+	0x7D1400EC, 0x45056F40, 0x0D36DFB4, 0x3527B018, 0x9D51BE5C, 0xA540D1F0, 0xED736104, 0xD5620EA8,
+	0x2CF9DFF9, 0x14E8B055, 0x5CDB00A1, 0x64CA6F0D, 0xCCBC6149, 0xF4AD0EE5, 0xBC9EBE11, 0x848FD1BD,
+	0xE99ED468, 0xD18FBBC4, 0x99BC0B30, 0xA1AD649C, 0x09DB6AD8, 0x31CA0574, 0x79F9B580, 0x41E8DA2C,
+	0xA3DBBE2A, 0x9BCAD186, 0xD3F96172, 0xEBE80EDE, 0x439E009A, 0x7B8F6F36, 0x33BCDFC2, 0x0BADB06E,
+	0x66BCB5BB, 0x5EADDA17, 0x169E6AE3, 0x2E8F054F, 0x86F90B0B, 0xBEE864A7, 0xF6DBD453, 0xCECABBFF,
+	0x6EA2D55C, 0x56B3BAF0, 0x1E800A04, 0x269165A8, 0x8EE76BEC, 0xB6F60440, 0xFEC5B4B4, 0xC6D4DB18,
+	0xABC5DECD, 0x93D4B161, 0xDBE70195, 0xE3F66E39, 0x4B80607D, 0x73910FD1, 0x3BA2BF25, 0x03B3D089,
+	0xE180B48F, 0xD991DB23, 0x91A26BD7, 0xA9B3047B, 0x01C50A3F, 0x39D46593, 0x71E7D567, 0x49F6BACB,
+	0x24E7BF1E, 0x1CF6D0B2, 0x54C56046, 0x6CD40FEA, 0xC4A201AE, 0xFCB36E02, 0xB480DEF6, 0x8C91B15A,
+	0x750A600B, 0x4D1B0FA7, 0x0528BF53, 0x3D39D0FF, 0x954FDEBB, 0xAD5EB117, 0xE56D01E3, 0xDD7C6E4F,
+	0xB06D6B9A, 0x887C0436, 0xC04FB4C2, 0xF85EDB6E, 0x5028D52A, 0x6839BA86, 0x200A0A72, 0x181B65DE,
+	0xFA2801D8, 0xC2396E74, 0x8A0ADE80, 0xB21BB12C, 0x1A6DBF68, 0x227CD0C4, 0x6A4F6030, 0x525E0F9C,
+	0x3F4F0A49, 0x075E65E5, 0x4F6DD511, 0x777CBABD, 0xDF0AB4F9, 0xE71BDB55, 0xAF286BA1, 0x9739040D,
+	0x59F3BFF2, 0x61E2D05E, 0x29D160AA, 0x11C00F06, 0xB9B60142, 0x81A76EEE, 0xC994DE1A, 0xF185B1B6,
+	0x9C94B463, 0xA485DBCF, 0xECB66B3B, 0xD4A70497, 0x7CD10AD3, 0x44C0657F, 0x0CF3D58B, 0x34E2BA27,
+	0xD6D1DE21, 0xEEC0B18D, 0xA6F30179, 0x9EE26ED5, 0x36946091, 0x0E850F3D, 0x46B6BFC9, 0x7EA7D065,
+	0x13B6D5B0, 0x2BA7BA1C, 0x63940AE8, 0x5B856544, 0xF3F36B00, 0xCBE204AC, 0x83D1B458, 0xBBC0DBF4,
+	0x425B0AA5, 0x7A4A6509, 0x3279D5FD, 0x0A68BA51, 0xA21EB415, 0x9A0FDBB9, 0xD23C6B4D, 0xEA2D04E1,
+	0x873C0134, 0xBF2D6E98, 0xF71EDE6C, 0xCF0FB1C0, 0x6779BF84, 0x5F68D028, 0x175B60DC, 0x2F4A0F70,
+	0xCD796B76, 0xF56804DA, 0xBD5BB42E, 0x854ADB82, 0x2D3CD5C6, 0x152DBA6A, 0x5D1E0A9E, 0x650F6532,
+	0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3
+};
+
+/*
+ * end of the CRC lookup table sctp_crc_tableil8_o64
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = 8 slices; Slice Lengths = 8 8 8 8 8 8 8 8
+ * Directory Name = .\ ; File Name = 8x256_tables.c
+ */
+
+uint32_t sctp_crc_tableil8_o72[256] =
+{
+	0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, 0xB2F53777, 0x5DC55C6E, 0x697997B4, 0x8649FCAD,
+	0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, 0xD2F32F68, 0x3DC34471, 0x097F8FAB, 0xE64FE4B2,
+	0xC00C303E, 0x2F3C5B27, 0x1B8090FD, 0xF4B0FBE4, 0x72F90749, 0x9DC96C50, 0xA975A78A, 0x4645CC93,
+	0xA00A2821, 0x4F3A4338, 0x7B8688E2, 0x94B6E3FB, 0x12FF1F56, 0xFDCF744F, 0xC973BF95, 0x2643D48C,
+	0x85F4168D, 0x6AC47D94, 0x5E78B64E, 0xB148DD57, 0x370121FA, 0xD8314AE3, 0xEC8D8139, 0x03BDEA20,
+	0xE5F20E92, 0x0AC2658B, 0x3E7EAE51, 0xD14EC548, 0x570739E5, 0xB83752FC, 0x8C8B9926, 0x63BBF23F,
+	0x45F826B3, 0xAAC84DAA, 0x9E748670, 0x7144ED69, 0xF70D11C4, 0x183D7ADD, 0x2C81B107, 0xC3B1DA1E,
+	0x25FE3EAC, 0xCACE55B5, 0xFE729E6F, 0x1142F576, 0x970B09DB, 0x783B62C2, 0x4C87A918, 0xA3B7C201,
+	0x0E045BEB, 0xE13430F2, 0xD588FB28, 0x3AB89031, 0xBCF16C9C, 0x53C10785, 0x677DCC5F, 0x884DA746,
+	0x6E0243F4, 0x813228ED, 0xB58EE337, 0x5ABE882E, 0xDCF77483, 0x33C71F9A, 0x077BD440, 0xE84BBF59,
+	0xCE086BD5, 0x213800CC, 0x1584CB16, 0xFAB4A00F, 0x7CFD5CA2, 0x93CD37BB, 0xA771FC61, 0x48419778,
+	0xAE0E73CA, 0x413E18D3, 0x7582D309, 0x9AB2B810, 0x1CFB44BD, 0xF3CB2FA4, 0xC777E47E, 0x28478F67,
+	0x8BF04D66, 0x64C0267F, 0x507CEDA5, 0xBF4C86BC, 0x39057A11, 0xD6351108, 0xE289DAD2, 0x0DB9B1CB,
+	0xEBF65579, 0x04C63E60, 0x307AF5BA, 0xDF4A9EA3, 0x5903620E, 0xB6330917, 0x828FC2CD, 0x6DBFA9D4,
+	0x4BFC7D58, 0xA4CC1641, 0x9070DD9B, 0x7F40B682, 0xF9094A2F, 0x16392136, 0x2285EAEC, 0xCDB581F5,
+	0x2BFA6547, 0xC4CA0E5E, 0xF076C584, 0x1F46AE9D, 0x990F5230, 0x763F3929, 0x4283F2F3, 0xADB399EA,
+	0x1C08B7D6, 0xF338DCCF, 0xC7841715, 0x28B47C0C, 0xAEFD80A1, 0x41CDEBB8, 0x75712062, 0x9A414B7B,
+	0x7C0EAFC9, 0x933EC4D0, 0xA7820F0A, 0x48B26413, 0xCEFB98BE, 0x21CBF3A7, 0x1577387D, 0xFA475364,
+	0xDC0487E8, 0x3334ECF1, 0x0788272B, 0xE8B84C32, 0x6EF1B09F, 0x81C1DB86, 0xB57D105C, 0x5A4D7B45,
+	0xBC029FF7, 0x5332F4EE, 0x678E3F34, 0x88BE542D, 0x0EF7A880, 0xE1C7C399, 0xD57B0843, 0x3A4B635A,
+	0x99FCA15B, 0x76CCCA42, 0x42700198, 0xAD406A81, 0x2B09962C, 0xC439FD35, 0xF08536EF, 0x1FB55DF6,
+	0xF9FAB944, 0x16CAD25D, 0x22761987, 0xCD46729E, 0x4B0F8E33, 0xA43FE52A, 0x90832EF0, 0x7FB345E9,
+	0x59F09165, 0xB6C0FA7C, 0x827C31A6, 0x6D4C5ABF, 0xEB05A612, 0x0435CD0B, 0x308906D1, 0xDFB96DC8,
+	0x39F6897A, 0xD6C6E263, 0xE27A29B9, 0x0D4A42A0, 0x8B03BE0D, 0x6433D514, 0x508F1ECE, 0xBFBF75D7,
+	0x120CEC3D, 0xFD3C8724, 0xC9804CFE, 0x26B027E7, 0xA0F9DB4A, 0x4FC9B053, 0x7B757B89, 0x94451090,
+	0x720AF422, 0x9D3A9F3B, 0xA98654E1, 0x46B63FF8, 0xC0FFC355, 0x2FCFA84C, 0x1B736396, 0xF443088F,
+	0xD200DC03, 0x3D30B71A, 0x098C7CC0, 0xE6BC17D9, 0x60F5EB74, 0x8FC5806D, 0xBB794BB7, 0x544920AE,
+	0xB206C41C, 0x5D36AF05, 0x698A64DF, 0x86BA0FC6, 0x00F3F36B, 0xEFC39872, 0xDB7F53A8, 0x344F38B1,
+	0x97F8FAB0, 0x78C891A9, 0x4C745A73, 0xA344316A, 0x250DCDC7, 0xCA3DA6DE, 0xFE816D04, 0x11B1061D,
+	0xF7FEE2AF, 0x18CE89B6, 0x2C72426C, 0xC3422975, 0x450BD5D8, 0xAA3BBEC1, 0x9E87751B, 0x71B71E02,
+	0x57F4CA8E, 0xB8C4A197, 0x8C786A4D, 0x63480154, 0xE501FDF9, 0x0A3196E0, 0x3E8D5D3A, 0xD1BD3623,
+	0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o72
+ */
+
+
+
+/*
+ * This CRC lookup table was generated automatically with these parameters:
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8
+ * Directory Name = ....................... ".\"
+ * File Name = ............................ 8x256_tables.c
+ */
+
+uint32_t sctp_crc_tableil8_o80[256] =
+{
+	0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, 0x1DE5B089,
+	0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, 0xEBCD3882, 0x83CE144A, 0x3BCB6112, 0x53C84DDA,
+	0x9C5BFAA6, 0xF458D66E, 0x4C5DA336, 0x245E8FFE, 0x39BB3F77, 0x51B813BF, 0xE9BD66E7, 0x81BE4A2F,
+	0xD27607F5, 0xBA752B3D, 0x02705E65, 0x6A7372AD, 0x7796C224, 0x1F95EEEC, 0xA7909BB4, 0xCF93B77C,
+	0x3D5B83BD, 0x5558AF75, 0xED5DDA2D, 0x855EF6E5, 0x98BB466C, 0xF0B86AA4, 0x48BD1FFC, 0x20BE3334,
+	0x73767EEE, 0x1B755226, 0xA370277E, 0xCB730BB6, 0xD696BB3F, 0xBE9597F7, 0x0690E2AF, 0x6E93CE67,
+	0xA100791B, 0xC90355D3, 0x7106208B, 0x19050C43, 0x04E0BCCA, 0x6CE39002, 0xD4E6E55A, 0xBCE5C992,
+	0xEF2D8448, 0x872EA880, 0x3F2BDDD8, 0x5728F110, 0x4ACD4199, 0x22CE6D51, 0x9ACB1809, 0xF2C834C1,
+	0x7AB7077A, 0x12B42BB2, 0xAAB15EEA, 0xC2B27222, 0xDF57C2AB, 0xB754EE63, 0x0F519B3B, 0x6752B7F3,
+	0x349AFA29, 0x5C99D6E1, 0xE49CA3B9, 0x8C9F8F71, 0x917A3FF8, 0xF9791330, 0x417C6668, 0x297F4AA0,
+	0xE6ECFDDC, 0x8EEFD114, 0x36EAA44C, 0x5EE98884, 0x430C380D, 0x2B0F14C5, 0x930A619D, 0xFB094D55,
+	0xA8C1008F, 0xC0C22C47, 0x78C7591F, 0x10C475D7, 0x0D21C55E, 0x6522E996, 0xDD279CCE, 0xB524B006,
+	0x47EC84C7, 0x2FEFA80F, 0x97EADD57, 0xFFE9F19F, 0xE20C4116, 0x8A0F6DDE, 0x320A1886, 0x5A09344E,
+	0x09C17994, 0x61C2555C, 0xD9C72004, 0xB1C40CCC, 0xAC21BC45, 0xC422908D, 0x7C27E5D5, 0x1424C91D,
+	0xDBB77E61, 0xB3B452A9, 0x0BB127F1, 0x63B20B39, 0x7E57BBB0, 0x16549778, 0xAE51E220, 0xC652CEE8,
+	0x959A8332, 0xFD99AFFA, 0x459CDAA2, 0x2D9FF66A, 0x307A46E3, 0x58796A2B, 0xE07C1F73, 0x887F33BB,
+	0xF56E0EF4, 0x9D6D223C, 0x25685764, 0x4D6B7BAC, 0x508ECB25, 0x388DE7ED, 0x808892B5, 0xE88BBE7D,
+	0xBB43F3A7, 0xD340DF6F, 0x6B45AA37, 0x034686FF, 0x1EA33676, 0x76A01ABE, 0xCEA56FE6, 0xA6A6432E,
+	0x6935F452, 0x0136D89A, 0xB933ADC2, 0xD130810A, 0xCCD53183, 0xA4D61D4B, 0x1CD36813, 0x74D044DB,
+	0x27180901, 0x4F1B25C9, 0xF71E5091, 0x9F1D7C59, 0x82F8CCD0, 0xEAFBE018, 0x52FE9540, 0x3AFDB988,
+	0xC8358D49, 0xA036A181, 0x1833D4D9, 0x7030F811, 0x6DD54898, 0x05D66450, 0xBDD31108, 0xD5D03DC0,
+	0x8618701A, 0xEE1B5CD2, 0x561E298A, 0x3E1D0542, 0x23F8B5CB, 0x4BFB9903, 0xF3FEEC5B, 0x9BFDC093,
+	0x546E77EF, 0x3C6D5B27, 0x84682E7F, 0xEC6B02B7, 0xF18EB23E, 0x998D9EF6, 0x2188EBAE, 0x498BC766,
+	0x1A438ABC, 0x7240A674, 0xCA45D32C, 0xA246FFE4, 0xBFA34F6D, 0xD7A063A5, 0x6FA516FD, 0x07A63A35,
+	0x8FD9098E, 0xE7DA2546, 0x5FDF501E, 0x37DC7CD6, 0x2A39CC5F, 0x423AE097, 0xFA3F95CF, 0x923CB907,
+	0xC1F4F4DD, 0xA9F7D815, 0x11F2AD4D, 0x79F18185, 0x6414310C, 0x0C171DC4, 0xB412689C, 0xDC114454,
+	0x1382F328, 0x7B81DFE0, 0xC384AAB8, 0xAB878670, 0xB66236F9, 0xDE611A31, 0x66646F69, 0x0E6743A1,
+	0x5DAF0E7B, 0x35AC22B3, 0x8DA957EB, 0xE5AA7B23, 0xF84FCBAA, 0x904CE762, 0x2849923A, 0x404ABEF2,
+	0xB2828A33, 0xDA81A6FB, 0x6284D3A3, 0x0A87FF6B, 0x17624FE2, 0x7F61632A, 0xC7641672, 0xAF673ABA,
+	0xFCAF7760, 0x94AC5BA8, 0x2CA92EF0, 0x44AA0238, 0x594FB2B1, 0x314C9E79, 0x8949EB21, 0xE14AC7E9,
+	0x2ED97095, 0x46DA5C5D, 0xFEDF2905, 0x96DC05CD, 0x8B39B544, 0xE33A998C, 0x5B3FECD4, 0x333CC01C,
+	0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o80
+ */
+
+
+
+/*
+ * This CRC lookup table was generated automatically with these parameters:
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices
+ * Slice Lengths = ........................ 8 8 8 8 8 8 8 8
+ * Directory Name = ....................... ".\"
+ * File Name = ............................ 8x256_tables.c
+ */
+
+uint32_t sctp_crc_tableil8_o88[256] =
+{
+	0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, 0x211D826D, 0x6821FF4A, 0xB3657823, 0xFA590504,
+	0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, 0x632686B7, 0x2A1AFB90, 0xF15E7CF9, 0xB86201DE,
+	0x847609B4, 0xCD4A7493, 0x160EF3FA, 0x5F328EDD, 0xA56B8BD9, 0xEC57F6FE, 0x37137197, 0x7E2F0CB0,
+	0xC64D0D6E, 0x8F717049, 0x5435F720, 0x1D098A07, 0xE7508F03, 0xAE6CF224, 0x7528754D, 0x3C14086A,
+	0x0D006599, 0x443C18BE, 0x9F789FD7, 0xD644E2F0, 0x2C1DE7F4, 0x65219AD3, 0xBE651DBA, 0xF759609D,
+	0x4F3B6143, 0x06071C64, 0xDD439B0D, 0x947FE62A, 0x6E26E32E, 0x271A9E09, 0xFC5E1960, 0xB5626447,
+	0x89766C2D, 0xC04A110A, 0x1B0E9663, 0x5232EB44, 0xA86BEE40, 0xE1579367, 0x3A13140E, 0x732F6929,
+	0xCB4D68F7, 0x827115D0, 0x593592B9, 0x1009EF9E, 0xEA50EA9A, 0xA36C97BD, 0x782810D4, 0x31146DF3,
+	0x1A00CB32, 0x533CB615, 0x8878317C, 0xC1444C5B, 0x3B1D495F, 0x72213478, 0xA965B311, 0xE059CE36,
+	0x583BCFE8, 0x1107B2CF, 0xCA4335A6, 0x837F4881, 0x79264D85, 0x301A30A2, 0xEB5EB7CB, 0xA262CAEC,
+	0x9E76C286, 0xD74ABFA1, 0x0C0E38C8, 0x453245EF, 0xBF6B40EB, 0xF6573DCC, 0x2D13BAA5, 0x642FC782,
+	0xDC4DC65C, 0x9571BB7B, 0x4E353C12, 0x07094135, 0xFD504431, 0xB46C3916, 0x6F28BE7F, 0x2614C358,
+	0x1700AEAB, 0x5E3CD38C, 0x857854E5, 0xCC4429C2, 0x361D2CC6, 0x7F2151E1, 0xA465D688, 0xED59ABAF,
+	0x553BAA71, 0x1C07D756, 0xC743503F, 0x8E7F2D18, 0x7426281C, 0x3D1A553B, 0xE65ED252, 0xAF62AF75,
+	0x9376A71F, 0xDA4ADA38, 0x010E5D51, 0x48322076, 0xB26B2572, 0xFB575855, 0x2013DF3C, 0x692FA21B,
+	0xD14DA3C5, 0x9871DEE2, 0x4335598B, 0x0A0924AC, 0xF05021A8, 0xB96C5C8F, 0x6228DBE6, 0x2B14A6C1,
+	0x34019664, 0x7D3DEB43, 0xA6796C2A, 0xEF45110D, 0x151C1409, 0x5C20692E, 0x8764EE47, 0xCE589360,
+	0x763A92BE, 0x3F06EF99, 0xE44268F0, 0xAD7E15D7, 0x572710D3, 0x1E1B6DF4, 0xC55FEA9D, 0x8C6397BA,
+	0xB0779FD0, 0xF94BE2F7, 0x220F659E, 0x6B3318B9, 0x916A1DBD, 0xD856609A, 0x0312E7F3, 0x4A2E9AD4,
+	0xF24C9B0A, 0xBB70E62D, 0x60346144, 0x29081C63, 0xD3511967, 0x9A6D6440, 0x4129E329, 0x08159E0E,
+	0x3901F3FD, 0x703D8EDA, 0xAB7909B3, 0xE2457494, 0x181C7190, 0x51200CB7, 0x8A648BDE, 0xC358F6F9,
+	0x7B3AF727, 0x32068A00, 0xE9420D69, 0xA07E704E, 0x5A27754A, 0x131B086D, 0xC85F8F04, 0x8163F223,
+	0xBD77FA49, 0xF44B876E, 0x2F0F0007, 0x66337D20, 0x9C6A7824, 0xD5560503, 0x0E12826A, 0x472EFF4D,
+	0xFF4CFE93, 0xB67083B4, 0x6D3404DD, 0x240879FA, 0xDE517CFE, 0x976D01D9, 0x4C2986B0, 0x0515FB97,
+	0x2E015D56, 0x673D2071, 0xBC79A718, 0xF545DA3F, 0x0F1CDF3B, 0x4620A21C, 0x9D642575, 0xD4585852,
+	0x6C3A598C, 0x250624AB, 0xFE42A3C2, 0xB77EDEE5, 0x4D27DBE1, 0x041BA6C6, 0xDF5F21AF, 0x96635C88,
+	0xAA7754E2, 0xE34B29C5, 0x380FAEAC, 0x7133D38B, 0x8B6AD68F, 0xC256ABA8, 0x19122CC1, 0x502E51E6,
+	0xE84C5038, 0xA1702D1F, 0x7A34AA76, 0x3308D751, 0xC951D255, 0x806DAF72, 0x5B29281B, 0x1215553C,
+	0x230138CF, 0x6A3D45E8, 0xB179C281, 0xF845BFA6, 0x021CBAA2, 0x4B20C785, 0x906440EC, 0xD9583DCB,
+	0x613A3C15, 0x28064132, 0xF342C65B, 0xBA7EBB7C, 0x4027BE78, 0x091BC35F, 0xD25F4436, 0x9B633911,
+	0xA777317B, 0xEE4B4C5C, 0x350FCB35, 0x7C33B612, 0x866AB316, 0xCF56CE31, 0x14124958, 0x5D2E347F,
+	0xE54C35A1, 0xAC704886, 0x7734CFEF, 0x3E08B2C8, 0xC451B7CC, 0x8D6DCAEB, 0x56294D82, 0x1F1530A5
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o88
+ */
+
+static uint32_t
+sctp_crc32c_sb8_64_bit(uint32_t crc,		/* CRC accumulated so far */
+    unsigned char const *p_buf,			/* input bytes */
+    uint32_t length,				/* number of bytes at p_buf */
+    uint32_t offset)				/* update_crc32() passes the buffer address & 3 */
+{
+	uint32_t li;
+	uint32_t term1, term2;
+	uint32_t running_length;
+	uint32_t end_bytes;
+	uint32_t init_bytes;
+	/* Byte-wise head until (offset + consumed) is a multiple of 4, then 8-byte groups, then a byte-wise tail. */
+	init_bytes = (4-offset) & 0x3;
+	/* A short input may end before that boundary is reached. */
+	if (init_bytes > length)
+		init_bytes = length;
+	/* Whole 8-byte groups first; end_bytes is whatever is left over. */
+	running_length = ((length - init_bytes) / 8) * 8;
+	end_bytes = length - init_bytes - running_length;
+	/* Head: one table lookup per input byte. */
+	for (li = 0; li < init_bytes; li++)
+		crc = sctp_crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^
+		    (crc >> 8);
+	for (li = 0; li < running_length / 8; li++) {
+#ifdef CEPH_BIG_ENDIAN
+		crc ^= *p_buf++;	/* assemble bytes 0..3 in little-endian order */
+		crc ^= (uint32_t)(*p_buf++) << 8;
+		crc ^= (uint32_t)(*p_buf++) << 16;
+		crc ^= (uint32_t)(*p_buf++) << 24;	/* cast first: shifting a promoted int by 24 overflows for bytes >= 0x80 */
+#else
+		crc ^= *(uint32_t *) p_buf;	/* bytes 0..3 read as one 32-bit word */
+		p_buf += 4;
+#endif
+		term1 = sctp_crc_tableil8_o88[crc & 0x000000FF] ^
+		    sctp_crc_tableil8_o80[(crc >> 8) & 0x000000FF];
+		term2 = crc >> 16;
+		crc = term1 ^
+		    sctp_crc_tableil8_o72[term2 & 0x000000FF] ^
+		    sctp_crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
+		/* Bytes 4..7 of the group: one table per byte position (o56, o48, o40, o32). */
+#ifdef CEPH_BIG_ENDIAN
+		crc ^= sctp_crc_tableil8_o56[*p_buf++];
+		crc ^= sctp_crc_tableil8_o48[*p_buf++];
+		crc ^= sctp_crc_tableil8_o40[*p_buf++];
+		crc ^= sctp_crc_tableil8_o32[*p_buf++];
+#else
+		term1 = sctp_crc_tableil8_o56[(*(uint32_t *) p_buf) & 0x000000FF] ^
+		    sctp_crc_tableil8_o48[((*(uint32_t *) p_buf) >> 8) & 0x000000FF];
+
+		term2 = (*(uint32_t *) p_buf) >> 16;
+		crc = crc ^
+		    term1 ^
+		    sctp_crc_tableil8_o40[term2 & 0x000000FF] ^
+		    sctp_crc_tableil8_o32[(term2 >> 8) & 0x000000FF];
+		p_buf += 4;
+#endif
+	}
+	for (li = 0; li < end_bytes; li++)	/* tail: same byte-wise step as the head */
+		crc = sctp_crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^
+		    (crc >> 8);
+	return crc;
+}
+
+static uint32_t
+sctp_crc32c_sb8_64_bit_zero(uint32_t crc,	/* CRC accumulated so far */
+    uint32_t length,				/* number of zero bytes to fold in */
+    uint32_t offset)				/* only decides how many bytes the byte-wise head takes */
+{
+	uint32_t li;
+	uint32_t term1, term2;
+	uint32_t running_length;
+	uint32_t end_bytes;
+	uint32_t init_bytes;
+	/* Same head / 8-byte groups / tail structure as sctp_crc32c_sb8_64_bit(), with no data buffer read. */
+	init_bytes = (4-offset) & 0x3;
+	/* Never consume more than length bytes in the head. */
+	if (init_bytes > length)
+		init_bytes = length;
+	/* Whole 8-byte groups first; end_bytes is whatever is left over. */
+	running_length = ((length - init_bytes) / 8) * 8;
+	end_bytes = length - init_bytes - running_length;
+	/* Head: the table index is the low CRC byte alone, no data byte is XORed in. */
+	for (li = 0; li < init_bytes; li++)
+		crc = sctp_crc_tableil8_o32[crc & 0x000000FF] ^
+		    (crc >> 8);
+	for (li = 0; li < running_length / 8; li++) {
+		term1 = sctp_crc_tableil8_o88[crc & 0x000000FF] ^
+		    sctp_crc_tableil8_o80[(crc >> 8) & 0x000000FF];
+		term2 = crc >> 16;
+		crc = term1 ^
+		    sctp_crc_tableil8_o72[term2 & 0x000000FF] ^
+		    sctp_crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
+		/* Second half of the group: every lookup uses index 0 in place of a data byte. */
+#ifdef CEPH_BIG_ENDIAN
+		crc ^= sctp_crc_tableil8_o56[0];
+		crc ^= sctp_crc_tableil8_o48[0];
+		crc ^= sctp_crc_tableil8_o40[0];
+		crc ^= sctp_crc_tableil8_o32[0];
+#else
+		term1 = sctp_crc_tableil8_o56[0] ^
+			sctp_crc_tableil8_o48[0];
+
+		term2 = 0;
+		crc = crc ^
+		    term1 ^
+		    sctp_crc_tableil8_o40[term2 & 0x000000FF] ^
+		    sctp_crc_tableil8_o32[(term2 >> 8) & 0x000000FF];
+#endif
+	}
+	for (li = 0; li < end_bytes; li++)	/* tail: same byte-wise step as the head */
+		crc = sctp_crc_tableil8_o32[crc & 0x000000FF] ^
+		    (crc >> 8);
+	return crc;
+}
+
+
+/**
+ *
+ * Routine Description:
+ *
+ * Advances a running CRC32C over `length` bytes using the slicing tables.
+ *
+ * Arguments:
+ *
+ *		crc32c - CRC value accumulated so far
+ *		buffer - input bytes; when null the zero-data variant is used
+ *		length - number of bytes to fold in
+ *
+ * Return value:
+ *
+ *		the updated CRC (crc32c itself when length is 0)
+ */
+static uint32_t
+update_crc32(uint32_t crc32c,
+    unsigned char const *buffer,
+    unsigned int length)
+{
+	/* Low two address bits: how far buffer sits past a 4-byte boundary. */
+	const uint32_t misalign = ((uintptr_t) buffer) & 0x3;
+
+	if (length == 0)
+		return crc32c;
+	if (!buffer)
+		return sctp_crc32c_sb8_64_bit_zero(crc32c, length, misalign);
+	return sctp_crc32c_sb8_64_bit(crc32c, buffer, length, misalign);
+}
+
+uint32_t sctp_crc_c[256] = {
+	0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+	0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+	0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+	0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+	0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+	0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+	0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+	0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+	0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+	0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+	0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+	0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+	0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+	0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+	0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+	0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+	0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+	0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+	0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+	0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+	0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+	0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+	0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+	0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+	0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+	0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+	0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+	0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+	0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+	0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+	0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+	0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+	0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+	0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+	0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+	0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+	0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+	0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+	0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+	0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+	0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+	0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+	0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+	0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+	0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+	0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+	0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+	0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+	0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+	0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+	0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+	0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+	0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+	0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+	0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+	0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+	0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+	0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+	0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+	0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+	0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+	0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+	0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+	0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351,
+};
+
+
+#define SCTP_CRC32C(c,d) (c=(c>>8)^sctp_crc_c[(c^(d))&0xFF])
+/* SCTP_CRC32C folds one byte d into c via sctp_crc_c; everything from #if 0 to its #endif is compiled out. */
+#if 0
+static uint32_t
+old_update_crc32(uint32_t crc32c,
+    unsigned char const *buffer,
+    unsigned int length)
+{
+	unsigned int i;
+	/* Byte-at-a-time update: one SCTP_CRC32C step per input byte. */
+	for (i = 0; i < length; i++) {
+		SCTP_CRC32C(crc32c, buffer[i]);
+	}
+	return (crc32c);
+}
+
+/* Complements the CRC and, on big-endian builds, reverses its byte order. */
+static uint32_t
+sctp_csum_finalize(uint32_t crc32c)
+{
+	uint32_t result;
+
+#if BYTE_ORDER == BIG_ENDIAN
+	uint8_t byte0, byte1, byte2, byte3;
+
+#endif
+	/* Complement the result */
+	result = ~crc32c;
+#if BYTE_ORDER == BIG_ENDIAN
+	/*
+	 * For BIG-ENDIAN.. aka Motorola byte order the result is in
+	 * little-endian form. So we must manually swap the bytes. Then we
+	 * can call htonl() which does nothing...
+	 */
+	byte0 = result & 0x000000ff;
+	byte1 = (result >> 8) & 0x000000ff;
+	byte2 = (result >> 16) & 0x000000ff;
+	byte3 = (result >> 24) & 0x000000ff;
+	crc32c = ((byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3);
+#else
+	/*
+	 * For INTEL platforms the result comes out in network order. No
+	 * htonl is required or the swap above. So we optimize out both the
+	 * htonl and the manual swap above.
+	 */
+	crc32c = result;
+#endif
+	return (crc32c);
+}
+#endif
+
+uint32_t ceph_crc32c_sctp(uint32_t crc, unsigned char const *data, unsigned length)
+{
+	return update_crc32(crc, data, length);	/* exported wrapper: forwards all three arguments unchanged */
+}
+
+
+#endif
diff --git a/src/common/sctp_crc32.h b/src/common/sctp_crc32.h
new file mode 100644
index 000000000..92d20bcb7
--- /dev/null
+++ b/src/common/sctp_crc32.h
@@ -0,0 +1,14 @@
+#ifndef CEPH_COMMON_SCTP_CRC32_H
+#define CEPH_COMMON_SCTP_CRC32_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern uint32_t ceph_crc32c_sctp(uint32_t crc, unsigned char const *data, unsigned length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/secret.c b/src/common/secret.c
new file mode 100644
index 000000000..fe34efdff
--- /dev/null
+++ b/src/common/secret.c
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <keyutils.h>
+
+#include "include/compat.h"
+#include "common/armor.h"
+#include "common/safe_io.h"
+
+/* Reads the first line of filename into secret, NUL-terminated within max_len
+ * bytes; returns 0 on success or a negative errno value on failure. */
+int read_secret_from_file(const char *filename, char *secret, size_t max_len)
+{
+  int fd, len, err;
+  if (max_len == 0)
+    return -EINVAL;  /* no room even for the terminating NUL */
+  fd = open(filename, O_RDONLY);
+  if (fd < 0) {
+    err = errno;  /* perror() may clobber errno, so capture it first */
+    perror("unable to read secretfile");
+    return -err;
+  }
+  /* Keep one byte spare so a file of >= max_len bytes cannot push the NUL out of bounds. */
+  len = safe_read(fd, secret, max_len - 1);
+  if (len <= 0) {
+    perror("unable to read secret from file");
+    close(fd);
+    return len ? len : -ENODATA;
+  }
+  close(fd);
+  secret[len] = '\0';  /* len <= max_len - 1, so this index is inside the buffer */
+  secret[strcspn(secret, "\r\n")] = '\0';  /* cut at the first line break, if any */
+  return 0;
+}
+
+int set_kernel_secret(const char *secret, const char *key_name)
+{
+  /* Decode the armored secret and submit it to the kernel via the keys API (add_key). */
+  key_serial_t serial;
+  int ret;
+  int secret_len = strlen(secret);
+  char payload[((secret_len * 3) / 4) + 4];  /* variable-length buffer sized from the encoded length */
+  /* An empty secret is rejected before any decoding is attempted. */
+  if (!secret_len) {
+    fprintf(stderr, "secret is empty.\n");
+    return -EINVAL;
+  }
+  /* A non-negative ret is later handed to add_key() as the payload length. */
+  ret = ceph_unarmor(payload, payload+sizeof(payload), secret, secret+secret_len);
+  if (ret < 0) {
+    char error_buf[80];
+    fprintf(stderr, "secret is not valid base64: %s.\n",
+	    ceph_strerror_r(-ret, error_buf, sizeof(error_buf)));
+    return ret;
+  }
+  /* The key is added with type "ceph" to the process keyring. */
+  serial = add_key("ceph", key_name, payload, ret, KEY_SPEC_PROCESS_KEYRING);
+  if (serial == -1) {
+    ret = -errno;
+  }
+  /* NOTE(review): on success this is the non-negative ceph_unarmor() result, not 0 -- confirm callers only test < 0. */
+  return ret;
+}
+
+int is_kernel_secret(const char *key_name)
+{
+  /* Non-zero iff request_key() for a "ceph" key named key_name did not return -1. */
+  return request_key("ceph", key_name, NULL,
+		     KEY_SPEC_USER_KEYRING) != -1;
+}
diff --git a/src/common/secret.h b/src/common/secret.h
new file mode 100644
index 000000000..5d2ad179d
--- /dev/null
+++ b/src/common/secret.h
@@ -0,0 +1,18 @@
+#ifndef CEPH_SECRET_H
+#define CEPH_SECRET_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int read_secret_from_file(const char *filename, char *secret, size_t max_len);
+
+int set_kernel_secret(const char *secret, const char *key_name);
+
+int is_kernel_secret(const char *key_name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/shared_cache.hpp b/src/common/shared_cache.hpp
new file mode 100644
index 000000000..2735692dc
--- /dev/null
+++ b/src/common/shared_cache.hpp
@@ -0,0 +1,384 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_SHAREDCACHE_H
+#define CEPH_SHAREDCACHE_H
+
+#include <map>
+#include <list>
+#ifdef WITH_SEASTAR
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#else
+#include <memory>
+#endif
+#include "common/ceph_mutex.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "include/unordered_map.h"
+
+template <class K, class V>
+class SharedLRU {  // LRU of strong refs (lru/contents) layered over a map of weak refs (weak_refs)
+  CephContext *cct;  // used by lderr() and for the debug_asserts_on_shutdown lookup
+#ifdef WITH_SEASTAR
+  using VPtr = boost::local_shared_ptr<V>;
+  using WeakVPtr = boost::weak_ptr<V>;
+#else
+  using VPtr = std::shared_ptr<V>;
+  using WeakVPtr = std::weak_ptr<V>;
+#endif
+  ceph::mutex lock;  // taken by most public operations before they touch the containers below
+  size_t max_size;  // trim_cache() evicts from the back of lru while size > max_size
+  ceph::condition_variable cond;  // notified by remove(); the lookup paths wait on it
+  unsigned size;  // number of entries currently in lru
+public:
+  int waiting;  // incremented while lookup()/lower_bound() are inside cond.wait()
+private:
+  using C = std::less<K>;
+  using H = std::hash<K>;
+  ceph::unordered_map<K, typename std::list<std::pair<K, VPtr> >::iterator, H> contents;  // key -> position in lru
+  std::list<std::pair<K, VPtr> > lru;  // front = most recently inserted or bumped
+  // key -> (weak ref, raw pointer); entries are erased by remove() and purge()
+  std::map<K, std::pair<WeakVPtr, V*>, C> weak_refs;
+  // Evicts from the back of lru until size <= max_size; evicted refs are parked in *to_release.
+  void trim_cache(std::list<VPtr> *to_release) {
+    while (size > max_size) {
+      to_release->push_back(lru.back().second);
+      lru_remove(lru.back().first);
+    }
+  }
+  // Removes key from lru and contents; does nothing if key is not in contents.
+  void lru_remove(const K& key) {
+    auto i = contents.find(key);
+    if (i == contents.end())
+      return;
+    lru.erase(i->second);
+    --size;
+    contents.erase(i);
+  }
+  // Moves an already-cached key to the front, or inserts (key, val) at the front and trims.
+  void lru_add(const K& key, const VPtr& val, std::list<VPtr> *to_release) {
+    auto i = contents.find(key);
+    if (i != contents.end()) {
+      lru.splice(lru.begin(), lru, i->second);
+    } else {
+      ++size;
+      lru.push_front(make_pair(key, val));
+      contents[key] = lru.begin();
+      trim_cache(to_release);
+    }
+  }
+  // Called from Cleanup: erases key's weak ref if it still refers to valptr, then wakes all waiters.
+  void remove(const K& key, V *valptr) {
+    std::lock_guard l{lock};
+    auto i = weak_refs.find(key);
+    if (i != weak_refs.end() && i->second.second == valptr) {
+      weak_refs.erase(i);
+    }
+    cond.notify_all();
+  }
+  // Deleter installed on every VPtr this cache creates: unregisters the value, then deletes it.
+  class Cleanup {
+  public:
+    SharedLRU<K, V> *cache;
+    K key;
+    Cleanup(SharedLRU<K, V> *cache, K key) : cache(cache), key(key) {}
+    void operator()(V *ptr) {
+      cache->remove(key, ptr);
+      delete ptr;
+    }
+  };
+
+public:
+  SharedLRU(CephContext *cct = NULL, size_t max_size = 20)
+    : cct(cct),
+      lock{ceph::make_mutex("SharedLRU::lock")},
+      max_size(max_size),
+      size(0), waiting(0) {
+    contents.rehash(max_size);  // pre-size the hash index from the initial capacity
+  }
+  // Drops the strong refs; any weak_refs entry still present is logged as a leak.
+  ~SharedLRU() {
+    contents.clear();
+    lru.clear();
+    if (!weak_refs.empty()) {
+      lderr(cct) << "leaked refs:\n";
+      dump_weak_refs(*_dout);
+      *_dout << dendl;
+      if (cct->_conf.get_val<bool>("debug_asserts_on_shutdown")) {  // NOTE(review): cct is dereferenced although the constructor allows NULL
+	ceph_assert(weak_refs.empty());
+      }
+    }
+  }
+  // Number of entries in the LRU list, read under lock.
+  int get_count() {
+    std::lock_guard locker{lock};
+    return size;
+  }
+
+  void set_cct(CephContext *c) {
+    cct = c;
+  }
+  // Logs every weak_refs entry through lderr(cct).
+  void dump_weak_refs() {
+    lderr(cct) << "leaked refs:\n";
+    dump_weak_refs(*_dout);
+    *_dout << dendl;
+  }
+  // Writes one line per weak_refs entry to out; does not take lock itself.
+  void dump_weak_refs(std::ostream& out) {
+    for (const auto& [key, ref] : weak_refs) {
+      out << __func__ << " " << this << " weak_refs: "
+	  << key << " = " << ref.second
+	  << " with " << ref.first.use_count() << " refs"
+	  << std::endl;
+    }
+  }
+
+  // Clears all strong references from the lru, removing one entry per lock acquisition.
+  void clear() {
+    while (true) {
+      VPtr val; // release any ref we have after we drop the lock
+      std::lock_guard locker{lock};
+      if (size == 0)
+        break;
+
+      val = lru.back().second;
+      lru_remove(lru.back().first);
+    }
+  }
+  // Drops key's LRU entry; a temporary strong ref keeps the value alive until the lock is released.
+  void clear(const K& key) {
+    VPtr val; // release any ref we have after we drop the lock
+    {
+      std::lock_guard l{lock};
+      auto i = weak_refs.find(key);
+      if (i != weak_refs.end()) {
+	val = i->second.first.lock();
+      }
+      lru_remove(key);
+    }
+  }
+
+  /* Drops the LRU entry of every weak_refs key in [from, to] (to is inclusive); weak_refs itself is not modified here */
+  void clear_range(
+    const K& from,
+    const K& to) {
+    std::list<VPtr> vals; // release any refs we have after we drop the lock
+    {
+      std::lock_guard l{lock};
+      auto from_iter = weak_refs.lower_bound(from);
+      auto to_iter = weak_refs.upper_bound(to);
+      for (auto i = from_iter; i != to_iter; ) {
+	vals.push_back(i->second.first.lock());
+	lru_remove((i++)->first);
+      }
+    }
+  }
+
+  // Like clear(key), but also erases key's weak_refs entry.
+  void purge(const K &key) {
+    VPtr val; // release any ref we have after we drop the lock
+    {
+      std::lock_guard l{lock};
+      auto i = weak_refs.find(key);
+      if (i != weak_refs.end()) {
+	val = i->second.first.lock();
+        weak_refs.erase(i);
+      }
+      lru_remove(key);
+    }
+  }
+  // Changes the capacity and evicts immediately if the LRU is now over it.
+  void set_size(size_t new_size) {
+    std::list<VPtr> to_release;
+    {
+      std::lock_guard l{lock};
+      max_size = new_size;
+      trim_cache(&to_release);
+    }
+  }
+  // NOTE(review): weak_refs.begin() is dereferenced unconditionally -- presumably callers ensure it is non-empty.
+  // Returns K key s.t. key <= k for all currently cached k,v
+  K cached_key_lower_bound() {
+    std::lock_guard l{lock};
+    return weak_refs.begin()->first;
+  }
+  // Value at the first key >= key (or at the last key if none); null if weak_refs is empty; waits while that entry has expired.
+  VPtr lower_bound(const K& key) {
+    VPtr val;
+    std::list<VPtr> to_release;
+    {
+      std::unique_lock l{lock};
+      ++waiting;
+      cond.wait(l, [this, &key, &val, &to_release] {
+        if (weak_refs.empty()) {
+          return true;
+        }
+        auto i = weak_refs.lower_bound(key);
+        if (i == weak_refs.end()) {
+          --i;
+        }
+        if (val = i->second.first.lock(); val) {
+          lru_add(i->first, val, &to_release);
+          return true;
+        } else {
+          return false;
+        }
+      });
+      --waiting;
+    }
+    return val;
+  }
+  bool get_next(const K &key, std::pair<K, VPtr> *next) {  // first entry after key whose weak ref can still be locked
+    std::pair<K, VPtr> r;
+    {
+      std::lock_guard l{lock};
+      VPtr next_val;
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i = weak_refs.upper_bound(key);
+
+      while (i != weak_refs.end() &&
+	     !(next_val = i->second.first.lock()))
+	++i;
+
+      if (i == weak_refs.end())
+	return false;
+
+      if (next)
+	r = make_pair(i->first, next_val);
+    }
+    if (next)
+      *next = r;
+    return true;
+  }
+  bool get_next(const K &key, std::pair<K, V> *next) {  // same search, but copies the value itself into *next
+    std::pair<K, VPtr> r;
+    bool found = get_next(key, &r);
+    if (!found || !next)
+      return found;
+    next->first = r.first;
+    ceph_assert(r.second);
+    next->second = *(r.second);
+    return found;
+  }
+  // Returns the live value for key (bumping it in the LRU), or a null VPtr if key has no weak_refs entry.
+  VPtr lookup(const K& key) {
+    VPtr val;
+    std::list<VPtr> to_release;
+    {
+      std::unique_lock l{lock};
+      ++waiting;
+      cond.wait(l, [this, &key, &val, &to_release] {
+        if (auto i = weak_refs.find(key); i != weak_refs.end()) {
+          if (val = i->second.first.lock(); val) {
+            lru_add(key, val, &to_release);
+            return true;
+          } else {
+            return false;
+          }
+        } else {
+          return true;
+        }
+      });
+      --waiting;
+    }
+    return val;
+  }
+  VPtr lookup_or_create(const K &key) {  // like lookup(), but value-initializes and registers a new V when key is unknown
+    VPtr val;
+    std::list<VPtr> to_release;
+    {
+      std::unique_lock l{lock};
+      cond.wait(l, [this, &key, &val] {
+        if (auto i = weak_refs.find(key); i != weak_refs.end()) {
+          if (val = i->second.first.lock(); val) {
+            return true;
+          } else {
+            return false;
+          }
+        } else {
+          return true;
+        }
+      });
+      if (!val) {
+        val = VPtr{new V{}, Cleanup{this, key}};
+        weak_refs.insert(make_pair(key, make_pair(val, val.get())));
+      }
+      lru_add(key, val, &to_release);
+    }
+    return val;
+  }
+
+  /**
+   * empty()
+   *
+   * Returns true iff there are no live references left to anything that has been
+   * in the cache.
+   */
+  bool empty() {
+    std::lock_guard l{lock};
+    return weak_refs.empty();
+  }
+
+  /***
+   * Inserts a key if not present, or bumps it to the front of the LRU if
+   * it is, and then gives you a reference to the value. If the key already
+   * existed, you are responsible for deleting the new value you tried to
+   * insert.
+   *
+   * @param key The key to insert
+   * @param value The value that goes with the key
+   * @param existed Set to true if the value was already in the
+   * map, false otherwise
+   * @return A reference to the map's value for the given key
+   */
+  VPtr add(const K& key, V *value, bool *existed = NULL) {
+    VPtr val;
+    std::list<VPtr> to_release;
+    {
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator actual;
+      std::unique_lock l{lock};
+      cond.wait(l, [this, &key, &actual, &val] {  // wait while key's entry exists but its weak ref can no longer be locked
+	  actual = weak_refs.lower_bound(key);
+	  if (actual != weak_refs.end() && actual->first == key) {
+	    val = actual->second.first.lock();
+	    if (val) {
+	      return true;
+	    } else {
+	      return false;
+	    }
+	  } else {
+	    return true;
+	  }
+      });
+
+      if (val) {
+	if (existed) {
+	  *existed = true;
+	}
+      } else {
+	if (existed) {
+	  *existed = false;
+	}
+	val = VPtr(value, Cleanup(this, key));
+	weak_refs.insert(actual, make_pair(key, make_pair(val, value)));  // actual (from lower_bound) serves as the insertion hint
+      }
+      lru_add(key, val, &to_release);
+    }
+    return val;
+  }
+
+  friend class SharedLRUTest;
+};
+
+#endif
diff --git a/src/common/shared_mutex_debug.cc b/src/common/shared_mutex_debug.cc
new file mode 100644
index 000000000..258cf4039
--- /dev/null
+++ b/src/common/shared_mutex_debug.cc
@@ -0,0 +1,166 @@
+#include "shared_mutex_debug.h"
+
+#include <system_error>
+
+#include "acconfig.h"
+#include "common/valgrind.h"
+
+namespace ceph {
+
+shared_mutex_debug::shared_mutex_debug(std::string group,
+                                       bool track_lock,        // stored in `track`
+                                       bool enable_lock_dep,   // forwarded to mutex_debugging_base
+                                       bool prioritize_write)  // ask for a writer-preferring rwlock where available
+  : mutex_debugging_base{std::move(group),
+                         enable_lock_dep,
+                         false /* backtrace */},
+    track(track_lock)
+{
+#ifdef HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP
+  if (prioritize_write) {
+    pthread_rwlockattr_t attr;
+    pthread_rwlockattr_init(&attr);
+    // PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP
+    //   Setting the lock kind to this avoids writer starvation as long as
+    //   any read locking is not done in a recursive fashion.
+    pthread_rwlockattr_setkind_np(&attr,
+                                  PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+    pthread_rwlock_init(&rwlock, &attr);
+    pthread_rwlockattr_destroy(&attr);
+  } else
+#endif
+  // The braces below let this block act as the `else` branch of the `if` above when that code is compiled in.
+  {
+    pthread_rwlock_init(&rwlock, NULL);
+  }
+  ANNOTATE_BENIGN_RACE_SIZED(&id, sizeof(id), "shared_mutex_debug lockdep id");
+  ANNOTATE_BENIGN_RACE_SIZED(&nlock, sizeof(nlock), "shared_mutex_debug nwlock");
+  ANNOTATE_BENIGN_RACE_SIZED(&nrlock, sizeof(nrlock), "shared_mutex_debug nrlock");
+}
+
+// exclusive: blocking write-lock; throws std::system_error if pthread_rwlock_wrlock() fails
+void shared_mutex_debug::lock()
+{
+  if (_enable_lockdep()) {
+    _will_lock();  // lockdep hook before the acquire
+  }
+  if (int r = pthread_rwlock_wrlock(&rwlock); r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+  if (_enable_lockdep()) {
+    _locked();  // lockdep hook once the lock is held
+  }
+  _post_lock();  // owner/count bookkeeping (only when track is set)
+}
+
+bool shared_mutex_debug::try_lock()
+{
+  int r = pthread_rwlock_trywrlock(&rwlock);  // write-lock attempt; no lockdep hook is called before it
+  switch (r) {
+  case 0:  // acquired: run the same post-acquire hooks as lock()
+    if (_enable_lockdep()) {
+      _locked();
+    }
+    _post_lock();
+    return true;
+  case EBUSY:  // not acquired; reported as false rather than as an error
+    return false;
+  default:  // any other code is surfaced as std::system_error
+    throw std::system_error(r, std::generic_category());
+  }
+}
+
+void shared_mutex_debug::unlock()
+{
+  _pre_unlock();  // owner/count bookkeeping runs while the lock is still held
+  if (_enable_lockdep()) {
+    _will_unlock();  // lockdep hook before the release
+  }
+  if (int r = pthread_rwlock_unlock(&rwlock); r != 0) {
+    throw std::system_error(r, std::generic_category());  // release failure is surfaced to the caller
+  }
+}
+
+// shared locking: blocking read-lock; throws std::system_error if pthread_rwlock_rdlock() fails
+void shared_mutex_debug::lock_shared()
+{
+  if (_enable_lockdep()) {
+    _will_lock();  // lockdep hook before the acquire
+  }
+  if (int r = pthread_rwlock_rdlock(&rwlock); r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+  if (_enable_lockdep()) {
+    _locked();  // lockdep hook once the lock is held
+  }
+  _post_lock_shared();  // reader-count bookkeeping (only when track is set)
+}
+
+bool shared_mutex_debug::try_lock_shared()
+{
+  // Non-blocking shared acquire: returns true if the read lock was taken and
+  // false if it could not be taken right now. As in try_lock(), no lockdep
+  // hook runs before the attempt; only the post-acquire hooks run on success.
+  switch (int r = pthread_rwlock_tryrdlock(&rwlock); r) {
+  case 0:
+    if (_enable_lockdep()) {
+      _locked();
+    }
+    _post_lock_shared();
+    return true;
+  case EBUSY:  // lock unavailable without blocking
+    return false;
+  default:  // any other code is surfaced as std::system_error
+    throw std::system_error(r, std::generic_category());
+  }
+}
+
+void shared_mutex_debug::unlock_shared()
+{
+  _pre_unlock_shared();  // reader-count bookkeeping runs while the lock is still held
+  if (_enable_lockdep()) {
+    _will_unlock();  // lockdep hook before the release
+  }
+  if (int r = pthread_rwlock_unlock(&rwlock); r != 0) {
+    throw std::system_error(r, std::generic_category());  // release failure is surfaced to the caller
+  }
+}
+
+// exclusive locking: tracking checks performed just before the write lock is released
+void shared_mutex_debug::_pre_unlock()
+{
+  if (!track)
+    return;
+  ceph_assert(nlock > 0);
+  --nlock;
+  ceph_assert(locked_by == std::this_thread::get_id());  // only the owning thread may unlock
+  ceph_assert(nlock == 0);  // the write lock is never held more than once
+  locked_by = std::thread::id();  // back to "no owner"
+}
+
+void shared_mutex_debug::_post_lock()
+{
+  if (!track)  // owner bookkeeping is only kept when tracking is enabled
+    return;
+  ceph_assert(nlock == 0);  // nobody may already be recorded as holding the write lock
+  locked_by = std::this_thread::get_id();
+  ++nlock;
+}
+
+// shared locking: drop one reader from the tracked count before the unlock
+void shared_mutex_debug::_pre_unlock_shared()
+{
+  if (!track)
+    return;
+  ceph_assert(nrlock > 0);
+  --nrlock;
+}
+
+void shared_mutex_debug::_post_lock_shared()
+{
+  if (!track)  // readers are only counted when tracking is enabled
+    return;
+  ++nrlock;
+}
+
+} // namespace ceph
diff --git a/src/common/shared_mutex_debug.h b/src/common/shared_mutex_debug.h
new file mode 100644
index 000000000..0d8d46587
--- /dev/null
+++ b/src/common/shared_mutex_debug.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <pthread.h>
+#include <atomic>
+
+#include "common/mutex_debug.h"
+
+namespace ceph {
+
+/// Debugging wrapper around a POSIX read/write lock (pthread_rwlock_t).
+///
+/// Offers exclusive locking (lock/try_lock/unlock) and shared locking
+/// (lock_shared/try_lock_shared/unlock_shared), plus queries reporting
+/// whether the lock is currently recorded as held.
+class shared_mutex_debug :
+    public ceph::mutex_debug_detail::mutex_debugging_base
+{
+  pthread_rwlock_t rwlock;
+  // When false, the _pre_*/_post_* hooks below leave the counters untouched.
+  const bool track;
+  // Number of shared (read) holders recorded by the tracking hooks.
+  std::atomic<unsigned> nrlock{0};
+
+public:
+  // NOTE(review): the constructor is defined outside this header; the
+  // parameters presumably mean what their names suggest (lockdep group
+  // name, enable holder tracking, enable lockdep, prefer writers) --
+  // confirm against the definition.
+  shared_mutex_debug(std::string group,
+		     bool track_lock=true,
+		     bool enable_lock_dep=true,
+		     bool prioritize_write=false);
+  // exclusive locking
+  void lock();
+  bool try_lock();
+  void unlock();
+  // True while the exclusive-lock count (nlock) is positive. nlock is not
+  // declared in this class; presumably it comes from the base class.
+  bool is_wlocked() const {
+    return nlock > 0;
+  }
+  // shared locking
+  void lock_shared();
+  bool try_lock_shared();
+  void unlock_shared();
+  // True while at least one shared holder is recorded in nrlock.
+  bool is_rlocked() const {
+    return nrlock > 0;
+  }
+  // either of them
+  bool is_locked() const {
+    return nlock > 0 || nrlock > 0;
+  }
+private:
+  // exclusive locking
+  void _pre_unlock();
+  void _post_lock();
+  // shared locking
+  void _pre_unlock_shared();
+  void _post_lock_shared();
+};
+
+} // namespace ceph
diff --git a/src/common/sharedptr_registry.hpp b/src/common/sharedptr_registry.hpp
new file mode 100644
index 000000000..3b3cf01bb
--- /dev/null
+++ b/src/common/sharedptr_registry.hpp
@@ -0,0 +1,189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SHAREDPTR_REGISTRY_H
+#define CEPH_SHAREDPTR_REGISTRY_H
+
+#include <map>
+#include <memory>
+#include "common/ceph_mutex.h"
+
+/**
+ * Provides a registry of shared_ptr<V> indexed by K while
+ * the references are alive.
+ *
+ * Each entry stores a weak_ptr to the value together with the raw pointer
+ * it was created from. Values are handed out as shared_ptr carrying an
+ * OnRemoval deleter, which erases the matching entry and wakes waiters
+ * before deleting the value.
+ */
+template <class K, class V, class C = std::less<K> >
+class SharedPtrRegistry {
+public:
+  typedef std::shared_ptr<V> VPtr;
+  typedef std::weak_ptr<V> WeakVPtr;
+  // Number of callers currently inside lookup()/lookup_or_create().
+  int waiting;
+private:
+  ceph::mutex lock = ceph::make_mutex("SharedPtrRegistry::lock");
+  ceph::condition_variable cond;
+  std::map<K, std::pair<WeakVPtr, V*>, C> contents;
+
+  // Deleter attached to every shared_ptr created by lookup_or_create().
+  class OnRemoval {
+    SharedPtrRegistry<K,V,C> *parent;
+    K key;
+  public:
+    OnRemoval(SharedPtrRegistry<K,V,C> *parent, K key) :
+      parent(parent), key(key) {}
+    void operator()(V *to_remove) {
+      {
+	std::lock_guard l(parent->lock);
+	typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	  parent->contents.find(key);
+	// Only erase the entry if it still refers to this very object: it
+	// may have been dropped by remove() and re-created since.
+	if (i != parent->contents.end() &&
+	    i->second.second == to_remove) {
+	  parent->contents.erase(i);
+	  parent->cond.notify_all();
+	}
+      }
+      // The object itself is destroyed outside the registry lock.
+      delete to_remove;
+    }
+  };
+  friend class OnRemoval;
+
+public:
+  SharedPtrRegistry() :
+    waiting(0)
+  {}
+
+  /// @return true if the registry holds no entries (live or expired).
+  bool empty() {
+    std::lock_guard l(lock);
+    return contents.empty();
+  }
+
+  /**
+   * Find the first live entry whose key is strictly greater than key.
+   *
+   * @param key  lower (exclusive) bound of the search
+   * @param next if non-null, receives the key and a shared_ptr to the value
+   * @return true if such an entry exists, false otherwise
+   */
+  bool get_next(const K &key, std::pair<K, VPtr> *next) {
+    // Both holders are declared before the lock is taken so that they are
+    // destroyed only after it has been released: dropping the last
+    // reference runs OnRemoval, which acquires the same lock.
+    // NOTE(review): this assumes ceph::mutex is not recursive -- confirm.
+    std::pair<K, VPtr> r;
+    VPtr next_val;
+    {
+      std::lock_guard l(lock);
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	contents.upper_bound(key);
+      // Skip entries whose value has already expired.
+      while (i != contents.end() &&
+	     !(next_val = i->second.first.lock()))
+	++i;
+      if (i == contents.end())
+	return false;
+      if (next)
+	r = std::make_pair(i->first, next_val);
+    }
+    if (next)
+      *next = r;
+    return true;
+  }
+
+  /**
+   * Same search as the overload above, but copies the value itself.
+   *
+   * @param key  lower (exclusive) bound of the search
+   * @param next if non-null, receives the key and a copy of the value
+   * @return true if such an entry exists, false otherwise
+   */
+  bool get_next(const K &key, std::pair<K, V> *next) {
+    // Declared before the lock so it is released after the lock is dropped.
+    VPtr next_val;
+    std::lock_guard l(lock);
+    typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+      contents.upper_bound(key);
+    while (i != contents.end() &&
+	   !(next_val = i->second.first.lock()))
+      ++i;
+    if (i == contents.end())
+      return false;
+    if (next)
+      *next = std::make_pair(i->first, *next_val);
+    return true;
+  }
+
+  /**
+   * Look up the value registered under key.
+   *
+   * While an entry exists but its value has expired, waits on the
+   * condition variable and re-checks.
+   *
+   * @return the live value, or an empty pointer once no entry exists
+   */
+  VPtr lookup(const K &key) {
+    std::unique_lock l(lock);
+    waiting++;
+    while (1) {
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	contents.find(key);
+      if (i != contents.end()) {
+	VPtr retval = i->second.first.lock();
+	if (retval) {
+	  waiting--;
+	  return retval;
+	}
+      } else {
+	break;
+      }
+      cond.wait(l);
+    }
+    waiting--;
+    return VPtr();
+  }
+
+  /**
+   * Look up the value registered under key, default-constructing and
+   * registering a new one if no entry exists.
+   *
+   * Waits like lookup() while an expired entry is still present.
+   */
+  VPtr lookup_or_create(const K &key) {
+    std::unique_lock l(lock);
+    waiting++;
+    while (1) {
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	contents.find(key);
+      if (i != contents.end()) {
+	VPtr retval = i->second.first.lock();
+	if (retval) {
+	  waiting--;
+	  return retval;
+	}
+      } else {
+	break;
+      }
+      cond.wait(l);
+    }
+    V *ptr = new V();
+    VPtr retval(ptr, OnRemoval(this, key));
+    contents.insert(std::make_pair(key, make_pair(retval, ptr)));
+    waiting--;
+    return retval;
+  }
+
+  /// @return the number of entries (live or expired) in the registry.
+  unsigned size() {
+    std::lock_guard l(lock);
+    return contents.size();
+  }
+
+  /// Drop the entry for key (if any) and wake all waiters. Outstanding
+  /// shared_ptrs to the value are not affected.
+  void remove(const K &key) {
+    std::lock_guard l(lock);
+    contents.erase(key);
+    cond.notify_all();
+  }
+
+  /**
+   * Like lookup_or_create(key), but a newly created value is constructed
+   * from arg.
+   */
+  template<class A>
+  VPtr lookup_or_create(const K &key, const A &arg) {
+    std::unique_lock l(lock);
+    waiting++;
+    while (1) {
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	contents.find(key);
+      if (i != contents.end()) {
+	VPtr retval = i->second.first.lock();
+	if (retval) {
+	  waiting--;
+	  return retval;
+	}
+      } else {
+	break;
+      }
+      cond.wait(l);
+    }
+    V *ptr = new V(arg);
+    VPtr retval(ptr, OnRemoval(this, key));
+    contents.insert(std::make_pair(key, make_pair(retval, ptr)));
+    waiting--;
+    return retval;
+  }
+
+  friend class SharedPtrRegistryTest;
+};
+
+#endif
diff --git a/src/common/shunique_lock.h b/src/common/shunique_lock.h
new file mode 100644
index 000000000..5f809e83a
--- /dev/null
+++ b/src/common/shunique_lock.h
@@ -0,0 +1,393 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_SHUNIQUE_LOCK_H
+#define CEPH_COMMON_SHUNIQUE_LOCK_H
+
+#include <mutex>
+#include <shared_mutex>
+#include <system_error>
+
+namespace ceph {
+// This is a 'lock' class in the style of shared_lock and
+// unique_lock. Like shared_mutex it implements both Lockable and
+// SharedLockable.
+
+// My rationale is thus: one of the advantages of unique_lock is that
+// I can pass a thread of execution's control of a lock around as a
+// parameter. So that methods further down the call stack can unlock
+// it, do something, relock it, and have the lock state be known by
+// the caller afterward, explicitly. The shared_lock class offers a
+// similar advantage for shared locks, but each class is one or the
+// other. In Objecter we have calls that in most cases need /a/ lock
+// on the shared mutex, and whether it's shared or exclusive doesn't
+// matter. In some circumstances they may drop the shared lock and
+// reacquire an exclusive one. This could be handled by passing both a
+// shared and unique lock down the call stack. This is vexatious and
+// shameful.
+
+// Wanting to avoid heaping shame and vexation upon myself, I threw
+// this class together.
+
+// This class makes no attempt to support atomic upgrade or
+// downgrade. I don't want either. Matt has convinced me that if you
+// think you want them you've usually made a mistake somewhere. It is
+// exactly and only a reification of the state held on a shared mutex.
+
+/// Tag type selecting unique (exclusive) ownership of the mutex.
+struct acquire_unique_t { };
+
+/// Tag type selecting shared ownership of the mutex.
+struct acquire_shared_t { };
+
+// Ready-made tag values to pass to the shunique_lock constructors and to
+// its lock()/try_lock*() overloads.
+constexpr acquire_unique_t acquire_unique { };
+constexpr acquire_shared_t acquire_shared { };
+
+/// Movable holder for a shared mutex that owns it exclusively, in shared
+/// mode, or not at all -- and remembers which.
+///
+/// Every constructor that records an ownership also records a mutex, and
+/// release()/moves reset both together, so `m == nullptr` implies
+/// `o == ownership::none`.
+template<typename Mutex>
+class shunique_lock {
+public:
+  typedef Mutex mutex_type;
+  typedef std::unique_lock<Mutex> unique_lock_type;
+  typedef std::shared_lock<Mutex> shared_lock_type;
+
+  /// Construct with no associated mutex.
+  shunique_lock() noexcept : m(nullptr), o(ownership::none) { }
+
+  // We do not provide a default locking/try_locking constructor that
+  // takes only the mutex, since it is not clear whether to take it
+  // shared or unique. We explicitly require the use of std::defer_lock
+  // to prevent Nasty Surprises.
+
+  /// Associate with m without locking it.
+  shunique_lock(mutex_type& m, std::defer_lock_t) noexcept
+    : m(&m), o(ownership::none) { }
+
+  /// Associate with m and lock it exclusively (blocking).
+  shunique_lock(mutex_type& m, acquire_unique_t)
+    : m(&m), o(ownership::none) {
+    lock();
+  }
+
+  /// Associate with m and lock it in shared mode (blocking).
+  shunique_lock(mutex_type& m, acquire_shared_t)
+    : m(&m), o(ownership::none) {
+    lock_shared();
+  }
+
+  /// Associate with m and make one non-blocking attempt to lock it in the
+  /// mode selected by `at`; check ownership afterwards.
+  template<typename AcquireType>
+  shunique_lock(mutex_type& m, AcquireType at, std::try_to_lock_t)
+    : m(&m), o(ownership::none) {
+    try_lock(at);
+  }
+
+  /// Adopt an exclusive lock the caller already holds on m.
+  shunique_lock(mutex_type& m, acquire_unique_t, std::adopt_lock_t)
+    : m(&m), o(ownership::unique) {
+    // You'd better actually have a lock, or I will find you and I
+    // will hunt you down.
+  }
+
+  /// Adopt a shared lock the caller already holds on m.
+  shunique_lock(mutex_type& m, acquire_shared_t, std::adopt_lock_t)
+    : m(&m), o(ownership::shared) {
+  }
+
+  /// Associate with m and try to lock it (mode selected by `at`) until the
+  /// time point t.
+  template<typename AcquireType, typename Clock, typename Duration>
+  shunique_lock(mutex_type& m, AcquireType at,
+		const std::chrono::time_point<Clock, Duration>& t)
+    : m(&m), o(ownership::none) {
+    try_lock_until(at, t);
+  }
+
+  /// Associate with m and try to lock it (mode selected by `at`) for at
+  /// most dur.
+  template<typename AcquireType, typename Rep, typename Period>
+  shunique_lock(mutex_type& m, AcquireType at,
+		const std::chrono::duration<Rep, Period>& dur)
+    : m(&m), o(ownership::none) {
+    try_lock_for(at, dur);
+  }
+
+  /// Release whatever ownership is currently held.
+  ~shunique_lock() {
+    switch (o) {
+    case ownership::none:
+      return;
+    case ownership::unique:
+      m->unlock();
+      break;
+    case ownership::shared:
+      m->unlock_shared();
+      break;
+    }
+  }
+
+  shunique_lock(shunique_lock const&) = delete;
+  shunique_lock& operator=(shunique_lock const&) = delete;
+
+  /// Take over l's mutex and ownership; l is left empty.
+  shunique_lock(shunique_lock&& l) noexcept : shunique_lock() {
+    swap(l);
+  }
+
+  /// Take over the mutex (and exclusive ownership, if any) of a unique_lock.
+  shunique_lock(unique_lock_type&& l) noexcept {
+    if (l.owns_lock())
+      o = ownership::unique;
+    else
+      o = ownership::none;
+    m = l.release();
+  }
+
+  /// Take over the mutex (and shared ownership, if any) of a shared_lock.
+  shunique_lock(shared_lock_type&& l) noexcept {
+    if (l.owns_lock())
+      o = ownership::shared;
+    else
+      o = ownership::none;
+    m = l.release();
+  }
+
+  // The assignments below swap with a temporary, so whatever this object
+  // held before is released when that temporary is destroyed.
+
+  shunique_lock& operator=(shunique_lock&& l) noexcept {
+    shunique_lock(std::move(l)).swap(*this);
+    return *this;
+  }
+
+  shunique_lock& operator=(unique_lock_type&& l) noexcept {
+    shunique_lock(std::move(l)).swap(*this);
+    return *this;
+  }
+
+  shunique_lock& operator=(shared_lock_type&& l) noexcept {
+    shunique_lock(std::move(l)).swap(*this);
+    return *this;
+  }
+
+  /// Lock exclusively, blocking.
+  /// @throws std::system_error if no mutex is associated or a lock is
+  ///         already held (see lockable()).
+  void lock() {
+    lockable();
+    m->lock();
+    o = ownership::unique;
+  }
+
+  /// Lock in shared mode, blocking. Throws like lock().
+  void lock_shared() {
+    lockable();
+    m->lock_shared();
+    o = ownership::shared;
+  }
+
+  void lock(ceph::acquire_unique_t) {
+    lock();
+  }
+
+  void lock(ceph::acquire_shared_t) {
+    lock_shared();
+  }
+
+  /// Non-blocking exclusive attempt.
+  /// @return true if the lock was acquired. Throws like lock().
+  bool try_lock() {
+    lockable();
+    if (m->try_lock()) {
+      o = ownership::unique;
+      return true;
+    }
+    return false;
+  }
+
+  /// Non-blocking shared attempt.
+  /// @return true if the lock was acquired. Throws like lock().
+  bool try_lock_shared() {
+    lockable();
+    if (m->try_lock_shared()) {
+      o = ownership::shared;
+      return true;
+    }
+    return false;
+  }
+
+  bool try_lock(ceph::acquire_unique_t) {
+    return try_lock();
+  }
+
+  bool try_lock(ceph::acquire_shared_t) {
+    return try_lock_shared();
+  }
+
+  /// Exclusive attempt bounded by a duration; requires the mutex type to
+  /// provide try_lock_for().
+  template<typename Rep, typename Period>
+  bool try_lock_for(const std::chrono::duration<Rep, Period>& dur) {
+    lockable();
+    if (m->try_lock_for(dur)) {
+      o = ownership::unique;
+      return true;
+    }
+    return false;
+  }
+
+  /// Shared attempt bounded by a duration; requires the mutex type to
+  /// provide try_lock_shared_for().
+  template<typename Rep, typename Period>
+  bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& dur) {
+    lockable();
+    if (m->try_lock_shared_for(dur)) {
+      o = ownership::shared;
+      return true;
+    }
+    return false;
+  }
+
+  template<typename Rep, typename Period>
+  bool try_lock_for(ceph::acquire_unique_t,
+		    const std::chrono::duration<Rep, Period>& dur) {
+    return try_lock_for(dur);
+  }
+
+  template<typename Rep, typename Period>
+  bool try_lock_for(ceph::acquire_shared_t,
+		    const std::chrono::duration<Rep, Period>& dur) {
+    return try_lock_shared_for(dur);
+  }
+
+  /// Exclusive attempt bounded by a time point; requires the mutex type to
+  /// provide try_lock_until().
+  template<typename Clock, typename Duration>
+  bool try_lock_until(const std::chrono::time_point<Clock, Duration>& time) {
+    lockable();
+    if (m->try_lock_until(time)) {
+      o = ownership::unique;
+      return true;
+    }
+    return false;
+  }
+
+  /// Shared attempt bounded by a time point; requires the mutex type to
+  /// provide try_lock_shared_until().
+  template<typename Clock, typename Duration>
+  bool try_lock_shared_until(const std::chrono::time_point<Clock,
+			     Duration>& time) {
+    lockable();
+    if (m->try_lock_shared_until(time)) {
+      o = ownership::shared;
+      return true;
+    }
+    return false;
+  }
+
+  template<typename Clock, typename Duration>
+  bool try_lock_until(ceph::acquire_unique_t,
+		      const std::chrono::time_point<Clock, Duration>& time) {
+    return try_lock_until(time);
+  }
+
+  template<typename Clock, typename Duration>
+  bool try_lock_until(ceph::acquire_shared_t,
+		      const std::chrono::time_point<Clock, Duration>& time) {
+    return try_lock_shared_until(time);
+  }
+
+  // Only have a single unlock method. Otherwise we'd be building an
+  // Acme lock class suitable only for ravenous coyotes desperate to
+  // devour a road runner. It would be bad. It would be disgusting. It
+  // would be infelicitous as heck. It would leave our developers in a
+  // state of seeming safety unaware of the yawning chasm of failure
+  // that had opened beneath their feet that would soon transition
+  // into a sickening realization of the error they made and a brief
+  // moment of blinking self pity before their program hurled itself
+  // into undefined behaviour and plummeted up the stack with core
+  // dumps trailing behind it.
+
+  /// Release the lock in whichever mode it is held.
+  /// @throws std::system_error (resource_deadlock_would_occur) if no lock
+  ///         is held.
+  void unlock() {
+    switch (o) {
+    case ownership::none:
+      throw std::system_error(
+	static_cast<int>(std::errc::resource_deadlock_would_occur),
+	std::generic_category());
+      break;
+
+    case ownership::unique:
+      m->unlock();
+      break;
+
+    case ownership::shared:
+      m->unlock_shared();
+      break;
+    }
+    o = ownership::none;
+  }
+
+  // Setters
+
+  /// Exchange mutex and ownership state with u.
+  void swap(shunique_lock& u) noexcept {
+    std::swap(m, u.m);
+    std::swap(o, u.o);
+  }
+
+  /// Dissociate from the mutex WITHOUT unlocking it.
+  /// @return the previously associated mutex (may be null)
+  mutex_type* release() noexcept {
+    o = ownership::none;
+    mutex_type* tm = m;
+    m = nullptr;
+    return tm;
+  }
+
+  // Ideally I'd rather make a move constructor for std::unique_lock
+  // that took a shunique_lock, but obviously I can't.
+
+  /// Hand the mutex (and exclusive ownership, if held) over to a
+  /// std::unique_lock, leaving this object empty.
+  ///
+  /// @return an empty unique_lock if no mutex is associated, a deferred
+  ///         one if the mutex is not locked, an owning one otherwise
+  /// @throws std::system_error (operation_not_permitted) if the lock is
+  ///         held in shared mode; this object is left unchanged then.
+  unique_lock_type release_to_unique() {
+    // Must be tested first: with no mutex there is nothing to dereference.
+    if (m == nullptr) {
+      return unique_lock_type();
+    }
+    if (o == ownership::unique) {
+      o = ownership::none;
+      unique_lock_type tu(*m, std::adopt_lock);
+      m = nullptr;
+      return tu;
+    } else if (o == ownership::none) {
+      unique_lock_type tu(*m, std::defer_lock);
+      m = nullptr;
+      return tu;
+    }
+    throw std::system_error(
+      static_cast<int>(std::errc::operation_not_permitted),
+      std::generic_category());
+  }
+
+  /// Hand the mutex (and shared ownership, if held) over to a
+  /// std::shared_lock, leaving this object empty.
+  ///
+  /// @return an empty shared_lock if no mutex is associated, a deferred
+  ///         one if the mutex is not locked, an owning one otherwise
+  /// @throws std::system_error (operation_not_permitted) if the lock is
+  ///         held exclusively; this object is left unchanged then.
+  shared_lock_type release_to_shared() {
+    // Must be tested first: with no mutex there is nothing to dereference.
+    if (m == nullptr) {
+      return shared_lock_type();
+    }
+    if (o == ownership::shared) {
+      o = ownership::none;
+      shared_lock_type ts(*m, std::adopt_lock);
+      m = nullptr;
+      return ts;
+    } else if (o == ownership::none) {
+      shared_lock_type ts(*m, std::defer_lock);
+      m = nullptr;
+      return ts;
+    }
+    throw std::system_error(
+      static_cast<int>(std::errc::operation_not_permitted),
+      std::generic_category());
+  }
+
+  // Getters
+
+  // Note that this returns true only if the lock is held UNIQUE; it
+  // returns false for shared.
+  bool owns_lock() const noexcept {
+    return o == ownership::unique;
+  }
+
+  bool owns_lock_shared() const noexcept {
+    return o == ownership::shared;
+  }
+
+  // If you want to make sure you have a lock of some sort on the
+  // mutex, just treat as a bool.
+  explicit operator bool() const noexcept {
+    return o != ownership::none;
+  }
+
+  mutex_type* mutex() const noexcept {
+    return m;
+  }
+
+private:
+  /// Precondition check shared by all locking calls.
+  /// @throws std::system_error operation_not_permitted if no mutex is
+  ///         associated, resource_deadlock_would_occur if a lock is
+  ///         already held.
+  void lockable() const {
+    if (m == nullptr)
+      throw std::system_error(
+	static_cast<int>(std::errc::operation_not_permitted),
+	std::generic_category());
+    if (o != ownership::none)
+      throw std::system_error(
+	static_cast<int>(std::errc::resource_deadlock_would_occur),
+	std::generic_category());
+  }
+
+  mutex_type*	m;
+  enum struct ownership : uint8_t {
+    none, unique, shared
+      };
+  ownership o;
+};
+} // namespace ceph
+
+namespace std {
+  /// Swap the mutex/ownership state of two shunique_locks.
+  ///
+  /// The arguments are taken by reference: shunique_lock is not copyable,
+  /// and exchanging by-value parameters would only swap (and then destroy,
+  /// i.e. unlock) temporaries instead of the caller's objects.
+  template<typename Mutex>
+  void swap(ceph::shunique_lock<Mutex>& sh1,
+	    ceph::shunique_lock<Mutex>& sha) noexcept {
+    sh1.swap(sha);
+  }
+} // namespace std
+
+#endif // CEPH_COMMON_SHUNIQUE_LOCK_H
diff --git a/src/common/signal.cc b/src/common/signal.cc
new file mode 100644
index 000000000..97f13edfa
--- /dev/null
+++ b/src/common/signal.cc
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <cstdlib>
+#include <sstream>
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <signal.h>
+
+#include "common/BackTrace.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/signal.h"
+#include "common/perf_counters.h"
+
+#include "global/pidfile.h"
+
+using namespace std::literals;
+
+#ifndef _WIN32
+/// Render the calling thread's blocked-signal mask as text.
+///
+/// @return "show_signal_mask: { a, b, ... }" listing the numbers of the
+///         blocked signals, or "(pthread_sigmask failed)" if the mask
+///         could not be read.
+std::string signal_mask_to_str()
+{
+  sigset_t old_sigset;
+  // With a null `set` argument the mask is left unchanged and only the
+  // current one is reported, so the `how` value is irrelevant.
+  if (pthread_sigmask(SIG_SETMASK, nullptr, &old_sigset)) {
+    return "(pthread_sigmask failed)";
+  }
+
+  std::ostringstream oss;
+  oss << "show_signal_mask: { ";
+  const char *sep = "";
+  // Valid signal numbers start at 1; 0 is not a signal and must not be
+  // passed to sigismember().
+  for (int signum = 1; signum < NSIG; ++signum) {
+    if (sigismember(&old_sigset, signum) == 1) {
+      oss << sep << signum;
+      sep = ", ";
+    }
+  }
+  oss << " }";
+  return oss.str();
+}
+
+/*
+ * Block the signals listed in 'siglist' (a 0-terminated array) for the
+ * calling thread. If siglist == NULL, block all signals.
+ *
+ * The previous mask is handed back through 'old_sigset' by
+ * pthread_sigmask(); a failure of that call trips ceph_assert.
+ */
+void block_signals(const int *siglist, sigset_t *old_sigset)
+{
+  sigset_t sigset;
+  if (siglist == NULL) {
+    sigfillset(&sigset);
+  } else {
+    sigemptyset(&sigset);
+    for (const int *sig = siglist; *sig != 0; ++sig) {
+      sigaddset(&sigset, *sig);
+    }
+  }
+  int ret = pthread_sigmask(SIG_BLOCK, &sigset, old_sigset);
+  ceph_assert(ret == 0);
+}
+
+/* Install 'old_sigset' as the calling thread's signal mask; a failure of
+ * pthread_sigmask() trips ceph_assert. */
+void restore_sigset(const sigset_t *old_sigset)
+{
+  int rc = pthread_sigmask(SIG_SETMASK, old_sigset, NULL);
+  ceph_assert(rc == 0);
+}
+
+/* Unblock every signal except SIGKILL (which is removed from the set
+ * passed to pthread_sigmask) for the calling thread. The previous mask is
+ * handed back through 'old_sigset'; a failure trips ceph_assert. */
+void unblock_all_signals(sigset_t *old_sigset)
+{
+  sigset_t to_unblock;
+  sigfillset(&to_unblock);
+  sigdelset(&to_unblock, SIGKILL);
+
+  int rc = pthread_sigmask(SIG_UNBLOCK, &to_unblock, old_sigset);
+  ceph_assert(rc == 0);
+}
+#else
+// Windows build: no signal mask is queried; a fixed placeholder string is
+// returned instead.
+std::string signal_mask_to_str()
+{
+  return std::string("(unsupported signal)");
+}
+
+// Windows provides limited signal functionality, so the mask-manipulation
+// helpers are empty there and leave their arguments untouched.
+void block_signals(const int *siglist, sigset_t *old_sigset)
+{
+}
+
+void restore_sigset(const sigset_t *old_sigset)
+{
+}
+
+void unblock_all_signals(sigset_t *old_sigset)
+{
+}
+#endif /* _WIN32 */
diff --git a/src/common/signal.h b/src/common/signal.h
new file mode 100644
index 000000000..4b323de47
--- /dev/null
+++ b/src/common/signal.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_SIGNAL_H
+#define CEPH_COMMON_SIGNAL_H
+
+#include <signal.h>
+#include <string>
+
+// Returns a string showing the set of blocked signals for the calling thread.
+// Other threads may have a different set (the mask is a per-thread thing).
+extern std::string signal_mask_to_str();
+
+// Block a list of signals. If siglist == NULL, blocks all signals.
+// If not, the list is terminated with a 0 element.
+//
+// On success, stores the old set of blocked signals in old_sigset.
+// NOTE(review): the POSIX implementation asserts (ceph_assert) that
+// pthread_sigmask() succeeded rather than returning an invalid set.
+extern void block_signals(const int *siglist, sigset_t *old_sigset);
+
+// Restore the set of blocked signals to old_sigset.
+// NOTE(review): the POSIX implementation passes old_sigset straight to
+// pthread_sigmask() and asserts success; it performs no validity check.
+extern void restore_sigset(const sigset_t *old_sigset);
+
+// Unblock all signals. On success, stores the old set of blocked signals in
+// old_sigset.
+// NOTE(review): the POSIX implementation asserts (ceph_assert) that
+// pthread_sigmask() succeeded rather than returning an invalid set.
+extern void unblock_all_signals(sigset_t *old_sigset);
+
+#endif
diff --git a/src/common/simple_cache.hpp b/src/common/simple_cache.hpp
new file mode 100644
index 000000000..07d19a731
--- /dev/null
+++ b/src/common/simple_cache.hpp
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_SIMPLECACHE_H
+#define CEPH_SIMPLECACHE_H
+
+#include <list>
+#include <map>
+#include <unordered_map>
+#include <utility>
+
+#include "common/ceph_mutex.h"
+
+/**
+ * Mutex-protected LRU cache with an optional set of pinned entries.
+ *
+ * `lru` holds the cached key/value pairs, most recently used first;
+ * `contents` maps a key to its node in `lru`. The cache can be bounded by
+ * entry count (add/set_size) or by total value length
+ * (add_bytes/set_bytes, which require V::length()). `pinned` entries live
+ * outside the LRU and are never trimmed.
+ */
+template <class K, class V, class C = std::less<K>, class H = std::hash<K> >
+class SimpleLRU {
+  ceph::mutex lock = ceph::make_mutex("SimpleLRU::lock");
+  size_t max_size;
+  size_t max_bytes = 0;
+  size_t total_bytes = 0;
+  std::unordered_map<K, typename std::list<std::pair<K, V>>::iterator, H> contents;
+  std::list<std::pair<K, V> > lru;
+  std::map<K, V, C> pinned;
+
+  // Subtract n from total_bytes without wrapping below zero. Entries
+  // inserted through add() are never counted in total_bytes, so removing
+  // one of them must not underflow the counter.
+  void _sub_bytes(size_t n) {
+    total_bytes = (n > total_bytes) ? 0 : total_bytes - n;
+  }
+
+  // Evict least-recently-used entries until the entry count fits max_size.
+  void trim_cache() {
+    // The emptiness check keeps lru.back() well-defined.
+    while (contents.size() > max_size && !lru.empty()) {
+      contents.erase(lru.back().first);
+      lru.pop_back();
+    }
+  }
+
+  // Evict least-recently-used entries until total_bytes fits max_bytes.
+  void trim_cache_bytes() {
+    // Stop once nothing is left to evict, even if the byte counter is
+    // still above the limit; lru.back() on an empty list is undefined.
+    while (total_bytes > max_bytes && !lru.empty()) {
+      _sub_bytes(lru.back().second.length());
+      contents.erase(lru.back().first);
+      lru.pop_back();
+    }
+  }
+
+  // NOTE(review): neither _add nor _add_bytes removes an existing node for
+  // the same key; callers appear to be expected to add only absent keys --
+  // confirm.
+  void _add(K key, V&& value) {
+    lru.emplace_front(key, std::move(value)); // can't move key because we access it below
+    contents[key] = lru.begin();
+    trim_cache();
+  }
+
+  void _add_bytes(K key, V&& value) {
+    lru.emplace_front(key, std::move(value)); // can't move key because we access it below
+    contents[key] = lru.begin();
+    trim_cache_bytes();
+  }
+
+public:
+  /// @param max_size maximum number of cached (non-pinned) entries
+  SimpleLRU(size_t max_size) : max_size(max_size) {
+    contents.rehash(max_size);
+  }
+
+  /// Store val under key in the pinned map (no effect if key is already
+  /// pinned).
+  void pin(K key, V val) {
+    std::lock_guard l(lock);
+    pinned.emplace(std::move(key), std::move(val));
+  }
+
+  /// Unpin every pinned entry whose key is <= e. Entries not yet cached
+  /// are moved into the LRU; for keys already cached, the cached entry is
+  /// promoted to most-recently-used and the pinned value is dropped.
+  void clear_pinned(K e) {
+    std::lock_guard l(lock);
+    for (auto i = pinned.begin();
+	 i != pinned.end() && i->first <= e;
+	 pinned.erase(i++)) {
+      auto iter = contents.find(i->first);
+      if (iter == contents.end())
+	_add(i->first, std::move(i->second));
+      else
+	lru.splice(lru.begin(), lru, iter->second);
+    }
+  }
+
+  /// Remove key from the cache (pinned entries are not touched).
+  void clear(K key) {
+    std::lock_guard l(lock);
+    auto i = contents.find(key);
+    if (i == contents.end())
+      return;
+    _sub_bytes(i->second->second.length());
+    lru.erase(i->second);
+    contents.erase(i);
+  }
+
+  /// Change the entry-count limit and trim to it.
+  void set_size(size_t new_size) {
+    std::lock_guard l(lock);
+    max_size = new_size;
+    trim_cache();
+  }
+
+  /// @return the number of cached (non-pinned) entries
+  size_t get_size() {
+    std::lock_guard l(lock);
+    return contents.size();
+  }
+
+  /// Change the byte limit and trim to it.
+  void set_bytes(size_t num_bytes) {
+    std::lock_guard l(lock);
+    max_bytes = num_bytes;
+    trim_cache_bytes();
+  }
+
+  /// @return the byte total accounted for entries added via add_bytes()
+  size_t get_bytes() {
+    std::lock_guard l(lock);
+    return total_bytes;
+  }
+
+  /// Look key up in the cache, then in the pinned map.
+  /// @param out receives a copy of the value on success
+  /// @return true if found; a cache hit also promotes the entry to
+  ///         most-recently-used
+  bool lookup(K key, V *out) {
+    std::lock_guard l(lock);
+    auto i = contents.find(key);
+    if (i != contents.end()) {
+      *out = i->second->second;
+      lru.splice(lru.begin(), lru, i->second);
+      return true;
+    }
+    auto i_pinned = pinned.find(key);
+    if (i_pinned != pinned.end()) {
+      *out = i_pinned->second;
+      return true;
+    }
+    return false;
+  }
+
+  /// Insert as most-recently-used and trim by entry count.
+  void add(K key, V value) {
+    std::lock_guard l(lock);
+    _add(std::move(key), std::move(value));
+  }
+
+  /// Insert as most-recently-used, account value.length() bytes and trim
+  /// by byte total.
+  void add_bytes(K key, V value) {
+    std::lock_guard l(lock);
+    total_bytes += value.length();
+    _add_bytes(std::move(key), std::move(value));
+  }
+};
+
+#endif
diff --git a/src/common/snap_types.cc b/src/common/snap_types.cc
new file mode 100644
index 000000000..dc634907b
--- /dev/null
+++ b/src/common/snap_types.cc
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "snap_types.h"
+#include "common/Formatter.h"
+
+// Serialize the header followed by both snap vectors. The vectors are
+// written without a length prefix (encode_nohead); their sizes travel in
+// the header instead, which is refreshed here first.
+void SnapRealmInfo::encode(ceph::buffer::list& bl) const
+{
+  using ceph::encode;
+
+  h.num_prior_parent_snaps = prior_parent_snaps.size();
+  h.num_snaps = my_snaps.size();
+  encode(h, bl);
+
+  ceph::encode_nohead(my_snaps, bl);
+  ceph::encode_nohead(prior_parent_snaps, bl);
+}
+
+// Read the header, then as many elements of each snap vector as the
+// header's counters announce.
+void SnapRealmInfo::decode(ceph::buffer::list::const_iterator& bl)
+{
+  using ceph::decode;
+
+  decode(h, bl);
+
+  ceph::decode_nohead(h.num_snaps, my_snaps, bl);
+  ceph::decode_nohead(h.num_prior_parent_snaps, prior_parent_snaps, bl);
+}
+
+// Emit the header fields followed by the "snaps" and
+// "prior_parent_snaps" arrays through the formatter.
+void SnapRealmInfo::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("ino", ino());
+  f->dump_unsigned("parent", parent());
+  f->dump_unsigned("seq", seq());
+  f->dump_unsigned("parent_since", parent_since());
+  f->dump_unsigned("created", created());
+
+  f->open_array_section("snaps");
+  for (const auto& snap : my_snaps) {
+    f->dump_unsigned("snap", snap);
+  }
+  f->close_section();
+
+  f->open_array_section("prior_parent_snaps");
+  for (const auto& snap : prior_parent_snaps) {
+    f->dump_unsigned("snap", snap);
+  }
+  f->close_section();
+}
+
+// Append four heap-allocated sample objects to o: a default one, one with
+// only header fields set, one with a single snap, and one with a snap
+// plus two prior parent snaps.
+void SnapRealmInfo::generate_test_instances(std::list<SnapRealmInfo*>& o)
+{
+  o.push_back(new SnapRealmInfo);
+  o.push_back(new SnapRealmInfo(1, 10, 10, 0));
+
+  o.push_back(new SnapRealmInfo(1, 10, 10, 0));
+  SnapRealmInfo *with_snap = o.back();
+  with_snap->my_snaps.push_back(10);
+
+  o.push_back(new SnapRealmInfo(1, 10, 10, 5));
+  SnapRealmInfo *with_parents = o.back();
+  with_parents->my_snaps.push_back(10);
+  with_parents->prior_parent_snaps.push_back(3);
+  with_parents->prior_parent_snaps.push_back(5);
+}
+
+// -- "new" SnapRealmInfo --
+
+// Versioned encoding: the payload is framed by ENCODE_START/ENCODE_FINISH
+// and consists of the embedded SnapRealmInfo, last_modified and
+// change_attr, in that order.
+void SnapRealmInfoNew::encode(ceph::buffer::list& bl) const
+{
+  using ceph::encode;
+  ENCODE_START(1, 1, bl);
+  encode(info, bl);
+  encode(last_modified, bl);
+  encode(change_attr, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Counterpart of encode(): reads info, last_modified and change_attr, in
+// that order, inside a DECODE_START/DECODE_FINISH frame.
+void SnapRealmInfoNew::decode(ceph::buffer::list::const_iterator& bl)
+{
+  using ceph::decode;
+  DECODE_START(1, bl);
+  decode(info, bl);
+  decode(last_modified, bl);
+  decode(change_attr, bl);
+  DECODE_FINISH(bl);
+}
+
+// Dump the embedded SnapRealmInfo first, then the two extra fields.
+void SnapRealmInfoNew::dump(ceph::Formatter *f) const
+{
+  info.dump(f);
+  f->dump_stream("last_modified") << last_modified;
+  f->dump_unsigned("change_attr", change_attr);
+}
+
+// Append four heap-allocated sample objects to o, mirroring the samples
+// produced for SnapRealmInfo with change_attr values 0, 1 and 2 on the
+// explicitly constructed ones.
+void SnapRealmInfoNew::generate_test_instances(std::list<SnapRealmInfoNew*>& o)
+{
+  o.push_back(new SnapRealmInfoNew);
+  o.push_back(new SnapRealmInfoNew(SnapRealmInfo(1, 10, 10, 0), utime_t(), 0));
+
+  o.push_back(new SnapRealmInfoNew(SnapRealmInfo(1, 10, 10, 0), utime_t(), 1));
+  SnapRealmInfoNew *with_snap = o.back();
+  with_snap->info.my_snaps.push_back(10);
+
+  o.push_back(new SnapRealmInfoNew(SnapRealmInfo(1, 10, 10, 5), utime_t(), 2));
+  SnapRealmInfoNew *with_parents = o.back();
+  with_parents->info.my_snaps.push_back(10);
+  with_parents->info.prior_parent_snaps.push_back(3);
+  with_parents->info.prior_parent_snaps.push_back(5);
+}
+
+// -----
+
+// A context is valid when seq does not exceed CEPH_MAXSNAP, the first
+// snap does not exceed seq, and the snaps are strictly descending with no
+// element following a 0.
+bool SnapContext::is_valid() const
+{
+  // seq is a valid snapid
+  if (seq > CEPH_MAXSNAP)
+    return false;
+  if (snaps.empty())
+    return true;
+
+  // seq >= snaps[0]
+  if (snaps[0] > seq)
+    return false;
+
+  // snaps[] is descending
+  snapid_t prev = snaps[0];
+  for (unsigned i = 1; i < snaps.size(); ++i) {
+    if (snaps[i] >= prev || prev == 0)
+      return false;
+    prev = snaps[i];
+  }
+  return true;
+}
+
+// Emit seq and the "snaps" array through the formatter.
+void SnapContext::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("seq", seq);
+
+  f->open_array_section("snaps");
+  for (const auto& snap : snaps) {
+    f->dump_unsigned("snap", snap);
+  }
+  f->close_section();
+}
+
+// Append three heap-allocated sample objects to o: a default one, one
+// with seq 10 and no snaps, and one with seq 20 and snaps {18, 3, 1}.
+void SnapContext::generate_test_instances(std::list<SnapContext*>& o)
+{
+  o.push_back(new SnapContext);
+
+  std::vector<snapid_t> sample_snaps;
+  o.push_back(new SnapContext(10, sample_snaps));
+
+  sample_snaps.push_back(18);
+  sample_snaps.push_back(3);
+  sample_snaps.push_back(1);
+  o.push_back(new SnapContext(20, sample_snaps));
+}
diff --git a/src/common/snap_types.h b/src/common/snap_types.h
new file mode 100644
index 000000000..d87e763c4
--- /dev/null
+++ b/src/common/snap_types.h
@@ -0,0 +1,111 @@
+#ifndef __CEPH_SNAP_TYPES_H
+#define __CEPH_SNAP_TYPES_H
+
+#include "include/types.h"
+#include "include/utime.h"
+#include "include/fs_types.h"
+
+namespace ceph {
+class Formatter;
+}
+
+// Snap realm description: a ceph_mds_snap_realm header plus the realm's
+// own snaps and the parent's snaps taken before parent_since.
+struct SnapRealmInfo {
+  // mutable: encode() (a const method) refreshes the element counters
+  // stored in the header.
+  mutable ceph_mds_snap_realm h;
+  std::vector<snapid_t> my_snaps;
+  std::vector<snapid_t> prior_parent_snaps;  // before parent_since
+
+  // Start with an all-zero header.
+  SnapRealmInfo() {
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(&h, 0, sizeof(h));
+  }
+
+  // Zero the header (via the default constructor), then fill in the
+  // identifying fields.
+  SnapRealmInfo(inodeno_t ino_, snapid_t created_, snapid_t seq_, snapid_t current_parent_since_)
+    : SnapRealmInfo() {
+    h.ino = ino_;
+    h.created = created_;
+    h.seq = seq_;
+    h.parent_since = current_parent_since_;
+  }
+
+  // Typed accessors for the raw header fields.
+  inodeno_t ino() const {
+    return inodeno_t(h.ino);
+  }
+  inodeno_t parent() const {
+    return inodeno_t(h.parent);
+  }
+  snapid_t seq() const {
+    return snapid_t(h.seq);
+  }
+  snapid_t parent_since() const {
+    return snapid_t(h.parent_since);
+  }
+  snapid_t created() const {
+    return snapid_t(h.created);
+  }
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<SnapRealmInfo*>& o);
+};
+WRITE_CLASS_ENCODER(SnapRealmInfo)
+
+// "new" snap realm info - carries additional metadata (last modified,
+// change_attr) and is version encoded.
+struct SnapRealmInfoNew {
+  SnapRealmInfo info;
+  utime_t last_modified;
+  // Initialized here so that a default-constructed object never exposes
+  // an indeterminate value to encode()/dump().
+  uint64_t change_attr = 0;
+
+  SnapRealmInfoNew() {
+  }
+
+  SnapRealmInfoNew(const SnapRealmInfo &info_, utime_t last_modified_, uint64_t change_attr_)
+    : info(info_),
+      last_modified(last_modified_),
+      change_attr(change_attr_) {
+  }
+
+  // Typed accessors forwarding to the embedded SnapRealmInfo header.
+  inodeno_t ino() const { return inodeno_t(info.h.ino); }
+  inodeno_t parent() const { return inodeno_t(info.h.parent); }
+  snapid_t seq() const { return snapid_t(info.h.seq); }
+  snapid_t parent_since() const { return snapid_t(info.h.parent_since); }
+  snapid_t created() const { return snapid_t(info.h.created); }
+
+  void encode(ceph::buffer::list& bl) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<SnapRealmInfoNew*>& o);
+};
+WRITE_CLASS_ENCODER(SnapRealmInfoNew)
+
+// A snapshot context: a sequence number plus the list of existing snaps.
+struct SnapContext {
+  snapid_t seq;            // 'time' stamp
+  std::vector<snapid_t> snaps;  // existent snaps, in descending order
+
+  SnapContext() {}
+  SnapContext(snapid_t s, const std::vector<snapid_t>& v)
+    : seq(s),
+      snaps(v) {
+  }
+
+  // See the definition for the exact checks performed.
+  bool is_valid() const;
+
+  // Reset to seq 0 with no snaps.
+  void clear() {
+    seq = 0;
+    snaps.clear();
+  }
+
+  // Emptiness is decided by seq alone; snaps is not inspected.
+  bool empty() const {
+    return seq == 0;
+  }
+
+  // Wire format: seq followed by snaps.
+  void encode(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(seq, bl);
+    encode(snaps, bl);
+  }
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    using ceph::decode;
+    decode(seq, bl);
+    decode(snaps, bl);
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<SnapContext*>& o);
+};
+WRITE_CLASS_ENCODER(SnapContext)
+
+// Stream a SnapContext as "<seq>=<snaps>".
+inline std::ostream& operator<<(std::ostream& out, const SnapContext& snapc) {
+  out << snapc.seq << "=" << snapc.snaps;
+  return out;
+}
+
+// For fmt versions 9.0.0 and later (FMT_VERSION >= 90000), explicitly
+// route SnapContext formatting through fmt's ostream_formatter.
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<SnapContext> : fmt::ostream_formatter {};
+#endif
+
+#endif
diff --git a/src/common/solaris_errno.cc b/src/common/solaris_errno.cc
new file mode 100644
index 000000000..bb68e68fa
--- /dev/null
+++ b/src/common/solaris_errno.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/types.h"
+
+
+// Converts a negative linux errno value to the host value: codes below -34 are remapped by number to the host's E* macros, every other value is returned unchanged.
+__s32 ceph_to_hostos_errno(__s32 r) 
+{
+  if (r < -34) {  // values from -34 upwards are never remapped
+    switch (r) {
+      case -35:
+        return -EDEADLK;
+      case -36:
+        return -ENAMETOOLONG;
+      case -37:
+        return -ENOLCK;
+      case -38:
+        return -ENOSYS;
+      case -39:
+        return -ENOTEMPTY;
+      case -40:
+        return -ELOOP;
+      case -42:  // there is no case for -41; it is returned unchanged
+        return -ENOMSG;
+      case -43:
+        return -EIDRM;
+      case -44:
+        return -ECHRNG;
+      case -45:
+        return -EL2NSYNC;
+      case -46:
+        return -EL3HLT;
+      case -47:
+        return -EL3RST;
+      case -48:
+        return -ELNRNG;
+      case -49:
+        return -EUNATCH;
+      case -50:
+        return -ENOCSI;
+      case -51:
+        return -EL2HLT;
+      case -52:
+        return -EBADE;
+      case -53:
+        return -EBADR;
+      case -54:
+        return -EXFULL;
+      case -55:
+        return -ENOANO;
+      case -56:
+        return -EBADRQC;
+      case -57:
+        return -EBADSLT;
+      case -59:  // there is no case for -58; it is returned unchanged
+        return -EBFONT;
+      case -60:
+        return -ENOSTR;
+      case -61:
+        return -ENODATA;
+      case -62:
+        return -ETIME;
+      case -63:
+        return -ENOSR;
+      //case -64:
+      //  return -EPERM; //TODO ENONET
+      //case -65:
+      //  return -EPERM; //TODO ENOPKG
+      //case -66:
+      //  return -EREMOTE;
+      //case -67:
+      //  return -ENOLINK;
+      //case -68:
+      //  return -EPERM; //TODO EADV 
+      //case -69:
+      //  return -EPERM; //TODO ESRMNT 
+      //case -70:
+      //  return -EPERM; //TODO ECOMM
+      case -71:  // -64..-70 are commented out above, so they are returned unchanged
+        return -EPROTO;
+      case -72:
+        return -EMULTIHOP;
+      case -73:
+        return -EPERM; //TODO EDOTDOT 
+      case -74:
+        return -EBADMSG;
+      case -75:
+        return -EOVERFLOW;
+      case -76:
+        return -ENOTUNIQ;
+      case -77:
+        return -EBADFD;
+      case -78:
+        return -EREMCHG;
+      case -79:
+        return -ELIBACC;
+      case -80:
+        return -ELIBBAD;
+      case -81:
+        return -ELIBSCN;
+      case -82:
+        return -ELIBMAX;
+      case -83:
+	return -ELIBEXEC;
+      case -84:
+        return -EILSEQ;
+      case -85:
+        return -ERESTART;
+      case -86:
+        return -ESTRPIPE; 
+      case -87:
+        return -EUSERS;
+      case -88:
+        return -ENOTSOCK;
+      case -89:
+        return -EDESTADDRREQ;
+      case -90:
+        return -EMSGSIZE;
+      case -91:
+        return -EPROTOTYPE;
+      case -92:
+        return -ENOPROTOOPT;
+      case -93:
+        return -EPROTONOSUPPORT;
+      case -94:
+        return -ESOCKTNOSUPPORT;
+      case -95:
+        return -EOPNOTSUPP;
+      case -96:
+        return -EPFNOSUPPORT;
+      case -97:
+        return -EAFNOSUPPORT;
+      case -98:
+        return -EADDRINUSE;
+      case -99:
+        return -EADDRNOTAVAIL;
+      case -100:
+        return -ENETDOWN;
+      case -101:
+        return -ENETUNREACH;
+      case -102:
+        return -ENETRESET;
+      case -103:
+        return -ECONNABORTED;
+      case -104:
+        return -ECONNRESET;
+      case -105:
+        return -ENOBUFS;
+      case -106:
+        return -EISCONN;
+      case -107:
+        return -ENOTCONN;
+      case -108:
+        return -ESHUTDOWN;
+      case -109:
+        return -ETOOMANYREFS;
+      case -110:
+        return -ETIMEDOUT;
+      case -111:
+        return -ECONNREFUSED;
+      case -112:
+        return -EHOSTDOWN;
+      case -113:
+        return -EHOSTUNREACH;
+      case -114:
+        return -EALREADY;
+      case -115:
+        return -EINPROGRESS;
+      case -116:
+        return -ESTALE;
+      case -117:
+        return -EPERM; //TODO EUCLEAN 
+      case -118:
+        return -EPERM; //TODO ENOTNAM
+      case -119:
+        return -EPERM; //TODO ENAVAIL
+      case -120:
+        return -EPERM; //TODO EISNAM
+      case -121:
+        return -EPERM; //TODO EREMOTEIO
+      case -122:
+        return -EDQUOT;
+      case -123:
+        return -EPERM; //TODO ENOMEDIUM
+      case -124:
+        return -EPERM; //TODO EMEDIUMTYPE - not used
+      case -125:
+        return -ECANCELED;
+      case -126:
+        return -EPERM; //TODO ENOKEY
+      case -127:
+        return -EPERM; //TODO EKEYEXPIRED
+      case -128:
+        return -EPERM; //TODO EKEYREVOKED
+      case -129:
+        return -EPERM; //TODO EKEYREJECTED
+      case -130:
+        return -EOWNERDEAD;
+      case -131:
+        return -ENOTRECOVERABLE;
+      case -132:
+        return -EPERM; //TODO ERFKILL
+      case -133:
+        return -EPERM; //TODO EHWPOISON
+
+      default: { // any code without a case above leaves the switch and is returned as-is
+        break;
+      }
+    }
+  } 
+  return r; // otherwise return original value
+}
+
+// converts Host OS errno values to linux/Ceph values
+// XXX Currently not worked out: the value is returned unchanged, so no translation happens in this direction
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  return r;
+}
+
+
diff --git a/src/common/split.h b/src/common/split.h
new file mode 100644
index 000000000..1b12963ce
--- /dev/null
+++ b/src/common/split.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <string_view>
+
+namespace ceph {
+
+// a forward iterator over the parts of a split string; runs of delimiter characters are skipped, so no empty parts are produced
+class spliterator {
+  std::string_view str; // full string
+  std::string_view delims; // delimiters
+
+  using size_type = std::string_view::size_type;
+  size_type pos = 0; // start position of current part
+  std::string_view part; // view of current part
+
+  // return the next part after the given position
+  std::string_view next(size_type end) {
+    pos = str.find_first_not_of(delims, end); // skip any delimiters at or after 'end'
+    if (pos == str.npos) {
+      return {}; // nothing left: an empty, default-constructed view marks the end state
+    }
+    return str.substr(pos, str.find_first_of(delims, pos) - pos); // when no delimiter follows, substr clamps the count to the rest of str
+  }
+ public:
+  // types required by std::iterator_traits
+  using difference_type = int;
+  using value_type = std::string_view;
+  using pointer = const value_type*;
+  using reference = const value_type&;
+  using iterator_category = std::forward_iterator_tag;
+
+  spliterator() = default; // all views empty; compares equal to an exhausted iterator
+
+  spliterator(std::string_view str, std::string_view delims)
+    : str(str), delims(delims), pos(0), part(next(0)) // part is declared last, so str/delims/pos are already set when next(0) runs
+  {}
+
+  spliterator& operator++() {
+    part = next(pos + part.size()); // resume the search just past the current part
+    return *this;
+  }
+  spliterator operator++(int) {
+    spliterator tmp = *this; // copy to return the pre-increment state
+    part = next(pos + part.size());
+    return tmp;
+  }
+
+  reference operator*() const { return part; }
+  pointer operator->() const { return &part; }
+
+  friend bool operator==(const spliterator& lhs, const spliterator& rhs) { // equal when the current parts have the same data pointer and size; str and delims are not compared
+    return lhs.part.data() == rhs.part.data()
+        && lhs.part.size() == rhs.part.size();
+  }
+  friend bool operator!=(const spliterator& lhs, const spliterator& rhs) { // exact negation of operator==
+    return lhs.part.data() != rhs.part.data()
+        || lhs.part.size() != rhs.part.size();
+  }
+};
+
+// represents an immutable range of split string parts
+//
+// ranged-for loop example:
+//
+//   for (std::string_view s : split(input)) {
+//     ...
+//
+// container initialization example:
+//
+//   auto parts = split(input);
+//
+//   std::vector<std::string> strings;
+//   strings.assign(parts.begin(), parts.end());
+//
+class split { // range whose begin()/end() yield spliterators over str
+  std::string_view str; // full string
+  std::string_view delims; // delimiters
+ public:
+  split(std::string_view str, std::string_view delims = ";,= \t\n") // both arguments are stored as views, not copied
+    : str(str), delims(delims) {}
+
+  using iterator = spliterator;
+  using const_iterator = spliterator;
+
+  iterator begin() const { return {str, delims}; } // positioned on the first part, if any
+  const_iterator cbegin() const { return {str, delims}; }
+
+  iterator end() const { return {}; } // default-constructed spliterator acts as the end sentinel
+  const_iterator cend() const { return {}; }
+};
+
+} // namespace ceph
diff --git a/src/common/sstring.hh b/src/common/sstring.hh
new file mode 100644
index 000000000..b0fcd9b5c
--- /dev/null
+++ b/src/common/sstring.hh
@@ -0,0 +1,717 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2014 Cloudius Systems
+ */
+/*
+ * C++2014 dependencies removed.  String views are expressed with
+ * std::string_view.  Matt Benjamin <mbenjamin@redhat.com>
+ */
+
+#ifndef SSTRING_HH_
+#define SSTRING_HH_
+
+#include <string_view>
+#include <type_traits>
+
+#include "include/buffer.h"
+#include "include/denc.h"
+
+template <typename char_type, typename Size, Size max_size>
+class basic_sstring;
+
+using sstring = basic_sstring<char, uint32_t, 15>;
+
+template <typename string_type = sstring, typename T>
+inline string_type to_sstring(T value);
+
+template <typename char_type, typename Size, Size max_size> // string that keeps up to max_size-1 characters inline and uses malloc'ed storage beyond that
+class basic_sstring {
+    static_assert(
+            (std::is_same<char_type, char>::value
+             || std::is_same<char_type, signed char>::value
+             || std::is_same<char_type, unsigned char>::value),
+            "basic_sstring only supports single byte char types");
+    union contents { // the two representations share storage; internal.size tells them apart
+        struct external_type {
+            char_type* str;
+            Size size;
+            int8_t pad;
+        } external;
+        struct internal_type {
+            char_type str[max_size];
+            int8_t size; // length when >= 0; the constructors store -1 here for external storage
+        } internal;
+        static_assert(sizeof(external_type) <= sizeof(internal_type), "max_size too small");
+        static_assert(max_size <= 127, "max_size too large");
+    } u;
+    bool is_internal() const noexcept {
+        return u.internal.size >= 0;
+    }
+    bool is_external() const noexcept {
+        return !is_internal();
+    }
+    const char_type* str() const { // pointer to the first character in whichever representation is active
+        return is_internal() ? u.internal.str : u.external.str;
+    }
+    char_type* str() {
+        return is_internal() ? u.internal.str : u.external.str;
+    }
+
+    template <typename string_type, typename T>
+    static inline string_type to_sstring_sprintf(T value, const char* fmt) { // formats value with fmt into a stack buffer and builds a string_type from it
+        char tmp[sizeof(value) * 3 + 2]; // NOTE(review): sprintf below is unbounded; confirm this size covers every format used by the overloads
+        auto len = std::sprintf(tmp, fmt, value);
+        using ch_type = typename string_type::value_type;
+        return string_type(reinterpret_cast<ch_type*>(tmp), len);
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(int value) { // the overloads below only choose the printf format for their argument type
+        return to_sstring_sprintf<string_type>(value, "%d");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(unsigned value) {
+        return to_sstring_sprintf<string_type>(value, "%u");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(long value) {
+        return to_sstring_sprintf<string_type>(value, "%ld");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(unsigned long value) {
+        return to_sstring_sprintf<string_type>(value, "%lu");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(long long value) {
+        return to_sstring_sprintf<string_type>(value, "%lld");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(unsigned long long value) {
+        return to_sstring_sprintf<string_type>(value, "%llu");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(float value) {
+        return to_sstring_sprintf<string_type>(value, "%g");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(double value) {
+        return to_sstring_sprintf<string_type>(value, "%g");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(long double value) {
+        return to_sstring_sprintf<string_type>(value, "%Lg");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(const char* value) {
+        return string_type(value);
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(sstring value) {
+        return value;
+    }
+
+public:
+    using value_type = char_type;
+    using traits_type = std::char_traits<char_type>;
+    using allocator_type = std::allocator<char_type>;
+    using reference = char_type&;
+    using const_reference = const char_type&;
+    using pointer = char_type*;
+    using const_pointer = const char_type*;
+    using iterator = char_type*;
+    using const_iterator = const char_type*;
+    // FIXME: add reverse_iterator and friend
+    using difference_type = ssize_t;  // std::make_signed_t<Size> can be too small
+    using size_type = Size;
+    static constexpr size_type  npos = static_cast<size_type>(-1);
+public:
+    struct initialized_later {}; // tag selecting the constructor that sizes the string but leaves the characters unset
+
+    basic_sstring() noexcept { // empty internal string
+        u.internal.size = 0;
+        u.internal.str[0] = '\0';
+    }
+    basic_sstring(const basic_sstring& x) { // deep copy: an external source gets its own malloc'ed buffer
+        if (x.is_internal()) {
+            u.internal = x.u.internal;
+        } else {
+            u.internal.size = -1;
+            u.external.str = reinterpret_cast<char_type*>(std::malloc(x.u.external.size + 1));
+            if (!u.external.str) {
+                throw std::bad_alloc();
+            }
+            std::copy(x.u.external.str, x.u.external.str + x.u.external.size + 1, u.external.str); // the +1 copies the terminating NUL too
+            u.external.size = x.u.external.size;
+        }
+    }
+    basic_sstring(basic_sstring&& x) noexcept { // takes over x's representation and leaves x as an empty internal string
+        u = x.u;
+        x.u.internal.size = 0;
+        x.u.internal.str[0] = '\0';
+    }
+    basic_sstring(initialized_later, size_t size) { // allocates room for size characters plus NUL; only the terminator is written
+        if (size_type(size) != size) { // size is not representable in size_type
+            throw std::overflow_error("sstring overflow");
+        }
+        if (size + 1 <= sizeof(u.internal.str)) { // fits inline including the terminator
+            u.internal.str[size] = '\0';
+            u.internal.size = size;
+        } else {
+            u.internal.size = -1;
+            u.external.str = reinterpret_cast<char_type*>(std::malloc(size + 1));
+            if (!u.external.str) {
+                throw std::bad_alloc();
+            }
+            u.external.size = size;
+            u.external.str[size] = '\0';
+        }
+    }
+    basic_sstring(const char_type* x, size_t size) { // copies size characters from x and NUL-terminates
+        if (size_type(size) != size) {
+            throw std::overflow_error("sstring overflow");
+        }
+        if (size + 1 <= sizeof(u.internal.str)) {
+            std::copy(x, x + size, u.internal.str);
+            u.internal.str[size] = '\0';
+            u.internal.size = size;
+        } else {
+            u.internal.size = -1;
+            u.external.str = reinterpret_cast<char_type*>(std::malloc(size + 1));
+            if (!u.external.str) {
+                throw std::bad_alloc();
+            }
+            u.external.size = size;
+            std::copy(x, x + size, u.external.str);
+            u.external.str[size] = '\0';
+        }
+    }
+
+    basic_sstring(size_t size, char_type x) : basic_sstring(initialized_later(), size) { // size copies of x
+        memset(begin(), x, size);
+    }
+
+    basic_sstring(const char* x) : basic_sstring(reinterpret_cast<const char_type*>(x), std::strlen(x)) {}
+    basic_sstring(std::basic_string<char_type>& x) : basic_sstring(x.c_str(), x.size()) {}
+    basic_sstring(std::initializer_list<char_type> x) : basic_sstring(x.begin(), x.end() - x.begin()) {}
+    basic_sstring(const char_type* b, const char_type* e) : basic_sstring(b, e - b) {}
+    basic_sstring(const std::basic_string<char_type>& s)
+        : basic_sstring(s.data(), s.size()) {}
+    template <typename InputIterator>
+    basic_sstring(InputIterator first, InputIterator last)
+            : basic_sstring(initialized_later(), std::distance(first, last)) { // the range is traversed twice: once by distance, once by copy
+        std::copy(first, last, begin());
+    }
+    ~basic_sstring() noexcept {
+        if (is_external()) { // only the external representation owns heap memory
+            std::free(u.external.str);
+        }
+    }
+    basic_sstring& operator=(const basic_sstring& x) { // copy then swap; tmp's destructor releases the old contents
+        basic_sstring tmp(x);
+        swap(tmp);
+        return *this;
+    }
+    basic_sstring& operator=(basic_sstring&& x) noexcept {
+        if (this != &x) {
+            swap(x);
+            x.reset(); // frees what used to be this string's storage and leaves x empty
+        }
+        return *this;
+    }
+    operator std::basic_string<char_type>() const {
+        return { str(), size() };
+    }
+    size_t size() const noexcept {
+        return is_internal() ? u.internal.size : u.external.size;
+    }
+
+    size_t length() const noexcept {
+        return size();
+    }
+
+    size_t find(char_type t, size_t pos = 0) const noexcept { // index of the first t at or after pos, or npos
+        const char_type* it = str() + pos;
+        const char_type* end = str() + size();
+        while (it < end) {
+            if (*it == t) {
+                return it - str();
+            }
+            it++;
+        }
+        return npos;
+    }
+
+    size_t find(const basic_sstring& s, size_t pos = 0) const noexcept { // index of the first occurrence of s at or after pos, or npos
+        const char_type* it = str() + pos;
+        const char_type* end = str() + size();
+        const char_type* c_str = s.str();
+        const char_type* c_str_end = s.str() + s.size();
+
+        while (it < end) { // NOTE(review): an empty s is matched only while pos < size(); with pos >= size() the result is npos
+            auto i = it;
+            auto j = c_str;
+            while ( i < end && j < c_str_end && *i == *j) {
+                i++;
+                j++;
+            }
+            if (j == c_str_end) { // every character of s matched starting at 'it'
+                return it - str();
+            }
+            it++;
+        }
+        return npos;
+    }
+
+    /**
+     * find_last_of find the last occurrence of c in the string.
+     * When pos is specified, the search only includes characters
+     * at or before position pos.
+     *
+     */
+    size_t find_last_of (char_type c, size_t pos = npos) const noexcept {
+        const char_type* str_start = str();
+        if (size()) {
+            if (pos >= size()) { // clamp pos to the last character
+                pos = size() - 1;
+            }
+            const char_type* p = str_start + pos + 1;
+            do { // scan backwards from pos down to index 0
+                p--;
+                if (*p == c) {
+                    return (p - str_start);
+                }
+            } while (p != str_start);
+        }
+        return npos;
+    }
+
+    /**
+     *  Append a C substring.
+     *  @param s  The C string to append.
+     *  @param n  The number of characters to append.
+     *  @return  Reference to this string.
+     */
+    basic_sstring& append (const char_type* s, size_t n) {
+        basic_sstring ret(initialized_later(), size() + n); // always builds a new string, then moves it into *this
+        std::copy(begin(), end(), ret.begin());
+        std::copy(s, s + n, ret.begin() + size());
+        *this = std::move(ret);
+        return *this;
+    }
+
+    /**
+     *  Replace characters with a value of a C style substring.
+     *
+     */
+    basic_sstring& replace(size_type pos, size_type n1, const char_type* s,
+             size_type n2) {
+        if (pos > size()) {
+            throw std::out_of_range("sstring::replace out of range");
+        }
+
+        if (n1 > size() - pos) { // clamp the replaced span to the end of the string
+            n1 = size() - pos;
+        }
+
+        if (n1 == n2) { // same length: overwrite in place, no reallocation
+            if (n2) {
+                std::copy(s, s + n2, begin() + pos);
+            }
+            return *this;
+        }
+        basic_sstring ret(initialized_later(), size() + n2 - n1);
+        char_type* p= ret.begin();
+        std::copy(begin(), begin() + pos, p); // prefix before pos
+        p += pos;
+        if (n2) { // s is not touched when n2 == 0 (erase() passes nullptr)
+            std::copy(s, s + n2, p);
+        }
+        p += n2;
+        std::copy(begin() + pos + n1, end(), p); // suffix after the replaced span
+        *this = std::move(ret);
+        return *this;
+    }
+
+    template <class InputIterator>
+    basic_sstring& replace (const_iterator i1, const_iterator i2,
+            InputIterator first, InputIterator last) { // replaces [i1, i2) with [first, last); 'last - first' requires iterators that support subtraction
+        if (i1 < begin() || i1 > end() || i2 < begin()) {
+            throw std::out_of_range("sstring::replace out of range");
+        }
+        if (i2 > end()) {
+            i2 = end();
+        }
+
+        if (i2 - i1 == last - first) {
+            //in place replacement
+            std::copy(first, last, const_cast<char_type*>(i1));
+            return *this;
+        }
+        basic_sstring ret(initialized_later(), size() + (last - first) - (i2 - i1));
+        char_type* p = ret.begin();
+        p = std::copy(cbegin(), i1, p);
+        p = std::copy(first, last, p);
+        std::copy(i2, cend(), p);
+        *this = std::move(ret);
+        return *this;
+    }
+
+    iterator erase(iterator first, iterator last) { // removes [first, last); returns an iterator to the position where the removed range began
+        size_t pos = first - begin();
+        replace(pos, last - first, nullptr, 0);
+        return begin() + pos; // begin() is re-read because replace() may have moved the storage
+    }
+
+    /**
+     * Inserts additional characters into the string right before
+     * the character indicated by p.
+     */
+    template <class InputIterator>
+    void insert(const_iterator p, InputIterator beg, InputIterator end) {
+        replace(p, p, beg, end);
+    }
+
+    /**
+     *  Returns a read/write reference to the data at the last
+     *  element of the string.
+     *  This function shall not be called on empty strings.
+     */
+    reference
+    back() noexcept {
+        return operator[](size() - 1);
+    }
+
+    /**
+     *  Returns a  read-only (constant) reference to the data at the last
+     *  element of the string.
+     *  This function shall not be called on empty strings.
+     */
+    const_reference
+    back() const noexcept {
+        return operator[](size() - 1);
+    }
+
+    basic_sstring substr(size_t from, size_t len = npos)  const { // copy of at most len characters starting at from; throws if from > size()
+        if (from > size()) {
+            throw std::out_of_range("sstring::substr out of range");
+        }
+        if (len > size() - from) {
+            len = size() - from;
+        }
+        if (len == 0) {
+            return "";
+        }
+        return { str() + from , len };
+    }
+
+    const char_type& at(size_t pos) const { // bounds-checked access; throws std::out_of_range
+        if (pos >= size()) {
+            throw std::out_of_range("sstring::at out of range");
+        }
+        return *(str() + pos);
+    }
+
+    char_type& at(size_t pos) {
+        if (pos >= size()) {
+            throw std::out_of_range("sstring::at out of range");
+        }
+        return *(str() + pos);
+    }
+
+    bool empty() const noexcept {
+        return u.internal.size == 0; // tests only the internal size field, which is negative for external strings
+    }
+    void reset() noexcept { // release any heap buffer and become the empty internal string
+        if (is_external()) {
+            std::free(u.external.str);
+        }
+        u.internal.size = 0;
+        u.internal.str[0] = '\0';
+    }
+
+    int compare(const basic_sstring& x) const noexcept { // negative, zero or positive: character comparison over the common prefix, then shorter-is-less
+        auto n = traits_type::compare(begin(), x.begin(), std::min(size(), x.size()));
+        if (n != 0) {
+            return n;
+        }
+        if (size() < x.size()) {
+            return -1;
+        } else if (size() > x.size()) {
+            return 1;
+        } else {
+            return 0;
+        }
+    }
+
+    int compare(size_t pos, size_t sz, const basic_sstring& x) const { // compares the sz-character span starting at pos against x
+        if (pos > size()) {
+            throw std::out_of_range("pos larger than string size");
+        }
+
+        sz = std::min(size() - pos, sz); // clamp the span to the end of this string
+        auto n = traits_type::compare(begin() + pos, x.begin(), std::min(sz, x.size()));
+        if (n != 0) {
+            return n;
+        }
+        if (sz < x.size()) {
+            return -1;
+        } else if (sz > x.size()) {
+            return 1;
+        } else {
+            return 0;
+        }
+    }
+
+    void swap(basic_sstring& x) noexcept { // exchanges the raw representations; nothing is allocated or freed
+        contents tmp;
+        tmp = x.u;
+        x.u = u;
+        u = tmp;
+    }
+    const char_type* c_str() const { // the constructors keep the buffer NUL-terminated
+        return str();
+    }
+    const char_type* begin() const { return str(); }
+    const char_type* end() const { return str() + size(); }
+    const char_type* cbegin() const { return str(); }
+    const char_type* cend() const { return str() + size(); }
+    char_type* begin() { return str(); }
+    char_type* end() { return str() + size(); }
+    bool operator==(const basic_sstring& x) const {
+        return size() == x.size() && std::equal(begin(), end(), x.begin()); // size check first keeps std::equal within x
+    }
+    bool operator!=(const basic_sstring& x) const {
+        return !operator==(x);
+    }
+    bool operator<(const basic_sstring& x) const {
+        return compare(x) < 0;
+    }
+    basic_sstring operator+(const basic_sstring& x) const { // new string holding this string followed by x
+        basic_sstring ret(initialized_later(), size() + x.size());
+        std::copy(begin(), end(), ret.begin());
+        std::copy(x.begin(), x.end(), ret.begin() + size());
+        return ret;
+    }
+    basic_sstring& operator+=(const basic_sstring& x) {
+        return *this = *this + x;
+    }
+    char_type& operator[](size_type pos) { // unchecked access; see at() for the checked form
+        return str()[pos];
+    }
+    const char_type& operator[](size_type pos) const {
+        return str()[pos];
+    }
+    operator std::basic_string_view<char_type, traits_type>() const { // view over this string's own buffer; no copy is made
+		return std::basic_string_view<char_type, traits_type>(str(), size());
+    }
+    template <typename string_type, typename T>
+    friend inline string_type to_sstring(T value);
+};
+template <typename char_type, typename Size, Size max_size> // namespace-scope definition of the static constexpr member declared in the class
+constexpr Size basic_sstring<char_type, Size, max_size>::npos;
+
+template <typename char_type, typename size_type, size_type Max, size_type N> // concatenates a char array (e.g. a string literal) with a basic_sstring
+inline
+basic_sstring<char_type, size_type, Max>
+operator+(const char(&s)[N], const basic_sstring<char_type, size_type, Max>& t) {
+    using sstring = basic_sstring<char_type, size_type, Max>; // local alias; shadows the global sstring inside this function
+    // don't copy the terminating NUL character
+    sstring ret(typename sstring::initialized_later(), N-1 + t.size());
+    auto p = std::copy(std::begin(s), std::end(s)-1, ret.begin()); // the last array element is treated as the terminator and skipped
+    std::copy(t.begin(), t.end(), p);
+    return ret;
+}
+
+template <size_t N> // length of a char array, not counting its last element
+static inline
+size_t str_len(const char(&s)[N]) { return N - 1; }
+
+template <size_t N> // pointer to the first character of a char array
+static inline
+const char* str_begin(const char(&s)[N]) { return s; }
+
+template <size_t N> // pointer to the last element of a char array (begin + N - 1)
+static inline
+const char* str_end(const char(&s)[N]) { return str_begin(s) + str_len(s); }
+
+template <typename char_type, typename size_type, size_type max_size> // basic_sstring counterpart of the char-array str_begin
+static inline
+const char_type* str_begin(const basic_sstring<char_type, size_type, max_size>& s) { return s.begin(); }
+
+template <typename char_type, typename size_type, size_type max_size> // basic_sstring counterpart of the char-array str_end
+static inline
+const char_type* str_end(const basic_sstring<char_type, size_type, max_size>& s) { return s.end(); }
+
+template <typename char_type, typename size_type, size_type max_size> // basic_sstring counterpart of the char-array str_len
+static inline
+size_type str_len(const basic_sstring<char_type, size_type, max_size>& s) { return s.size(); }
+
+template <typename First, typename Second, typename... Tail> // total length of two or more pieces
+static inline
+size_t str_len(const First& first, const Second& second, const Tail&... tail) {
+    return str_len(first) + str_len(second, tail...); // peel off the first piece and recurse on the rest
+}
+
+template <typename char_type, typename size_type, size_type max_size> // free-function swap that forwards to the member swap
+inline
+void swap(basic_sstring<char_type, size_type, max_size>& x,
+          basic_sstring<char_type, size_type, max_size>& y) noexcept
+{
+    return x.swap(y);
+}
+
+template <typename char_type, typename size_type, size_type max_size, typename char_traits> // stream insertion for basic_sstring
+inline
+std::basic_ostream<char_type, char_traits>&
+operator<<(std::basic_ostream<char_type, char_traits>& os,
+        const basic_sstring<char_type, size_type, max_size>& s) {
+    return os.write(s.begin(), s.size()); // writes exactly size() characters via write()
+}
+
+template <typename char_type, typename size_type, size_type max_size, typename char_traits> // stream extraction for basic_sstring
+inline
+std::basic_istream<char_type, char_traits>&
+operator>>(std::basic_istream<char_type, char_traits>& is,
+        basic_sstring<char_type, size_type, max_size>& s) {
+    std::string tmp; // NOTE(review): std::string is used regardless of char_type — confirm this is only instantiated for char
+    is >> tmp;
+    s = tmp; // assigned unconditionally, whatever the stream state after extraction
+    return is;
+}
+
+namespace std {
+
+template <typename char_type, typename size_type, size_type max_size> // makes basic_sstring usable as a key in hashed containers
+struct hash<basic_sstring<char_type, size_type, max_size>> {
+    size_t operator()(const basic_sstring<char_type, size_type, max_size>& s) const {
+		using traits_type = std::char_traits<char_type>;
+		return std::hash<std::basic_string_view<char_type,traits_type>>()(s); // s converts to a string_view; the hash is that of the view
+    }
+};
+
+}
+
+static inline
+char* copy_str_to(char* dst) { // recursion terminator: nothing left to copy
+    return dst;
+}
+
+template <typename Head, typename... Tail> // copies each piece to dst back to back and returns the position past the last character written
+static inline
+char* copy_str_to(char* dst, const Head& head, const Tail&... tail) {
+    return copy_str_to(std::copy(str_begin(head), str_end(head), dst), tail...); // std::copy returns the new write position for the remaining pieces
+}
+
+template <typename String = sstring, typename... Args> // builds a String by concatenating args (anything the str_len/str_begin/str_end overloads accept)
+static String make_sstring(Args&&... args)
+{
+    String ret(typename String::initialized_later(), str_len(args...)); // the tag is a nested type of each instantiation, so it must come from String, not from sstring
+    copy_str_to(ret.begin(), args...); // ret was sized to the summed length, so the pieces fill it exactly
+    return ret;
+}
+
+template <typename string_type, typename T> // public entry point: dispatches to the private to_sstring overloads of sstring
+inline string_type to_sstring(T value) {
+    return sstring::to_sstring<string_type>(value);
+}
+
+
+// encode/decode: denc_traits specialization for basic_sstring; the encoded form is a Size length followed by the raw characters
+template <typename Char, typename Size, Size Max>
+struct denc_traits<basic_sstring<Char, Size, Max>> {
+private:
+  using value_type = basic_sstring<Char, Size, Max>;
+public:
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+
+  static void bound_encode(const value_type& s, size_t& p, uint64_t f=0) { // adds the size of the length prefix plus the characters to p; f is unused
+    p += sizeof(Size) + s.size();
+  }
+
+  static void encode_nohead(const value_type& s,
+                            buffer::list::contiguous_appender& p) // appends the characters only, without a length prefix
+  {
+    auto len = s.size();
+    if (len) {
+      p.append(reinterpret_cast<const char*>(s.c_str()), len);
+    }
+  }
+
+  static void decode_nohead(size_t len, value_type& s,
+                            buffer::ptr::const_iterator& p) // replaces s with len characters taken from p
+  {
+    s.reset(); // discard the previous contents so that append() starts from an empty string
+    if (len) {
+      s.append(reinterpret_cast<const Char*>(p.get_pos_add(len)), len);
+    }
+  }
+
+  static void encode(const value_type& s,
+                     buffer::list::contiguous_appender& p,
+                     uint64_t f=0) // length prefix followed by the characters; f is unused
+  {
+    Size len = (Size)(s.size());
+    ::denc(len, p);
+    if (len) {
+      p.append(reinterpret_cast<const char*>(s.c_str()), len);
+    }
+  }
+
+  static void decode(value_type& s,
+                     buffer::ptr::const_iterator& p,
+                     uint64_t f=0) // reads the length prefix, then the characters; f is unused
+  {
+    Size len;
+    ::denc(len, p);
+    decode_nohead(len, s, p);
+  }
+};
+
+#if 0 /* XXX conflicts w/Ceph types.h */
+template <typename T>
+inline
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) {
+    bool first = true;
+    os << "{";
+    for (auto&& elem : v) {
+        if (!first) {
+            os << ", ";
+        } else {
+            first = false;
+        }
+        os << elem;
+    }
+    os << "}";
+    return os;
+}
+#endif
+
+#endif /* SSTRING_HH_ */
diff --git a/src/common/static_ptr.h b/src/common/static_ptr.h
new file mode 100644
index 000000000..31df4cf0a
--- /dev/null
+++ b/src/common/static_ptr.h
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <utility>
+#include <type_traits>
+
+namespace ceph {
+// `static_ptr`
+// ===========
+//
+// It would be really nice if polymorphism didn't require a bunch of
+// mucking about with the heap. So let's build something where we
+// don't have to do that.
+//
+namespace _mem {
+
+// This, an operator function, is one of the canonical ways to do type
+// erasure in C++. So long as all operations can be done with subsets
+// of the same arguments (which is not true for function type erasure),
+// it's a pretty good one.
+enum class op {
+  move, destroy, size
+};
+// Type-erased operations on the T stored at p1:
+//   op::move    - move-construct a T at p2 from *p1, returns 0
+//   op::destroy - run ~T() on *p1, returns 0
+//   op::size    - returns sizeof(T)
+template<typename T>
+static std::size_t op_fun(op oper, void* p1, void* p2)
+{
+  T* const self = static_cast<T*>(p1);
+
+  if (oper == op::size) {
+    return sizeof(T);
+  }
+  if (oper == op::move) {
+    new (p2) T(std::move(*self));
+  } else if (oper == op::destroy) {
+    self->~T();
+  }
+  return 0;
+}
+}
+// The thing itself!
+//
+// The default value for Size may be wrong in almost all cases. You
+// can change it to your heart's content. The upside is that you'll
+// just get a compile error and you can bump it up.
+//
+// I *recommend* having a size constant in header files (or perhaps a
+// using declaration, e.g.
+// ```
+// using StaticFoo = static_ptr<Foo, sizeof(Blah)>`
+// ```
+// in some header file that can be used multiple places) so that when
+// you create a new derived class with a larger size, you only have to
+// change it in one place.
+//
+template<typename Base, std::size_t Size = sizeof(Base)>
+class static_ptr {
+  template<typename U, std::size_t S>
+  friend class static_ptr;
+
+  // Refuse to be set to anything with whose type we are
+  // incompatible. Also never try to eat anything bigger than you are.
+  //
+  template<typename T, std::size_t S>
+  constexpr static int create_ward() noexcept {
+    static_assert(std::is_void_v<Base> ||
+                  std::is_base_of_v<Base, std::decay_t<T>>,
+                  "Value to store must be a derivative of the base.");
+    static_assert(S <= Size, "Value too large.");
+    static_assert(std::is_void_v<Base> || !std::is_const<Base>{} ||
+                  std::is_const_v<T>,
+                  "Cannot assign const pointer to non-const pointer.");
+    return 0;
+  }
+  // Type-erased operation function for the stored object (an
+  // instantiation of _mem::op_fun), or nullptr when we hold nothing.
+  // Invariant: non-null if and only if buf contains a live object.
+  //
+  size_t (*operate)(_mem::op, void*, void*);
+
+  // This is mutable so that get and the dereference operators can be
+  // const. Since we're modeling a pointer, we should preserve the
+  // difference in semantics between a pointer-to-const and a const
+  // pointer.
+  //
+  mutable typename std::aligned_storage<Size>::type buf;
+
+public:
+  using element_type = Base;
+  using pointer = Base*;
+
+  // Empty
+  static_ptr() noexcept : operate(nullptr) {}
+  static_ptr(std::nullptr_t) noexcept : operate(nullptr) {}
+  static_ptr& operator =(std::nullptr_t) noexcept {
+    reset();
+    return *this;
+  }
+  ~static_ptr() noexcept {
+    reset();
+  }
+
+  // Since other pointer-ish types have it
+  void reset() noexcept {
+    if (operate) {
+      operate(_mem::op::destroy, &buf, nullptr);
+      operate = nullptr;
+    }
+  }
+
+  // Set from another static pointer.
+  //
+  // Since the templated versions don't count for overriding the defaults
+  static_ptr(static_ptr&& rhs)
+    noexcept(std::is_nothrow_move_constructible_v<Base>) : operate(rhs.operate) {
+    if (operate) {
+      operate(_mem::op::move, &rhs.buf, &buf);
+    }
+  }
+
+  template<typename U, std::size_t S>
+  static_ptr(static_ptr<U, S>&& rhs)
+    noexcept(std::is_nothrow_move_constructible_v<U>) : operate(rhs.operate) {
+    create_ward<U, S>();
+    if (operate) {
+      operate(_mem::op::move, &rhs.buf, &buf);
+    }
+  }
+
+  static_ptr& operator =(static_ptr&& rhs)
+    noexcept(std::is_nothrow_move_constructible_v<Base>) {
+    reset();
+    if (rhs) {
+      rhs.operate(_mem::op::move, &rhs.buf, &buf);
+      operate = rhs.operate; // only once the move succeeded; else stay empty
+    }
+    return *this;
+  }
+
+  template<typename U, std::size_t S>
+  static_ptr& operator =(static_ptr<U, S>&& rhs)
+    noexcept(std::is_nothrow_move_constructible_v<U>) {
+    create_ward<U, S>();
+    reset();
+    if (rhs) {
+      rhs.operate(_mem::op::move, &rhs.buf, &buf);
+      operate = rhs.operate; // only once the move succeeded; else stay empty
+    }
+    return *this;
+  }
+
+
+  bool operator ==(std::nullptr_t) const {
+    return !operate;
+  }
+
+  // In-place construction!
+  //
+  // This is basically what you want, and I didn't include value
+  // construction because in-place construction renders it
+  // unnecessary. Also it doesn't fit the pointer idiom as well.
+  //
+  template<typename T, typename... Args>
+  static_ptr(std::in_place_type_t<T>, Args&& ...args)
+    noexcept(std::is_nothrow_constructible_v<T, Args...>)
+    : operate(&_mem::op_fun<T>){
+    static_assert((!std::is_nothrow_copy_constructible_v<Base> ||
+		   std::is_nothrow_copy_constructible_v<T>) &&
+		  (!std::is_nothrow_move_constructible_v<Base> ||
+		   std::is_nothrow_move_constructible_v<T>),
+		  "If declared type of static_ptr is nothrow "
+		  "move/copy constructible, then any "
+		  "type assigned to it must be as well. "
+		  "You can use reinterpret_pointer_cast "
+		  "to get around this limit, but don't "
+		  "come crying to me when the C++ "
+		  "runtime calls terminate().");
+    create_ward<T, sizeof(T)>();
+    new (&buf) T(std::forward<Args>(args)...);
+  }
+
+  // I occasionally get tempted to make an overload of the assignment
+  // operator that takes a tuple as its right-hand side to provide
+  // arguments.
+  // `operate` is set last so a throwing constructor leaves us empty.
+  template<typename T, typename... Args>
+  void emplace(Args&& ...args)
+    noexcept(std::is_nothrow_constructible_v<T, Args...>) {
+    create_ward<T, sizeof(T)>();
+    reset();
+    new (&buf) T(std::forward<Args>(args)...);
+    operate = &_mem::op_fun<T>;
+  }
+
+  // Access!
+  Base* get() const noexcept {
+    return operate ? reinterpret_cast<Base*>(&buf) : nullptr;
+  }
+  template<typename U = Base>
+  std::enable_if_t<!std::is_void_v<U>, Base*> operator->() const noexcept {
+    return get();
+  }
+  template<typename U = Base>
+  std::enable_if_t<!std::is_void_v<U>, Base&> operator *() const noexcept {
+    return *get();
+  }
+  operator bool() const noexcept {
+    return !!operate;
+  }
+
+  // Big wall of friendship
+  //
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> static_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> static_pointer_cast(static_ptr<T, S>&& p);
+
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> dynamic_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> dynamic_pointer_cast(static_ptr<T, S>&& p);
+
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> const_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> const_pointer_cast(static_ptr<T, S>&& p);
+
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> reinterpret_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> reinterpret_pointer_cast(static_ptr<T, S>&& p);
+
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> resize_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> resize_pointer_cast(static_ptr<T, S>&& p);
+};
+
+// These are all modeled after the same ones for shared pointer.
+//
+// Also I'm annoyed that the standard library doesn't have
+// *_pointer_cast overloads for a move-only unique pointer. It's a
+// nice idiom. Having to release and reconstruct is obnoxious.
+//
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> static_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(Z >= S, "Value too large.");
+  // Starts out empty and stays that way when p holds nothing.
+  static_ptr<U, Z> moved;
+  if (static_cast<U*>(p.get()) != nullptr) {
+    p.operate(_mem::op::move, &p.buf, &moved.buf);
+    moved.operate = p.operate;
+  }
+  return moved;
+}
+
+// Here the conditional is actually important and ensures we have the
+// same behavior as dynamic_cast.
+//
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> dynamic_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(Z >= S, "Value too large.");
+  // Stays empty when p is empty or dynamic_cast rejects the stored object.
+  static_ptr<U, Z> moved;
+  if (dynamic_cast<U*>(p.get()) != nullptr) {
+    p.operate(_mem::op::move, &p.buf, &moved.buf);
+    moved.operate = p.operate;
+  }
+  return moved;
+}
+
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> const_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(Z >= S, "Value too large.");
+  // Starts out empty and stays that way when p holds nothing.
+  static_ptr<U, Z> moved;
+  if (const_cast<U*>(p.get()) != nullptr) {
+    p.operate(_mem::op::move, &p.buf, &moved.buf);
+    moved.operate = p.operate;
+  }
+  return moved;
+}
+
+// I'm not sure if anyone will ever use this. I can imagine situations
+// where they might. It works, though!
+//
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> reinterpret_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(Z >= S, "Value too large.");
+  static_ptr<U, Z> r;
+  if (!p) return r; // an empty p has a null `operate`; never call through it
+  p.operate(_mem::op::move, &p.buf, &r.buf);
+  r.operate = p.operate;
+  return r;
+}
+
+// This is the only way to move from a bigger static pointer into a
+// smaller static pointer. The size of the total data stored in the
+// pointer is checked at runtime and if the destination size is large
+// enough, we copy it over.
+//
+// I follow cast semantics. Since this is a pointer-like type, it
+// returns a null value rather than throwing.
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> resize_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(std::is_same_v<U, T>,
+                "resize_pointer_cast only changes size, not type.");
+  static_ptr<U, Z> r;  // stays empty if p is empty or its object is too big
+  if (p && Z >= p.operate(_mem::op::size, &p.buf, nullptr)) {
+    p.operate(_mem::op::move, &p.buf, &r.buf);
+    r.operate = p.operate;
+  }
+  return r;
+}
+
+// Since `make_unique` and `make_shared` exist, we should follow their
+// lead: construct a Derived in place from args inside a
+// static_ptr<Base, Size> and return it.
+template<typename Base, typename Derived = Base,
+         std::size_t Size = sizeof(Derived), typename... Args>
+static_ptr<Base, Size> make_static(Args&& ...args) {
+  return { std::in_place_type<Derived>, std::forward<Args>(args)... };
+}
+}
diff --git a/src/common/str_list.cc b/src/common/str_list.cc
new file mode 100644
index 000000000..6904e8d13
--- /dev/null
+++ b/src/common/str_list.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/str_list.h"
+
+using std::string;
+using std::vector;
+using std::list;
+using ceph::for_each_substr;
+
+void get_str_list(const string& str, const char *delims, list<string>& str_list)
+{
+  str_list.clear();  // output replaces, rather than extends, the list
+  for_each_substr(str, delims, [&] (auto piece) {
+    str_list.emplace_back(piece.begin(), piece.end());
+  });
+}
+
+void get_str_list(const string& str, list<string>& str_list)
+{
+  // No delimiters given: split on ';', ',', '=', space and tab.
+  get_str_list(str, ";,= \t", str_list);
+}
+
+list<string> get_str_list(const string& str, const char *delims)
+{
+  list<string> tokens;  // filled by the out-parameter overload
+  get_str_list(str, delims, tokens);
+  return tokens;
+}
+
+void get_str_vec(std::string_view str, const char *delims, vector<string>& str_vec)
+{
+  str_vec.clear();  // output replaces, rather than extends, the vector
+  for_each_substr(str, delims, [&] (auto piece) {
+    str_vec.emplace_back(piece.begin(), piece.end());
+  });
+}
+
+void get_str_vec(std::string_view str, vector<string>& str_vec)
+{
+  // No delimiters given: split on ';', ',', '=', space and tab.
+  get_str_vec(str, ";,= \t", str_vec);
+}
+
+vector<string> get_str_vec(std::string_view str, const char *delims)
+{
+  // Same splitting as the out-parameter overload; delegating to it keeps
+  // the token-collecting lambda in one place.
+  vector<string> tokens;
+  get_str_vec(str, delims, tokens);
+  return tokens;
+}
diff --git a/src/common/str_map.cc b/src/common/str_map.cc
new file mode 100644
index 000000000..638a30784
--- /dev/null
+++ b/src/common/str_map.cc
@@ -0,0 +1,207 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include "include/str_map.h"
+#include "include/str_list.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include "json_spirit/json_spirit.h"
+
+using namespace std;
+
+int get_json_str_map(
+    const string &str,
+    ostream &ss,
+    str_map_t *str_map,
+    bool fallback_to_plain)
+{
+  // Parse str as a JSON object and copy each member into *str_map. When
+  // json_spirit reports a parse error, either re-read str as plain
+  // key=value text (fallback_to_plain set) or fail with -EINVAL.
+
+  json_spirit::mValue parsed;
+  try {
+    json_spirit::read_or_throw(str, parsed);
+
+    const auto kind = parsed.type();
+    if (kind != json_spirit::obj_type) {
+      ss << str << " must be a JSON object but is of type "
+         << kind << " instead";
+      return -EINVAL;
+    }
+
+    // iterate over a copy of the parsed object
+    json_spirit::mObject members = parsed.get_obj();
+    for (auto& [name, value] : members) {
+      (*str_map)[name] = value.get_str();
+    }
+  } catch (json_spirit::Error_position&) {
+    if (!fallback_to_plain) {
+      return -EINVAL;
+    }
+    // fallback to key=value format
+    get_str_map(str, str_map, "\t\n ");
+  }
+  return 0;
+}
+
+static std::string_view trim(std::string_view str)
+{
+  // Shave tabs, newlines and spaces off both ends of the view.
+  constexpr std::string_view blanks{"\t\n "};
+  const auto first = str.find_first_not_of(blanks);
+  if (first == std::string_view::npos) {
+    return {};
+  }
+  return str.substr(first, str.find_last_not_of(blanks) - first + 1);
+}
+
+int get_str_map(
+    const string &str,
+    str_map_t* str_map,
+    const char *delims)
+{
+  auto add_entry = [str_map](std::string_view key, std::string_view val) {
+    // 'K=V' is stored trimmed on both sides; a bare 'K' is stored as given
+    if (!val.empty()) {
+      str_map->emplace(std::string(trim(key)), std::string(trim(val)));
+    } else {
+      str_map->emplace(std::string(key), "");
+    }
+  };
+  for_each_pair(str, delims, add_entry);
+  return 0;
+}
+
+str_map_t get_str_map(
+  const string& str,
+  const char* delim)
+{
+  str_map_t parsed;  // filled by the pointer-taking overload
+  get_str_map(str, &parsed, delim);
+  return parsed;
+}
+
+string get_str_map_value(
+    const str_map_t &str_map,
+    const string &key,
+    const string *def_val)
+{
+  // Lookup order:
+  //   1. key present with a non-empty value -> that value
+  //   2. key present with an empty value    -> the key itself
+  //   3. key absent and def_val given       -> *def_val
+  //   4. key absent and no def_val          -> empty string
+
+  const auto found = str_map.find(key);
+
+  if (found == str_map.end()) {
+    if (def_val == nullptr) {
+      return string();
+    }
+    return *def_val;
+  }
+
+  const auto& [name, value] = *found;
+  return value.empty() ? name : value;
+}
+
+string get_str_map_key(
+    const str_map_t &str_map,
+    const string &key,
+    const string *fallback_key)
+{
+  // Return the value stored under key; failing that, the value stored
+  // under *fallback_key (when one is given); failing that, "".
+  auto hit = str_map.find(key);
+  if (hit == str_map.end() && fallback_key != nullptr) {
+    hit = str_map.find(*fallback_key);
+  }
+  if (hit == str_map.end()) {
+    return string();
+  }
+  return hit->second;
+}
+
+// This function's only purpose is to check whether a given map has only
+// ONE key with an empty value (which would mean that 'get_str_map()' read
+// a map in the form of 'VALUE', without any KEY/VALUE pairs) and, in such
+// event, to assign said 'VALUE' to a given 'def_key', such that we end up
+// with a map of the form "m = { 'def_key' : 'VALUE' }" instead of the
+// original "m = { 'VALUE' : '' }".
+int get_conf_str_map_helper(
+    const string &str,
+    ostringstream &oss,
+    str_map_t* str_map,
+    const string &default_key)
+{
+  get_str_map(str, str_map);
+  // Only a lone entry with an empty value gets re-keyed; oss is not used.
+  if (str_map->size() != 1)
+    return 0;
+  const auto only = str_map->begin();
+  if (only->second.empty()) {
+    const string value = only->first;  // copy: the erase invalidates `only`
+    str_map->erase(only);
+    (*str_map)[default_key] = value;
+  }
+  return 0;
+}
+
+std::string get_value_via_strmap(
+  const string& conf_string,
+  std::string_view default_key)
+{
+  // Only a single-entry map yields a result; default_key is not consulted.
+  const auto entries = get_str_map(conf_string);
+  if (entries.size() != 1) {
+    return "";
+  }
+  // { 'value' : '' }    -> the lone token is the answer
+  // { 'key' : 'value' } -> the value is the answer
+  const auto& [name, value] = *entries.begin();
+  if (value.empty()) {
+    return name;
+  }
+  return value;
+}
+
+std::string get_value_via_strmap(
+  const string& conf_string,
+  const string& key,
+  std::string_view default_key)
+{
+  auto mp = get_str_map(conf_string);
+  if (mp.size() != 1) {
+    return std::string{};
+  }
+
+  // Only a single-entry map is considered. If the entry is of the form
+  // { 'value' : '' } the lone token itself is returned.
+  const auto& [k, v] = *(mp.begin());
+  if (v.empty()) {
+    return k;
+  }
+  if (k == key) {
+    return k;  // NOTE(review): returns the key, not v -- confirm intended
+  }
+  if (k == default_key) {
+    return v;
+  }
+
+  return string{};
+}
diff --git a/src/common/strescape.h b/src/common/strescape.h
new file mode 100644
index 000000000..9bf27fc34
--- /dev/null
+++ b/src/common/strescape.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_STRESCAPE_H
+#define CEPH_STRESCAPE_H
+
+#include <algorithm>
+#include <ostream>
+#include <string_view>
+
+#include <ctype.h>
+
+inline std::string binstrprint(std::string_view sv, size_t maxlen=0)
+{
+  // Printable form of sv: anything not alphanumeric/punctuation becomes '.'.
+  std::string s{sv};
+  if (maxlen != 0 && sv.size() >= maxlen) {
+    maxlen = std::max<size_t>(8, maxlen);
+    s = std::string(sv.substr(0, maxlen-3)) + "...";
+  }
+  // unsigned char param: a negative char passed to <ctype.h> is UB
+  std::replace_if(s.begin(), s.end(), [](unsigned char c){ return !(isalnum(c) || ispunct(c)); }, '.');
+  return s;
+}
+
+#endif
diff --git a/src/common/strtol.cc b/src/common/strtol.cc
new file mode 100644
index 000000000..c9e982b63
--- /dev/null
+++ b/src/common/strtol.cc
@@ -0,0 +1,279 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "strtol.h"
+
+#include <algorithm>
+#include <climits>
+#include <limits>
+#include <cmath>
+#include <sstream>
+#include <strings.h>
+#include <string_view>
+
+using std::ostringstream;
+
+bool strict_strtob(const char* str, std::string *err)
+{
+  // "true"/"false" match in any letter case and leave *err untouched;
+  // anything else is parsed as a base-10 int, where non-zero means true.
+  if (strcasecmp(str, "true") == 0) {
+    return true;
+  }
+  if (strcasecmp(str, "false") == 0) {
+    return false;
+  }
+  return strict_strtol(str, 10, err) != 0;
+}
+
+long long strict_strtoll(std::string_view str, int base, std::string *err)
+{
+  // Parse a NUL-terminated copy: str.data() need not be NUL-terminated.
+  const std::string s{str};
+  char *endptr;
+  errno = 0; /* To distinguish success/failure after call (see man page) */
+  long long ret = strtoll(s.c_str(), &endptr, base);
+  if (endptr == s.c_str() || endptr != s.c_str() + s.size()) {
+    *err = "Expected option value to be integer, got '" + s + "'";
+    return 0;
+  }
+  if (errno) {
+    *err = "The option value '" + s + "' seems to be invalid";
+    return 0;
+  }
+  *err = "";
+  return ret;
+}
+
+int strict_strtol(std::string_view str, int base, std::string *err)
+{
+  // Parse as long long first, then make sure the result fits in an int.
+  const long long wide = strict_strtoll(str, base, err);
+  if (!err->empty()) {
+    return 0;
+  }
+  if (wide >= INT_MIN && wide <= INT_MAX) {
+    return static_cast<int>(wide);
+  }
+  *err = "The option value '" + std::string{str} + "' seems to be invalid";
+  return 0;
+}
+
+int strict_strtol(const char *str, int base, std::string *err)
+{
+  return strict_strtol(std::string_view(str), base, err); // string_view overload
+}
+
+double strict_strtod(std::string_view str, std::string *err)
+{
+  // strtod() needs a NUL-terminated buffer, which a string_view does not
+  // guarantee, so parse a private copy instead of str.data().
+  const std::string s{str};
+  char *endptr;
+  errno = 0; /* To distinguish success/failure after call (see man page) */
+  double ret = strtod(s.c_str(), &endptr);
+  if (errno == ERANGE) {
+    *err = "strict_strtod: floating point overflow or underflow parsing '";
+    *err += s + "'";
+    return 0.0;
+  }
+  // compare addresses: nothing was consumed, so this is not a number
+  if (endptr == s.c_str()) {
+    *err = "strict_strtod: expected double, got: '" + s + "'";
+    return 0;
+  }
+  // the whole input must be consumed; comparing against the end (rather
+  // than testing for '\0') also rejects input with an embedded NUL
+  if (endptr != s.c_str() + s.size()) {
+    *err = "strict_strtod: garbage at end of string. got: '" + s + "'";
+    return 0;
+  }
+  *err = "";
+  return ret;
+}
+
+float strict_strtof(std::string_view str, std::string *err)
+{
+  // strtof() needs a NUL-terminated buffer, which a string_view does not
+  // guarantee, so parse a private copy instead of str.data().
+  const std::string s{str};
+  char *endptr;
+  errno = 0; /* To distinguish success/failure after call (see man page) */
+  float ret = strtof(s.c_str(), &endptr);
+  if (errno == ERANGE) {
+    *err = "strict_strtof: floating point overflow or underflow parsing '";
+    *err += s + "'";
+    return 0.0;
+  }
+  // compare addresses: nothing was consumed, so this is not a number
+  if (endptr == s.c_str()) {
+    *err = "strict_strtof: expected float, got: '" + s + "'";
+    return 0;
+  }
+  // the whole input must be consumed; comparing against the end (rather
+  // than testing for '\0') also rejects input with an embedded NUL
+  if (endptr != s.c_str() + s.size()) {
+    *err = "strict_strtof: garbage at end of string. got: '" + s + "'";
+    return 0;
+  }
+  *err = "";
+  return ret;
+}
+
+template<typename T>
+T strict_iec_cast(std::string_view str, std::string *err)
+{
+  if (str.empty()) {
+    *err = "strict_iecstrtoll: value not specified";
+    return 0;
+  }
+  // split str into the numeric part (n) and the trailing unit (unit)
+  std::string_view unit;
+  std::string_view n = str;
+  size_t u = str.find_first_not_of("0123456789-+");  // first non-numeric char
+  int m = 0;  // shift applied to the value: K=10, M=20, ... E=60
+  // deal with unit prefix if there is one
+  if (u != std::string_view::npos) {
+    n = str.substr(0, u);
+    unit = str.substr(u, str.length() - u);
+    // we accept both old si prefixes as well as the proper iec prefixes
+    // i.e. K, M, ... and Ki, Mi, ...
+    if (unit.back() == 'i') {
+      if (unit.front() == 'B') {
+        *err = "strict_iecstrtoll: illegal prefix \"Bi\"";
+        return 0;
+      }
+    }
+    if (unit.length() > 2) {  // NOTE(review): 2-char units like "Kx" pass
+      *err = "strict_iecstrtoll: illegal prefix (length > 2)";
+      return 0;
+    }
+    switch(unit.front()) {
+      case 'K':
+        m = 10;
+        break;
+      case 'M':
+        m = 20;
+        break;
+      case 'G':
+        m = 30;
+        break;
+      case 'T':
+        m = 40;
+        break;
+      case 'P':
+        m = 50;
+        break;
+      case 'E':
+        m = 60;
+        break;
+      case 'B':
+        break;
+      default:
+        *err = "strict_iecstrtoll: unit prefix not recognized";
+        return 0;
+    }
+  }
+
+  long long ll = strict_strtoll(n, 10, err);  // on failure: *err set, ll == 0
+  if (ll < 0 && !std::numeric_limits<T>::is_signed) {
+    *err = "strict_iecstrtoll: value should not be negative";
+    return 0;
+  }
+  if (static_cast<unsigned>(m) >= sizeof(T) * CHAR_BIT) {
+    *err = ("strict_iecstrtoll: the IEC prefix is too large for the designated "
+        "type");
+    return 0;
+  }
+  using promoted_t = typename std::common_type<decltype(ll), T>::type;
+  if (static_cast<promoted_t>(ll) <
+      static_cast<promoted_t>(std::numeric_limits<T>::min()) >> m) {
+    *err = "strict_iecstrtoll: value seems to be too small";
+    return 0;
+  }
+  if (static_cast<promoted_t>(ll) >
+      static_cast<promoted_t>(std::numeric_limits<T>::max()) >> m) {
+    *err = "strict_iecstrtoll: value seems to be too large";
+    return 0;
+  }
+  return (ll << m);
+}
+
+template int strict_iec_cast<int>(std::string_view str, std::string *err);
+template long strict_iec_cast<long>(std::string_view str, std::string *err);
+template long long strict_iec_cast<long long>(std::string_view str, std::string *err);
+template uint64_t strict_iec_cast<uint64_t>(std::string_view str, std::string *err);
+template uint32_t strict_iec_cast<uint32_t>(std::string_view str, std::string *err);
+
+uint64_t strict_iecstrtoll(std::string_view str, std::string *err)
+{
+  return strict_iec_cast<uint64_t>(str, err);  // unsigned: negatives are errors
+}
+
+template<typename T>
+T strict_si_cast(std::string_view str, std::string *err)
+{
+  if (str.empty()) {
+    *err = "strict_sistrtoll: value not specified";
+    return 0;
+  }
+  std::string_view n = str;
+  int m = 0;  // decimal exponent implied by the unit: K=3, M=6, ... E=18
+  // deal with unit prefix if there is one
+  if (str.find_first_not_of("0123456789+-") != std::string_view::npos) {
+    const char &u = str.back();
+    if (u == 'K')
+      m = 3;
+    else if (u == 'M')
+      m = 6;
+    else if (u == 'G')
+      m = 9;
+    else if (u == 'T')
+      m = 12;
+    else if (u == 'P')
+      m = 15;
+    else if (u == 'E')
+      m = 18;
+    else if (u != 'B') {
+      *err = "strict_si_cast: unit prefix not recognized";
+      return 0;
+    }
+
+    if (m >= 3)  // NOTE(review): a 'B' suffix stays in n and fails to parse
+      n = str.substr(0, str.length() -1);
+  }
+
+  long long ll = strict_strtoll(n, 10, err);  // on failure: *err set, ll == 0
+  if (ll < 0 && !std::numeric_limits<T>::is_signed) {
+    *err = "strict_sistrtoll: value should not be negative";
+    return 0;
+  }
+  using promoted_t = typename std::common_type<decltype(ll), T>::type;
+  auto v = static_cast<promoted_t>(ll);
+  auto coefficient = static_cast<promoted_t>(powl(10, m));  // 10^m
+  if (v != std::clamp(v,
+		      (static_cast<promoted_t>(std::numeric_limits<T>::min()) /
+		       coefficient),
+		      (static_cast<promoted_t>(std::numeric_limits<T>::max()) /
+		       coefficient))) {
+    *err = "strict_sistrtoll: value out of range";
+    return 0;
+  }
+  return v * coefficient;
+}
+
+template int strict_si_cast<int>(std::string_view str, std::string *err);
+template long strict_si_cast<long>(std::string_view str, std::string *err);
+template long long strict_si_cast<long long>(std::string_view str, std::string *err);
+template uint64_t strict_si_cast<uint64_t>(std::string_view str, std::string *err);
+template uint32_t strict_si_cast<uint32_t>(std::string_view str, std::string *err);
diff --git a/src/common/strtol.h b/src/common/strtol.h
new file mode 100644
index 000000000..2183137b1
--- /dev/null
+++ b/src/common/strtol.h
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_STRTOL_H
+#define CEPH_COMMON_STRTOL_H
+
+#include <charconv>
+#include <cinttypes>
+#include <cstdlib>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <type_traits>
+
+
+namespace ceph {
+// Wrappers around std::from_chars.
+//
+// Why do we want this instead of strtol and friends? Because the
+// string doesn't have to be NUL-terminated! (Also, for a lot of
+// purposes, just putting a string_view in and getting an optional out
+// is friendly.)
+//
+// Returns the found number on success. Returns an empty optional on
+// failure OR on trailing characters.
+// Sadly GCC < 11 is missing the floating point versions.
+template<typename T>
+auto parse(std::string_view s, int base = 10)
+  -> std::enable_if_t<std::is_integral_v<T>, std::optional<T>>
+{
+  // Succeeds only when from_chars reports no error and consumed all of s.
+  const char* const last = s.data() + s.size();
+  T value;
+  const auto [stop, status] = std::from_chars(s.data(), last, value, base);
+  const bool whole = (status == std::errc{}) && (stop == last);
+  return whole ? std::optional<T>{value} : std::nullopt;
+}
+
+// As above, but succeed on trailing characters and trim the supplied
+// string_view to remove the parsed number. Set the supplied
+// string_view to empty if it ends with the number.
+template<typename T>
+auto consume(std::string_view& s, int base = 10)
+  -> std::enable_if_t<std::is_integral_v<T>, std::optional<T>>
+{
+  const char* const first = s.data();
+  const char* const last = first + s.size();
+  T value;
+  const auto [stop, status] = std::from_chars(first, last, value, base);
+  if (status != std::errc{}) {
+    return std::nullopt;  // s is left untouched on failure
+  }
+  // Fully consumed -> empty default view; else drop the parsed prefix.
+  s = (stop == last) ? std::string_view{}
+                     : s.substr(static_cast<std::size_t>(stop - first));
+  return value;
+}
+} // namespace ceph
+
+bool strict_strtob(const char* str, std::string *err);
+
+long long strict_strtoll(std::string_view str, int base, std::string *err);
+
+int strict_strtol(std::string_view str, int base, std::string *err);
+
+double strict_strtod(std::string_view str, std::string *err);
+
+float strict_strtof(std::string_view str, std::string *err);
+
+uint64_t strict_iecstrtoll(std::string_view str, std::string *err);
+
+template<typename T>
+T strict_iec_cast(std::string_view str, std::string *err);
+
+template<typename T>
+T strict_si_cast(std::string_view str, std::string *err);
+
+/* On entry buf points to the end of the buffer, i.e. just past where the
+ * least significant digit of the input number will be printed. Returns a
+ * pointer to where the most significant digit was printed, including zero
+ * padding. Does NOT NUL-terminate the buffer; that is the caller's job.
+ */
+template<typename T, const unsigned base = 10, const unsigned width = 1>
+static inline
+char* ritoa(T u, char *buf)
+{
+  static_assert(std::is_unsigned_v<T>, "signed types are not supported");
+  static_assert(base <= 16, "extend character map below to support higher bases");
+  constexpr char charmap[] = "0123456789abcdef";
+  unsigned written = 0;
+  for (; u != 0; u /= base, ++written) {
+    *--buf = charmap[u % base];
+  }
+  for (; written < width; ++written) {
+    *--buf = '0';
+  }
+  return buf;
+}
+
+#endif
diff --git a/src/common/subsys.h b/src/common/subsys.h
new file mode 100644
index 000000000..3e558b440
--- /dev/null
+++ b/src/common/subsys.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+/**
+ * This header describes the subsystems (each one gets a "--debug-<subsystem>"
+ * log verbosity setting), along with their default verbosities.
+ */
+
+// Each entry is SUBSYS(name, log, gather); DEFAULT_SUBSYS(log, gather) is
+// the unnamed default entry. The including file defines both macros before
+// including this header, so the order of the entries below is significant.
+DEFAULT_SUBSYS(0, 5)
+SUBSYS(lockdep, 0, 1)
+SUBSYS(context, 0, 1)
+SUBSYS(crush, 1, 1)
+SUBSYS(mds, 1, 5)
+SUBSYS(mds_balancer, 1, 5)
+SUBSYS(mds_locker, 1, 5)
+SUBSYS(mds_log, 1, 5)
+SUBSYS(mds_log_expire, 1, 5)
+SUBSYS(mds_migrator, 1, 5)
+SUBSYS(buffer, 0, 1)
+SUBSYS(timer, 0, 1)
+SUBSYS(filer, 0, 1)
+SUBSYS(striper, 0, 1)
+SUBSYS(objecter, 0, 1)
+SUBSYS(rados, 0, 5)
+SUBSYS(rbd, 0, 5)
+SUBSYS(rbd_mirror, 0, 5)
+SUBSYS(rbd_replay, 0, 5)
+SUBSYS(rbd_pwl, 0, 5)
+SUBSYS(journaler, 0, 5)
+SUBSYS(objectcacher, 0, 5)
+SUBSYS(immutable_obj_cache, 0, 5)
+SUBSYS(client, 0, 5)
+SUBSYS(osd, 1, 5)
+SUBSYS(optracker, 0, 5)
+SUBSYS(objclass, 0, 5)
+SUBSYS(filestore, 1, 3)
+SUBSYS(journal, 1, 3)
+SUBSYS(ms, 0, 0)
+SUBSYS(mon, 1, 5)
+SUBSYS(monc, 0, 10)
+SUBSYS(paxos, 1, 5)
+SUBSYS(tp, 0, 5)
+SUBSYS(auth, 1, 5)
+SUBSYS(crypto, 1, 5)
+SUBSYS(finisher, 1, 1)
+SUBSYS(reserver, 1, 1)
+SUBSYS(heartbeatmap, 1, 5)
+SUBSYS(perfcounter, 1, 5)
+SUBSYS(rgw, 1, 5)                 // log level for the Rados gateway
+SUBSYS(rgw_sync, 1, 5)
+SUBSYS(rgw_datacache, 1, 5)
+SUBSYS(rgw_access, 1, 5)
+SUBSYS(rgw_dbstore, 1, 5)
+SUBSYS(rgw_flight, 1, 5)
+SUBSYS(javaclient, 1, 5)
+SUBSYS(asok, 1, 5)
+SUBSYS(throttle, 1, 1)
+SUBSYS(refs, 0, 0)
+SUBSYS(compressor, 1, 5)
+SUBSYS(bluestore, 1, 5)
+SUBSYS(bluefs, 1, 5)
+SUBSYS(bdev, 1, 3)
+SUBSYS(kstore, 1, 5)
+SUBSYS(rocksdb, 4, 5)
+SUBSYS(leveldb, 4, 5)
+SUBSYS(fuse, 1, 5)
+SUBSYS(mgr, 2, 5)
+SUBSYS(mgrc, 1, 5)
+SUBSYS(dpdk, 1, 5)
+SUBSYS(eventtrace, 1, 5)
+SUBSYS(prioritycache, 1, 5)
+SUBSYS(test, 0, 5)
+SUBSYS(cephfs_mirror, 0, 5)
+SUBSYS(cephsqlite, 0, 5)
+SUBSYS(seastore, 0, 5)       // logs above seastore tm
+SUBSYS(seastore_onode, 0, 5)
+SUBSYS(seastore_odata, 0, 5)
+SUBSYS(seastore_omap, 0, 5)
+SUBSYS(seastore_tm, 0, 5)    // logs below seastore tm
+SUBSYS(seastore_t, 0, 5)
+SUBSYS(seastore_cleaner, 0, 5)
+SUBSYS(seastore_epm, 0, 5)
+SUBSYS(seastore_lba, 0, 5)
+SUBSYS(seastore_fixedkv_tree, 0, 5)
+SUBSYS(seastore_cache, 0, 5)
+SUBSYS(seastore_journal, 0, 5)
+SUBSYS(seastore_device, 0, 5)
+SUBSYS(seastore_backref, 0, 5)
+SUBSYS(alienstore, 0, 5)
+SUBSYS(mclock, 1, 5)
+SUBSYS(cyanstore, 0, 5)
+SUBSYS(ceph_exporter, 1, 5)
+SUBSYS(memstore, 1, 5)
+// *********************************************************************
+// Developers should update /doc/rados/troubleshooting/log-and-debug.rst
+// when adding or removing a subsystem accordingly.
+// *********************************************************************
+
diff --git a/src/common/subsys_types.h b/src/common/subsys_types.h
new file mode 100644
index 000000000..bd7cc439e
--- /dev/null
+++ b/src/common/subsys_types.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SUBSYS_TYPES_H
+#define CEPH_SUBSYS_TYPES_H
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+
+// Numeric identifier of every logging subsystem. The enumerators after the
+// default one are generated by expanding the SUBSYS() entries of
+// common/subsys.h in file order, so SUBSYS(foo, ...) yields ceph_subsys_foo.
+enum ceph_subsys_id_t {
+  ceph_subsys_,   // default
+#define SUBSYS(name, log, gather) \
+  ceph_subsys_##name,
+// DEFAULT_SUBSYS expands to nothing here: the default slot is the
+// ceph_subsys_ enumerator written out above.
+#define DEFAULT_SUBSYS(log, gather)
+#include "common/subsys.h"
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+  ceph_subsys_max  // one past the last subsystem id
+};
+
+// Number of subsystem slots (the default slot included), as a size_t.
+constexpr static std::size_t ceph_subsys_get_num() {
+  constexpr auto slot_count = static_cast<std::size_t>(ceph_subsys_max);
+  return slot_count;
+}
+
+// One row of the subsystem table: the subsystem's name together with the
+// two default levels given in its SUBSYS()/DEFAULT_SUBSYS() entry.
+struct ceph_subsys_item_t {
+  const char* name;           // subsystem name
+  std::uint8_t log_level;     // second-to-last macro argument ("log")
+  std::uint8_t gather_level;  // last macro argument ("gather")
+};
+
+// Build the table of all subsystems as a std::array, one element per entry
+// of common/subsys.h and in the same order. The default entry is given the
+// name "none"; every other entry uses its stringified SUBSYS() name.
+constexpr static std::array<ceph_subsys_item_t, ceph_subsys_get_num()>
+ceph_subsys_get_as_array() {
+#define SUBSYS(name, log, gather) \
+  ceph_subsys_item_t{ #name, log, gather },
+#define DEFAULT_SUBSYS(log, gather) \
+  ceph_subsys_item_t{ "none", log, gather },
+
+  // Each entry expands to one initializer followed by a comma.
+  return {
+#include "common/subsys.h"
+  };
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+}
+
+// Larger of the two default levels (log, gather) of the subsystem stored at
+// index `subidx` of the subsystem table.
+constexpr static std::uint8_t
+ceph_subsys_get_max_default_level(const std::size_t subidx) {
+  const ceph_subsys_item_t entry = ceph_subsys_get_as_array()[subidx];
+  return entry.log_level < entry.gather_level ? entry.gather_level
+                                              : entry.log_level;
+}
+
+// std::strlen look-alike that can be evaluated at compile time. It exists
+// only because C++17 does not require the standard function to be
+// constexpr.
+constexpr static std::size_t strlen_ct(const char* const s) {
+  const char* cursor = s;
+  for (; *cursor != '\0'; ++cursor) {
+    // walk forward to the terminating NUL
+  }
+  return static_cast<std::size_t>(cursor - s);
+}
+
+// Length of the longest subsystem name. Every entry of common/subsys.h is
+// expanded to the length of its name ("none" for the default entry) and
+// the maximum over that list is returned.
+constexpr static std::size_t ceph_subsys_max_name_length() {
+  return std::max({
+#define SUBSYS(name, log, gather) \
+  strlen_ct(#name),
+#define DEFAULT_SUBSYS(log, gather) \
+  strlen_ct("none"),
+#include "common/subsys.h"
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+  });
+}
+
+#endif // CEPH_SUBSYS_TYPES_H
+
diff --git a/src/common/sync_filesystem.h b/src/common/sync_filesystem.h
new file mode 100644
index 000000000..f457f655d
--- /dev/null
+++ b/src/common/sync_filesystem.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SYNC_FILESYSTEM_H
+#define CEPH_SYNC_FILESYSTEM_H
+
+#include <unistd.h>
+
+#if defined(__linux__)
+#include <sys/ioctl.h>
+#include <syscall.h>
+#include "os/fs/btrfs_ioctl.h"
+#endif
+
+/* Flush the filesystem that `fd` lives on.
+ *
+ * Returns 0 on success and -errno when the per-filesystem sync fails with
+ * anything other than ENOSYS.
+ */
+inline int sync_filesystem(int fd)
+{
+  /* On Linux, newer versions of glibc have a function called syncfs that
+   * performs a sync on only one filesystem. If we don't have this call, we
+   * have to fall back on sync(), which synchronizes every filesystem on the
+   * computer. */
+#if defined(HAVE_SYS_SYNCFS) || defined(SYS_syncfs) || defined(__NR_syncfs)
+#  if defined(HAVE_SYS_SYNCFS)
+  const bool synced = (syncfs(fd) == 0);
+#  elif defined(SYS_syncfs)
+  const bool synced = (syscall(SYS_syncfs, fd) == 0);
+#  else
+  const bool synced = (syscall(__NR_syncfs, fd) == 0);
+#  endif
+  if (synced)
+    return 0;
+  // Only a missing system call falls through to the global sync().
+  if (errno != ENOSYS)
+    return -errno;
+#endif
+
+  sync();
+  return 0;
+}
+
+#endif
diff --git a/src/common/tracer.cc b/src/common/tracer.cc
new file mode 100644
index 000000000..ffabc0b20
--- /dev/null
+++ b/src/common/tracer.cc
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "global/global_context.h"
+#include "tracer.h"
+
+#ifdef HAVE_JAEGER
+#include "opentelemetry/sdk/trace/batch_span_processor.h"
+#include "opentelemetry/sdk/trace/tracer_provider.h"
+#include "opentelemetry/exporters/jaeger/jaeger_exporter.h"
+
+namespace tracing {
+
+// Fallback objects shared by every Tracer. noop_tracer is obtained from the
+// tracer provider that is registered when this translation unit is
+// initialized, and noop_span is a span started from it.
+// NOTE(review): presumably that provider is still the library's default
+// no-op one at this point -- confirm.
+// noop_span must stay below noop_tracer: it is initialized from it.
+const opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> Tracer::noop_tracer = opentelemetry::trace::Provider::GetTracerProvider()->GetTracer("no-op", OPENTELEMETRY_SDK_VERSION);
+const jspan Tracer::noop_span = noop_tracer->StartSpan("noop");
+
+using bufferlist = ceph::buffer::list;
+
+// Construct a tracer and immediately initialize it for `service_name`.
+Tracer::Tracer(opentelemetry::nostd::string_view service_name) {
+  init(service_name);
+}
+
+// Set up the Jaeger exporter pipeline and fetch the tracer for
+// `service_name`. Does nothing when a tracer has already been created.
+void Tracer::init(opentelemetry::nostd::string_view service_name) {
+  if (tracer) {
+    return;
+  }
+  namespace sdk = opentelemetry::sdk;
+  namespace jaeger_exporter_ns = opentelemetry::exporter::jaeger;
+
+  jaeger_exporter_ns::JaegerExporterOptions exporter_options;
+  if (g_ceph_context) {
+    // The agent port is only overridden when a global context exists.
+    exporter_options.server_port = g_ceph_context->_conf.get_val<int64_t>("jaeger_agent_port");
+  }
+  const sdk::trace::BatchSpanProcessorOptions processor_options;
+  const auto resource = sdk::resource::Resource::Create(
+    sdk::resource::ResourceAttributes{{"service.name", service_name}});
+
+  // exporter -> batch processor -> provider; each stage owns the previous.
+  std::unique_ptr<sdk::trace::SpanExporter> exporter(
+    new jaeger_exporter_ns::JaegerExporter(exporter_options));
+  std::unique_ptr<sdk::trace::SpanProcessor> processor(
+    new sdk::trace::BatchSpanProcessor(std::move(exporter), processor_options));
+  const opentelemetry::nostd::shared_ptr<opentelemetry::trace::TracerProvider> provider(
+    new sdk::trace::TracerProvider(std::move(processor), resource));
+
+  opentelemetry::trace::Provider::SetTracerProvider(provider);
+  tracer = provider->GetTracer(service_name, OPENTELEMETRY_SDK_VERSION);
+}
+
+// Start a root span named `trace_name`, or hand back the shared no-op span
+// when tracing is disabled.
+jspan Tracer::start_trace(opentelemetry::nostd::string_view trace_name) {
+  if (!is_enabled()) {
+    return noop_span;
+  }
+  return tracer->StartSpan(trace_name);
+}
+
+// Start a span named `trace_name` on the real tracer when
+// `trace_is_enabled` is true, otherwise on the no-op tracer. The
+// configuration flag is not consulted here.
+jspan Tracer::start_trace(opentelemetry::nostd::string_view trace_name, bool trace_is_enabled) {
+  const auto& source = trace_is_enabled ? tracer : noop_tracer;
+  return source->StartSpan(trace_name);
+}
+
+// Start a span named `span_name` as a child of `parent_span`. The shared
+// no-op span is returned when tracing is disabled or the parent is not
+// recording.
+jspan Tracer::add_span(opentelemetry::nostd::string_view span_name, const jspan& parent_span) {
+  if (!is_enabled() || !parent_span->IsRecording()) {
+    return noop_span;
+  }
+  opentelemetry::trace::StartSpanOptions child_opts;
+  child_opts.parent = parent_span->GetContext();
+  return tracer->StartSpan(span_name, child_opts);
+}
+
+// Start a span named `span_name` whose parent is described by `parent_ctx`.
+// The shared no-op span is returned when tracing is disabled or the
+// context is not valid.
+jspan Tracer::add_span(opentelemetry::nostd::string_view span_name, const jspan_context& parent_ctx) {
+  if (!is_enabled() || !parent_ctx.IsValid()) {
+    return noop_span;
+  }
+  opentelemetry::trace::StartSpanOptions child_opts;
+  child_opts.parent = parent_ctx;
+  return tracer->StartSpan(span_name, child_opts);
+}
+
+// Whether tracing is switched on, according to the jaeger_tracing_enable
+// configuration value. init() tolerates a missing global context, so this
+// must too: without a context tracing is reported as disabled instead of
+// dereferencing a null pointer.
+bool Tracer::is_enabled() const {
+  return g_ceph_context && g_ceph_context->_conf->jaeger_tracing_enable;
+}
+
+// Serialize `span_ctx` into `bl`.
+// Inside the ENCODE_START/ENCODE_FINISH envelope this writes a validity
+// flag and, only when the context is valid, TraceId::kSize bytes of trace
+// id, SpanId::kSize bytes of span id and the trace flags.
+// `f` is not referenced by the body.
+void encode(const jspan_context& span_ctx, bufferlist& bl, uint64_t f) {
+  ENCODE_START(1, 1, bl);
+  using namespace opentelemetry;
+  using namespace trace;
+  auto is_valid = span_ctx.IsValid();
+  encode(is_valid, bl);
+  if (is_valid) {
+    // NOTE(review): encode_nohead presumably emits the raw bytes without a
+    // length prefix, matching the fixed-size array reads in decode() --
+    // confirm.
+    encode_nohead(std::string_view(reinterpret_cast<const char*>(span_ctx.trace_id().Id().data()), TraceId::kSize), bl);
+    encode_nohead(std::string_view(reinterpret_cast<const char*>(span_ctx.span_id().Id().data()), SpanId::kSize), bl);
+    encode(span_ctx.trace_flags().flags(), bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+// Read a span context from `bl`, in the order encode() wrote it.
+// When the decoded validity flag is false nothing is assigned, so
+// `span_ctx` keeps whatever value the caller passed in.
+void decode(jspan_context& span_ctx, bufferlist::const_iterator& bl) {
+  using namespace opentelemetry;
+  using namespace trace;
+  DECODE_START(1, bl);
+  bool is_valid;
+  decode(is_valid, bl);
+  if (is_valid) {
+    std::array<uint8_t, TraceId::kSize> trace_id;
+    std::array<uint8_t, SpanId::kSize> span_id;
+    uint8_t flags;
+    decode(trace_id, bl);
+    decode(span_id, bl);
+    decode(flags, bl);
+    // NOTE(review): the trailing `true` is presumably SpanContext's
+    // is_remote argument -- confirm against the OpenTelemetry headers.
+    span_ctx = SpanContext(
+      TraceId(nostd::span<uint8_t, TraceId::kSize>(trace_id)),
+      SpanId(nostd::span<uint8_t, SpanId::kSize>(span_id)),
+      TraceFlags(flags),
+      true);
+  }
+  DECODE_FINISH(bl);
+}
+} // namespace tracing
+
+#endif // HAVE_JAEGER
diff --git a/src/common/tracer.h b/src/common/tracer.h
new file mode 100644
index 000000000..9d13c78aa
--- /dev/null
+++ b/src/common/tracer.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "acconfig.h"
+#include "include/buffer.h"
+
+#ifdef HAVE_JAEGER
+#include "opentelemetry/trace/provider.h"
+
+// Short names for the OpenTelemetry types used throughout the tracing
+// code. The !HAVE_JAEGER branch of this header defines stand-ins with the
+// same names.
+using jspan = opentelemetry::nostd::shared_ptr<opentelemetry::trace::Span>;
+using jspan_context = opentelemetry::trace::SpanContext;
+using jspan_attribute = opentelemetry::common::AttributeValue;
+
+namespace tracing {
+
+// Thin wrapper around an OpenTelemetry tracer that hands out spans, or
+// no-op spans when tracing is disabled.
+class Tracer {
+ private:
+  // fallbacks shared by all instances; defined in tracer.cc
+  const static opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> noop_tracer;
+  const static jspan noop_span;
+  // the real tracer; empty until init() has run
+  opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> tracer;
+
+ public:
+  // leaves the tracer uninitialized; call init() before starting spans
+  Tracer() = default;
+  // constructs the tracer and calls init(service_name)
+  Tracer(opentelemetry::nostd::string_view service_name);
+
+  // creates the underlying tracer for `service_name`;
+  // does nothing if it has already been created
+  void init(opentelemetry::nostd::string_view service_name);
+
+  // returns the value of the jaeger_tracing_enable configuration option
+  bool is_enabled() const;
+  // creates and returns a new span named `trace_name`.
+  // this span represents a trace, since it has no parent.
+  // a noop span is returned when tracing is disabled.
+  jspan start_trace(opentelemetry::nostd::string_view trace_name);
+
+  // creates and returns a new span named `trace_name`.
+  // if false is given as `trace_is_enabled`, a noop span is returned.
+  jspan start_trace(opentelemetry::nostd::string_view trace_name, bool trace_is_enabled);
+
+  // creates and returns a new span named `span_name` whose parent span is
+  // `parent_span`; a noop span is returned when tracing is disabled or the
+  // parent is not recording.
+  jspan add_span(opentelemetry::nostd::string_view span_name, const jspan& parent_span);
+  // creates and returns a new span named `span_name`.
+  // the span is added to the trace whose context is `parent_ctx`;
+  // parent_ctx contains the required information about the trace.
+  // a noop span is returned when tracing is disabled or the context is
+  // not valid.
+  jspan add_span(opentelemetry::nostd::string_view span_name, const jspan_context& parent_ctx);
+
+};
+
+// Serialize a span context into, respectively read one back from, a
+// buffer list, so that it can be carried between processes.
+void encode(const jspan_context& span, ceph::buffer::list& bl, uint64_t f = 0);
+void decode(jspan_context& span_ctx, ceph::buffer::list::const_iterator& bl);
+
+} // namespace tracing
+
+
+#else  // !HAVE_JAEGER
+
+#include <string_view>
+
+// Stand-ins used when Ceph is built without Jaeger support. They accept
+// the same calls as the OpenTelemetry types they replace and do nothing.
+
+// Accepts a value of any type and discards it.
+class Value {
+ public:
+  template <typename T> Value(T val) {}
+};
+
+using jspan_attribute = Value;
+
+// Empty replacement for the span context; both constructors ignore their
+// arguments.
+struct jspan_context {
+  jspan_context() {}
+  jspan_context(bool sampled_flag, bool is_remote) {}
+};
+
+// Span replacement: every operation is a no-op and the span never records.
+struct span_stub {
+  jspan_context _ctx;
+  template <typename T>
+  void SetAttribute(std::string_view key, const T& value) const noexcept {}
+  void AddEvent(std::string_view) {}
+  void AddEvent(std::string_view, std::initializer_list<std::pair<std::string_view, jspan_attribute>> fields) {}
+  template <typename T> void AddEvent(std::string_view name, const T& fields = {}) {}
+  // GetContext() and IsRecording() are const so that they can be called
+  // through a `const jspan&`, exactly as the Jaeger-enabled code does.
+  const jspan_context& GetContext() const { return _ctx; }
+  void UpdateName(std::string_view) {}
+  bool IsRecording() const { return false; }
+};
+
+// Pointer-like handle around a span_stub; always converts to false.
+class jspan {
+  span_stub span;
+ public:
+  span_stub& operator*() { return span; }
+  const span_stub& operator*() const { return span; }
+
+  span_stub* operator->() { return &span; }
+  const span_stub* operator->() const { return &span; }
+
+  operator bool() const { return false; }
+};
+
+namespace tracing {
+
+// Inert tracer used when Ceph is built without Jaeger support: tracing is
+// never enabled and every span handed out is a default-constructed stub.
+struct Tracer {
+  void init(std::string_view service_name) {}
+  bool is_enabled() const { return false; }
+
+  jspan start_trace(std::string_view, bool enabled = true) { return jspan{}; }
+  jspan add_span(std::string_view, const jspan&) { return jspan{}; }
+  jspan add_span(std::string_view span_name, const jspan_context& parent_ctx) { return jspan{}; }
+};
+
+// Nothing is written to or read from the buffer in this configuration.
+inline void encode(const jspan_context& span, ceph::buffer::list& bl, uint64_t f = 0) {}
+inline void decode(jspan_context& span_ctx, ceph::buffer::list::const_iterator& bl) {}
+
+} // namespace tracing
+
+#endif // !HAVE_JAEGER
diff --git a/src/common/tracked_int_ptr.hpp b/src/common/tracked_int_ptr.hpp
new file mode 100644
index 000000000..fc54c4cb9
--- /dev/null
+++ b/src/common/tracked_int_ptr.hpp
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_TRACKEDINTPTR_H
+#define CEPH_TRACKEDINTPTR_H
+
+
+// Intrusive smart pointer that remembers an id for the reference it holds.
+// A reference is taken with get_with_id(ptr), which yields the id, and is
+// released with put_with_id(ptr, id). An empty pointer always has id 0.
+template <class T>
+class TrackedIntPtr {
+  T *ptr;
+  uint64_t id;
+public:
+  TrackedIntPtr() : ptr(nullptr), id(0) {}
+  // Takes a new reference on `ptr` unless it is null.
+  TrackedIntPtr(T *ptr) : ptr(ptr), id(ptr ? get_with_id(ptr) : 0) {}
+  // Copies take their own reference and therefore get their own id.
+  // (`id` is initialized from the member `ptr`, which is declared first.)
+  TrackedIntPtr(const TrackedIntPtr &rhs) :
+    ptr(rhs.ptr), id(ptr ? get_with_id(ptr) : 0) {}
+  ~TrackedIntPtr() {
+    if (!ptr) {
+      ceph_assert(id == 0);
+      return;
+    }
+    put_with_id(ptr, id);
+  }
+
+  // Exchange pointer and id with `other`; no reference is taken or dropped.
+  void swap(TrackedIntPtr &other) {
+    T *const my_ptr = ptr;
+    const uint64_t my_id = id;
+    ptr = other.ptr;
+    id = other.id;
+    other.ptr = my_ptr;
+    other.id = my_id;
+  }
+
+  // Copy-and-swap: the new reference is taken before the old one is
+  // released by the temporary's destructor.
+  TrackedIntPtr& operator=(const TrackedIntPtr &rhs) {
+    TrackedIntPtr(rhs.ptr).swap(*this);
+    return *this;
+  }
+
+  T &operator*() const { return *ptr; }
+  T *operator->() const { return ptr; }
+  T *get() const { return ptr; }
+
+  operator bool() const {
+    return ptr != nullptr;
+  }
+  // Ordering and equality compare the raw pointers only; ids are ignored.
+  bool operator<(const TrackedIntPtr &other) const {
+    return ptr < other.ptr;
+  }
+  bool operator==(const TrackedIntPtr &other) const {
+    return ptr == other.ptr;
+  }
+
+  // Release the held reference, if any, and become empty.
+  void reset() {
+    if (ptr) {
+      put_with_id(ptr, id);
+    }
+    ptr = nullptr;
+    id = 0;
+  }
+};
+
+#endif
diff --git a/src/common/types.cc b/src/common/types.cc
new file mode 100644
index 000000000..7f11cd798
--- /dev/null
+++ b/src/common/types.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+#ifndef __CEPH_TYPES_H
+#define __CEPH_TYPES_H
+
+#include <include/types.h>
+
+#ifndef UINT8_MAX
+#define UINT8_MAX (255)
+#endif
+
+// Out-of-line definition of the NO_SHARD constant, constructed from -1.
+const shard_id_t shard_id_t::NO_SHARD(-1);
+
+// Print a shard id as an unsigned decimal number. The id is reduced to
+// eight bits first, so that it is never printed as a character or as a
+// negative value.
+std::ostream& operator<<(std::ostream& lhs, const shard_id_t& rhs)
+{
+  const auto as_byte = static_cast<uint8_t>(rhs.id);
+  lhs << static_cast<unsigned>(as_byte);
+  return lhs;
+}
+
+#endif
diff --git a/src/common/url_escape.cc b/src/common/url_escape.cc
new file mode 100644
index 000000000..6580d28c6
--- /dev/null
+++ b/src/common/url_escape.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "url_escape.h"
+
+#include <stdexcept>
+#include <sstream>
+
+// Percent-encode `s`.
+//
+// ASCII letters and digits, '-', '.', '_', '~' and '/' are copied through
+// unchanged; every other byte is written as '%' followed by two lowercase
+// hexadecimal digits.
+//
+// Bytes are classified with explicit ASCII comparisons rather than
+// std::isalnum: calling the latter with a negative `char` (any byte above
+// 0x7f where char is signed) is undefined behaviour, and its answer
+// depends on the current locale.
+std::string url_escape(const std::string& s)
+{
+  static constexpr char hex_digits[] = "0123456789abcdef";
+  const auto is_unreserved = [](unsigned char c) {
+    return (c >= '0' && c <= '9') ||
+      (c >= 'a' && c <= 'z') ||
+      (c >= 'A' && c <= 'Z') ||
+      c == '-' || c == '.' || c == '_' || c == '~' || c == '/';
+  };
+
+  std::string out;
+  out.reserve(s.size());
+  for (const char ch : s) {
+    const auto c = static_cast<unsigned char>(ch);
+    if (is_unreserved(c)) {
+      out.push_back(ch);
+    } else {
+      out.push_back('%');
+      out.push_back(hex_digits[c >> 4]);
+      out.push_back(hex_digits[c & 0x0f]);
+    }
+  }
+  return out;
+}
+
+// Reverse of url_escape(): every "%XY" sequence (X and Y hexadecimal
+// digits of either case) becomes the byte 0xXY, everything else is copied
+// through. Throws std::runtime_error when a '%' is not followed by two
+// hexadecimal digits; the message names the offending position.
+std::string url_unescape(const std::string& s)
+{
+  using size_type = std::string::size_type;
+
+  // Always throws; `pos` is the index of the bad or missing character.
+  const auto fail = [&s](size_type pos) {
+    std::ostringstream ss;
+    ss << "invalid escaped string at pos " << pos << " of '"
+       << s << "'";
+    throw std::runtime_error(ss.str());
+  };
+  // Value of a hexadecimal digit, or -1 if `ch` is not one.
+  const auto hex_value = [](char ch) -> int {
+    if (ch >= '0' && ch <= '9')
+      return ch - '0';
+    if (ch >= 'a' && ch <= 'f')
+      return ch - 'a' + 10;
+    if (ch >= 'A' && ch <= 'F')
+      return ch - 'A' + 10;
+    return -1;
+  };
+
+  std::string out;
+  const size_type len = s.size();
+  for (size_type i = 0; i < len; ++i) {
+    if (s[i] != '%') {
+      out.push_back(s[i]);
+      continue;
+    }
+    unsigned char v = 0;
+    for (unsigned k = 0; k < 2; ++k) {
+      ++i;
+      if (i >= len) {
+	fail(i);
+      }
+      const int digit = hex_value(s[i]);
+      if (digit < 0) {
+	fail(i);
+      }
+      v = static_cast<unsigned char>((v << 4) + digit);
+    }
+    out.push_back(v);
+  }
+  return out;
+}
diff --git a/src/common/url_escape.h b/src/common/url_escape.h
new file mode 100644
index 000000000..3cb539b10
--- /dev/null
+++ b/src/common/url_escape.h
@@ -0,0 +1,9 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+
+// Percent-encode `s`: alphanumeric characters and "-._~/" are kept, every
+// other byte becomes "%xx" with lowercase hexadecimal digits.
+extern std::string url_escape(const std::string& s);
+// Decode "%xx" sequences in `s`. Throws std::runtime_error when a '%' is
+// not followed by two hexadecimal digits.
+extern std::string url_unescape(const std::string& s);
diff --git a/src/common/utf8.c b/src/common/utf8.c
new file mode 100644
index 000000000..3a05789f6
--- /dev/null
+++ b/src/common/utf8.c
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+
+#include <string.h>
+
+/*
+ * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
+ *
+ * Table 3-7. Well-Formed UTF-8 Byte Sequences
+ *
+ * +--------------------+------------+-------------+------------+-------------+
+ * | Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+0000..U+007F     | 00..7F     |             |            |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+0080..U+07FF     | C2..DF     | 80..BF      |            |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+0800..U+0FFF     | E0         | A0..BF      | 80..BF     |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+1000..U+CFFF     | E1..EC     | 80..BF      | 80..BF     |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+D000..U+D7FF     | ED         | 80..9F      | 80..BF     |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+E000..U+FFFF     | EE..EF     | 80..BF      | 80..BF     |             |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+10000..U+3FFFF   | F0         | 90..BF      | 80..BF     | 80..BF      |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+40000..U+FFFFF   | F1..F3     | 80..BF      | 80..BF     | 80..BF      |
+ * +--------------------+------------+-------------+------------+-------------+
+ * | U+100000..U+10FFFF | F4         | 80..8F      | 80..BF     | 80..BF      |
+ * +--------------------+------------+-------------+------------+-------------+
+ */
+
+/* Count the consecutive set bits of 'c' starting at bit 7 and going down,
+ * i.e. the number of leading one bits of a byte value.
+ */
+static int high_bits_set(int c)
+{
+	int count;
+
+	for (count = 0; (c & 0x80) == 0x80; ++count)
+		c <<= 1;
+	return count;
+}
+
+/* Encode a code point of up to 31 bits into 'buf' using the UTF-8 scheme.
+ * Assumes buf is of size MAX_UTF8_SZ.
+ * Returns -1 if 'u' needs more than 31 bits; otherwise the number of bytes
+ * written (1..6).
+ */
+int encode_utf8(unsigned long u, unsigned char *buf)
+{
+	/* Marker bits of the first byte, indexed by (sequence length - 1). */
+	static const unsigned char lead_marker[] = {
+		0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
+	};
+	int nbytes;
+
+	if (u <= 0x0000007F)
+		nbytes = 1;
+	else if (u <= 0x000007FF)
+		nbytes = 2;
+	else if (u <= 0x0000FFFF)
+		nbytes = 3;
+	else if (u <= 0x001FFFFF)
+		nbytes = 4;
+	else if (u <= 0x03FFFFFF)
+		nbytes = 5;	/* rare/illegal code points */
+	else if (u <= 0x7FFFFFFF)
+		nbytes = 6;	/* rare/illegal code points */
+	else
+		return -1;
+
+	/* Continuation bytes carry six bits each, last byte first. */
+	for (int i = nbytes - 1; i >= 1; --i) {
+		buf[i] = 0x80 | (u & 0x3F);
+		u >>= 6;
+	}
+	/* Whatever is left of 'u' fits next to the marker bits. */
+	buf[0] = lead_marker[nbytes - 1] | u;
+	return nbytes;
+}
+
+/*
+ * Decode one UTF-8 sequence of exactly 'nbytes' bytes starting at 'buf'
+ * and return its code point. INVALID_UTF8_CHAR is returned when 'nbytes'
+ * is not positive, when the first byte does not announce exactly 'nbytes'
+ * bytes, when a following byte is not a continuation byte, or when the
+ * result is 0xFFFE, 0xFFFF or a surrogate (0xD800..0xDFFF).
+ */
+unsigned long decode_utf8(unsigned char *buf, int nbytes)
+{
+	unsigned long code;
+	int lead_bits, k;
+
+	if (nbytes <= 0)
+		return INVALID_UTF8_CHAR;
+
+	/* A single byte must be plain 7-bit. */
+	if (nbytes == 1)
+		return (buf[0] < 0x80) ? buf[0] : INVALID_UTF8_CHAR;
+
+	/* The number of leading one bits announces the sequence length. */
+	lead_bits = high_bits_set(buf[0]);
+	if (lead_bits != nbytes)
+		return INVALID_UTF8_CHAR;
+
+	code = buf[0] & (0xff >> lead_bits);
+	for (k = 1; k < nbytes; ++k) {
+		/* continuation bytes look like 10xxxxxx */
+		if ((buf[k] >> 6) != 0x2)
+			return INVALID_UTF8_CHAR;
+		code = (code << 6) | (buf[k] & 0x3f);
+	}
+
+	/* Check for invalid code points */
+	if (code == 0xFFFE || code == 0xFFFF ||
+	    (code >= 0xD800 && code <= 0xDFFF))
+		return INVALID_UTF8_CHAR;
+
+	return code;
+}
+
+/* Is 'b' a UTF-8 continuation byte (0x80..0xBF)? */
+static int utf8_is_continuation(unsigned char b)
+{
+	return (b & 0xC0) == 0x80;
+}
+
+/* Validate 'len' bytes at 'buf' against the table of well-formed UTF-8
+ * byte sequences at the top of this file.
+ * Returns 0 if the whole buffer is well formed, otherwise one plus the
+ * offset at which the first malformed sequence starts.
+ */
+int check_utf8(const char *buf, int len)
+{
+	/*
+	 * Plain "char" is signed on some platforms and unsigned on others;
+	 * all comparisons below are done on unsigned bytes.
+	 */
+	const unsigned char *p = (const unsigned char *)buf;
+	int err_pos = 1;
+
+	while (len) {
+		const unsigned char b1 = p[0];
+		int nbytes;
+
+		if (b1 <= 0x7F) {
+			/* 00..7F */
+			nbytes = 1;
+		} else if (len >= 2 && b1 >= 0xC2 && b1 <= 0xDF &&
+			   utf8_is_continuation(p[1])) {
+			/* C2..DF, 80..BF */
+			nbytes = 2;
+		} else if (len >= 3 &&
+			   utf8_is_continuation(p[1]) &&
+			   utf8_is_continuation(p[2]) &&
+			   /* E0, A0..BF, 80..BF */
+			   ((b1 == 0xE0 && p[1] >= 0xA0) ||
+			    /* E1..EC, 80..BF, 80..BF */
+			    (b1 >= 0xE1 && b1 <= 0xEC) ||
+			    /* ED, 80..9F, 80..BF */
+			    (b1 == 0xED && p[1] <= 0x9F) ||
+			    /* EE..EF, 80..BF, 80..BF */
+			    (b1 >= 0xEE && b1 <= 0xEF))) {
+			nbytes = 3;
+		} else if (len >= 4 &&
+			   utf8_is_continuation(p[1]) &&
+			   utf8_is_continuation(p[2]) &&
+			   utf8_is_continuation(p[3]) &&
+			   /* F0, 90..BF, 80..BF, 80..BF */
+			   ((b1 == 0xF0 && p[1] >= 0x90) ||
+			    /* F1..F3, 80..BF, 80..BF, 80..BF */
+			    (b1 >= 0xF1 && b1 <= 0xF3) ||
+			    /* F4, 80..8F, 80..BF, 80..BF */
+			    (b1 == 0xF4 && p[1] <= 0x8F))) {
+			nbytes = 4;
+		} else {
+			return err_pos;
+		}
+
+		len -= nbytes;
+		err_pos += nbytes;
+		p += nbytes;
+	}
+
+	return 0;
+}
+
+/* check_utf8() for a null-terminated string: the length is taken from
+ * strlen().
+ */
+int check_utf8_cstr(const char *buf)
+{
+	const size_t nbytes = strlen(buf);
+
+	return check_utf8(buf, nbytes);
+}
+
+/* Returns 1 if 'c' is 0x7f, or is non-zero and below 0x20; 0 otherwise.
+ * Zero itself is deliberately not treated as a control character.
+ */
+int is_control_character(int c)
+{
+	if (c == 0x7f)
+		return 1;
+	return (c != 0) && (c < 0x20);
+}
+
+/* Scan the first 'len' bytes of 'buf' for control characters.
+ * Returns 0 if there are none, otherwise one plus the offset of the first
+ * control character.
+ */
+int check_for_control_characters(const char *buf, int len)
+{
+	int pos = 0;
+
+	while (pos < len) {
+		/* go through unsigned char so bytes >= 0x80 stay positive */
+		const int c = (int)(unsigned char)buf[pos];
+
+		++pos;
+		if (is_control_character(c))
+			return pos;
+	}
+	return 0;
+}
+
+/* check_for_control_characters() for a null-terminated string: the length
+ * is taken from strlen().
+ */
+int check_for_control_characters_cstr(const char *buf)
+{
+	const size_t nbytes = strlen(buf);
+
+	return check_for_control_characters(buf, nbytes);
+}
diff --git a/src/common/utf8.h b/src/common/utf8.h
new file mode 100644
index 000000000..83efe6fd6
--- /dev/null
+++ b/src/common/utf8.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_UTF8_H
+#define CEPH_COMMON_UTF8_H
+
+#define MAX_UTF8_SZ 6
+#define INVALID_UTF8_CHAR 0xfffffffful
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Checks if a buffer is valid UTF-8.
+ * Returns 0 if it is, and one plus the offset at which the first invalid
+ * byte sequence starts if it is not.
+ */
+int check_utf8(const char *buf, int len);
+
+/* Checks if a null-terminated string is valid UTF-8.
+ * Returns 0 if it is, and one plus the offset at which the first invalid
+ * byte sequence starts if it is not.
+ */
+int check_utf8_cstr(const char *buf);
+
+/* Returns non-zero if 'ch' is a control character: 0x7f, or any non-zero
+ * value below 0x20.
+ * We do count newline as a control character, but not NUL (0).
+ */
+int is_control_character(int ch);
+
+/* Checks if a buffer contains control characters.
+ * Returns 0 if it does not, and one plus the offset of the first control
+ * character if it does.
+ */
+int check_for_control_characters(const char *buf, int len);
+
+/* Checks if a null-terminated string contains control characters.
+ * Returns 0 if it does not, and one plus the offset of the first control
+ * character if it does.
+ */
+int check_for_control_characters_cstr(const char *buf);
+
+/* Encode a 31-bit UTF8 code point to 'buf'.
+ * Assumes buf is of size MAX_UTF8_SZ
+ * Returns -1 on failure; number of bytes in the encoded value otherwise.
+ */
+int encode_utf8(unsigned long u, unsigned char *buf);
+
+/*
+ * Decode a UTF8 character from an array of 'nbytes' bytes. Return the
+ * character code.
+ * Upon error, return INVALID_UTF8_CHAR.
+ */
+unsigned long decode_utf8(unsigned char *buf, int nbytes);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/util.cc b/src/common/util.cc
new file mode 100644
index 000000000..dc676c264
--- /dev/null
+++ b/src/common/util.cc
@@ -0,0 +1,460 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef _WIN32
+#include <sys/utsname.h>
+#endif
+
+#include <fstream>
+#include <boost/algorithm/string.hpp>
+
+#include "include/compat.h"
+#include "include/util.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/version.h"
+
+#ifdef HAVE_SYS_VFS_H
+#include <sys/vfs.h>
+#endif
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+#endif
+
+#include <string>
+
+#include <stdio.h>
+
+using std::list;
+using std::map;
+using std::string;
+
+using ceph::bufferlist;
+using ceph::Formatter;
+
+#ifndef _WIN32
+// Report capacity and usage of the filesystem that contains 'path'.
+// Returns 0 on success, -EINVAL if 'path' is null, -errno if statfs() fails.
+int get_fs_stats(ceph_data_stats_t &stats, const char *path)
+{
+  if (!path)
+    return -EINVAL;
+  struct statfs stbuf;
+  if (::statfs(path, &stbuf) < 0)
+    return -errno;
+
+  stats.byte_total = stbuf.f_blocks * stbuf.f_bsize;
+  stats.byte_used = (stbuf.f_blocks - stbuf.f_bfree) * stbuf.f_bsize;
+  stats.byte_avail = stbuf.f_bavail * stbuf.f_bsize;
+  // Filesystems reporting zero blocks must not trigger a division by zero.
+  stats.avail_percent = stats.byte_total ? (((float)stats.byte_avail/stats.byte_total)*100) : 0;
+  return 0;
+}
+#else
+// Windows flavour: returns 0 on success, -EINVAL if GetDiskFreeSpaceExA() fails.
+int get_fs_stats(ceph_data_stats_t &stats, const char *path)
+{
+  ULARGE_INTEGER avail_bytes, total_bytes, total_free_bytes;
+  if (!GetDiskFreeSpaceExA(path, &avail_bytes,
+                           &total_bytes, &total_free_bytes)) {
+    return -EINVAL;
+  }
+  stats.byte_total = total_bytes.QuadPart;
+  stats.byte_used = total_bytes.QuadPart - total_free_bytes.QuadPart;
+  // may not be equal to total_free_bytes due to quotas
+  stats.byte_avail = avail_bytes.QuadPart;
+  // Guard against a zero-sized volume instead of dividing by zero.
+  stats.avail_percent = stats.byte_total ? ((float)stats.byte_avail / stats.byte_total) * 100 : 0;
+  return 0;
+}
+#endif
+
+// Strip leading/trailing whitespace and double quotes from 'value' in place;
+// returns a pointer to the first kept character inside the same buffer.
+static char* value_sanitize(char *value)
+{
+  // cast: <ctype.h> classifiers are undefined for negative (non-ASCII) chars
+  while (isspace((unsigned char)*value) || *value == '"')
+    value++;
+  char* end = value + strlen(value);  // one past the last kept character
+  while (end > value && (isspace((unsigned char)end[-1]) || end[-1] == '"'))
+    end--;
+  *end = '\0';
+  return value;
+}
+
+// If 'buf' starts with 'prefix', store the sanitized rest in (*pm)[key] and return true.
+static bool value_set(char *buf, const char *prefix,
+		      map<string, string> *pm, const char *key)
+{
+  const size_t prefix_len = strlen(prefix);
+  if (strncmp(buf, prefix, prefix_len) != 0)
+    return false;
+  (*pm)[key] = value_sanitize(buf + prefix_len);
+  return true;
+}
+
+// For each line of 'fp' starting with a prefix from 'kvm' (key -> prefix), store the sanitized value in 'm'.
+static void file_values_parse(const map<string, string>& kvm, FILE *fp, map<string, string> *m, CephContext *cct) {
+  char buf[512];
+  while (fgets(buf, sizeof(buf) - 1, fp) != NULL) {
+    for (auto& kv : kvm)
+      if (value_set(buf, kv.second.c_str(), m, kv.first.c_str()))
+        break;  // was a no-op `continue`; the line is consumed, stop trying prefixes
+  }
+}
+
+// Collect distro details into 'm'; returns false only when /etc/os-release can't be opened (Linux).
+static bool os_release_parse(map<string, string> *m, CephContext *cct)
+{
+#if defined(__linux__)
+  // key stored in 'm' -> line prefix looked up in /etc/os-release
+  static const map<string, string> kvm = {
+    { "distro", "ID=" },
+    { "distro_description", "PRETTY_NAME=" },
+    { "distro_version", "VERSION_ID=" }
+  };
+
+  FILE *fp = fopen("/etc/os-release", "r");
+  if (!fp) {
+    int ret = -errno;
+    lderr(cct) << "os_release_parse - failed to open /etc/os-release: " << cpp_strerror(ret) << dendl;
+    return false;
+  }
+  file_values_parse(kvm, fp, m, cct);
+  fclose(fp);
+#elif defined(__FreeBSD__)
+  struct utsname u;
+  int r = uname(&u);
+  if (!r) {
+     m->insert(std::make_pair("distro", u.sysname));
+     m->insert(std::make_pair("distro_description", u.version));
+     m->insert(std::make_pair("distro_version", u.release));
+  }
+#endif
+
+  return true;
+}
+
+// Populate 'm' with distro details and log an error for anything undetected.
+static void distro_detect(map<string, string> *m, CephContext *cct)
+{
+  if (!os_release_parse(m, cct))
+    lderr(cct) << "distro_detect - /etc/os-release is required" << dendl;
+
+  for (const char* required : {"distro", "distro_description"}) {
+    if (m->count(required) == 0)
+      lderr(cct) << "distro_detect - can't detect " << required << dendl;
+  }
+}
+
+// Read memory.limit_in_bytes of the memory cgroup into *limit.
+// *limit is 0 when no limit applies; it is left untouched on failure.
+// Returns 0 on success, -errno if the file can't be opened, -EINVAL if it
+// can't be read or parsed.
+int get_cgroup_memory_limit(uint64_t *limit)
+{
+#if defined(__linux__)
+  // /sys/fs/cgroup/memory/memory.limit_in_bytes
+
+  // the magic value 9223372036854771712 or 0x7ffffffffffff000
+  // appears to mean no limit.
+  FILE *f = fopen(PROCPREFIX "/sys/fs/cgroup/memory/memory.limit_in_bytes", "r");
+  if (!f) {
+    return -errno;
+  }
+  char buf[100];
+  int ret = 0;
+  long long value = 0;
+  if (!fgets(buf, sizeof(buf), f) || sscanf(buf, "%lld", &value) != 1) {
+    // Previously a failed sscanf() fell through and published an
+    // uninitialized 'value'; now *limit is only written after a good parse.
+    ret = -EINVAL;
+  } else if (value == 0x7ffffffffffff000) {
+    *limit = 0;  // no limit
+  } else {
+    *limit = value;
+  }
+  fclose(f);
+  return ret;
+#else
+  *limit = 0;  // nothing to read here: report "no limit" rather than garbage
+  return 0;
+#endif
+}
+
+#ifdef _WIN32
+// Fill *ver through ntdll's RtlGetVersion; returns 0 on success, -EINVAL otherwise.
+int get_windows_version(POSVERSIONINFOEXW ver) {
+  using  get_version_func_t = DWORD (WINAPI *)(OSVERSIONINFOEXW*);
+  // We'll load the library directly to avoid depending on the NTDDK.
+  HMODULE ntdll_lib = LoadLibraryW(L"Ntdll.dll");
+  if (!ntdll_lib) {
+    return -EINVAL;
+  }
+
+  // The standard "GetVersion" returned values depend on the application
+  // manifest. We'll get the "real" version by using the Rtl* version.
+  auto get_version_func = (
+    get_version_func_t)GetProcAddress(ntdll_lib, "RtlGetVersion");
+  int ret = 0;
+  if (!get_version_func || get_version_func(ver)) {
+    // RtlGetVersion returns non-zero values in case of errors.
+    ret = -EINVAL;
+  }
+
+  FreeLibrary(ntdll_lib);
+  return ret;
+}
+#endif
+
+// Gather version, OS/kernel, container, memory, CPU and distro details into
+// 'm' as string key/value pairs; 'cct' is handed to distro_detect() for logging.
+void collect_sys_info(map<string, string> *m, CephContext *cct)
+{
+  // version
+  (*m)["ceph_version"] = pretty_version_to_str();
+  (*m)["ceph_version_short"] = ceph_version_to_str();
+  (*m)["ceph_release"] = ceph_release_to_str();
+
+  #ifndef _WIN32
+  // kernel info
+  struct utsname u;
+  int r = uname(&u);
+  if (r >= 0) {
+    (*m)["os"] = u.sysname;
+    (*m)["kernel_version"] = u.release;
+    (*m)["kernel_description"] = u.version;
+    (*m)["hostname"] = u.nodename;
+    (*m)["arch"] = u.machine;
+  }
+  #else
+  OSVERSIONINFOEXW ver = {0};
+  ver.dwOSVersionInfoSize = sizeof(ver);
+  get_windows_version(&ver);
+  char version_str[64];
+  snprintf(version_str, 64, "%lu.%lu (%lu)",
+           ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber);
+
+  char hostname[64] = {0};  // stays an empty string if GetComputerNameA() fails
+  DWORD hostname_sz = sizeof(hostname);
+  GetComputerNameA(hostname, &hostname_sz);
+  SYSTEM_INFO sys_info;
+  const char* arch_str;
+  GetNativeSystemInfo(&sys_info);
+
+  switch (sys_info.wProcessorArchitecture) {
+    case PROCESSOR_ARCHITECTURE_AMD64:
+      arch_str = "x86_64";
+      break;
+    case PROCESSOR_ARCHITECTURE_INTEL:
+      arch_str = "x86";
+      break;
+    case PROCESSOR_ARCHITECTURE_ARM:
+      arch_str = "arm";
+      break;
+    default:
+      arch_str = "unknown";
+      break;
+  }
+
+  (*m)["os"] = "Windows";
+  (*m)["kernel_version"] = version_str;
+  (*m)["kernel_description"] = version_str;
+  (*m)["hostname"] = hostname;
+  (*m)["arch"] = arch_str;
+  #endif
+
+  // but wait, am i in a container?
+  bool in_container = false;
+
+  if (const char *pod_name = getenv("POD_NAME")) {
+    (*m)["pod_name"] = pod_name;
+    in_container = true;
+  }
+  if (const char *container_name = getenv("CONTAINER_NAME")) {
+    (*m)["container_name"] = container_name;
+    in_container = true;
+  }
+  if (const char *container_image = getenv("CONTAINER_IMAGE")) {
+    (*m)["container_image"] = container_image;
+    in_container = true;
+  }
+  if (in_container) {
+    if (const char *node_name = getenv("NODE_NAME")) {
+      (*m)["container_hostname"] = (*m)["hostname"];
+      (*m)["hostname"] = node_name;
+    }
+    if (const char *ns = getenv("POD_NAMESPACE")) {
+      (*m)["pod_namespace"] = ns;
+    }
+  }
+
+#ifdef __APPLE__
+  // memory
+  {
+    uint64_t size;
+    size_t len = sizeof(size);
+    r = sysctlbyname("hw.memsize", &size, &len, NULL, 0);
+    if (r == 0) {
+      (*m)["mem_total_kb"] = std::to_string(size / 1024);  // hw.memsize is in bytes
+    }
+  }
+  {
+    xsw_usage vmusage;
+    size_t len = sizeof(vmusage);
+    r = sysctlbyname("vm.swapusage", &vmusage, &len, NULL, 0);
+    if (r == 0) {
+      (*m)["mem_swap_kb"] = std::to_string(vmusage.xsu_total / 1024);  // bytes too
+    }
+  }
+  // processor
+  {
+    char buf[100];
+    size_t len = sizeof(buf);
+    r = sysctlbyname("machdep.cpu.brand_string", buf, &len, NULL, 0);
+    if (r == 0) {
+      buf[len - 1] = '\0';
+      (*m)["cpu"] = buf;
+    }
+  }
+#elif !defined(_WIN32)
+  // memory
+  if (std::ifstream f{PROCPREFIX "/proc/meminfo"}; !f.fail()) {
+    for (std::string line; std::getline(f, line); ) {
+      std::vector<string> parts;
+      boost::split(parts, line, boost::is_any_of(":\t "), boost::token_compress_on);
+      if (parts.size() != 3) {
+	continue;
+      }
+      if (parts[0] == "MemTotal") {
+	(*m)["mem_total_kb"] = parts[1];
+      } else if (parts[0] == "SwapTotal") {
+	(*m)["mem_swap_kb"] = parts[1];
+      }
+    }
+  }
+  uint64_t cgroup_limit;
+  if (get_cgroup_memory_limit(&cgroup_limit) == 0 &&
+      cgroup_limit > 0) {
+    (*m)["mem_cgroup_limit"] = std::to_string(cgroup_limit);
+  }
+
+  // processor
+  if (std::ifstream f{PROCPREFIX "/proc/cpuinfo"}; !f.fail()) {
+    for (std::string line; std::getline(f, line); ) {
+      std::vector<string> parts;
+      boost::split(parts, line, boost::is_any_of(":"));
+      if (parts.size() != 2) {
+	continue;
+      }
+      boost::trim(parts[0]);
+      boost::trim(parts[1]);
+      if (parts[0] == "model name") {
+	(*m)["cpu"] = parts[1];
+	break;
+      }
+    }
+  }
+#endif
+  // distro info
+  distro_detect(m, cct);
+}
+
+// Emit 'services' (host -> ids) as an object section named 'type'; every id
+// is dumped as an int under the key 'type'.
+void dump_services(Formatter* f, const map<string, list<int> >& services, const char* type)
+{
+  ceph_assert(f);
+
+  f->open_object_section(type);
+  for (const auto& [hostname, ids] : services) {
+    // one array section per host, named after the host
+    f->open_array_section(hostname.c_str());
+    for (const int id : ids) {
+      f->dump_int(type, id);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Emit 'services' (host -> names) as an object section named 'type' holding
+// one array section per host; every name is dumped as a string keyed 'type'.
+void dump_services(Formatter* f, const map<string, list<string> >& services, const char* type)
+{
+  ceph_assert(f);
+
+  f->open_object_section(type);
+  for (const auto& [hostname, names] : services) {
+    f->open_array_section(hostname.c_str());
+    for (const string& name : names)
+      f->dump_string(type, name);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Return the contents of 'bl' as a string. If it holds control characters
+// it is base64-encoded instead (prefixed with "Base64:" when 'show' is set);
+// 'base64' reports whether the encoding took place.
+string cleanbin(bufferlist &bl, bool &base64, bool show)
+{
+  bufferlist::iterator it;
+  for (it = bl.begin(); it != bl.end(); ++it) {
+    // iscntrl() is undefined for negative values, so never pass a plain char
+    if (iscntrl(static_cast<unsigned char>(*it)))
+      break;
+  }
+  if (it == bl.end()) {
+    base64 = false;
+    return string(bl.c_str(), bl.length());
+  }
+  bufferlist b64;
+  bl.encode_base64(b64);
+  string encoded(b64.c_str(), b64.length());
+  if (show)
+    encoded = "Base64:" + encoded;
+  base64 = true;
+  return encoded;
+}
+
+// Convenience wrapper: return 'str' as is, or "Base64:" followed by its
+// base64 encoding when the bufferlist overload decides to encode it.
+string cleanbin(string &str)
+{
+  bufferlist payload;
+  payload.append(str);
+  // the flag is required by the callee but of no interest here
+  bool was_encoded;
+  return cleanbin(payload, was_encoded, true);
+}
+
+// Format 'count' with the largest binary unit (k..E) leaving a non-zero integer part, truncated: 1536 -> "1kB".
+std::string bytes2str(uint64_t count) {
+  static const char* const units[] = {"", "k", "M", "G", "T", "P", "E"};
+  constexpr size_t last = sizeof(units) / sizeof(units[0]) - 1;
+  size_t unit = 0;
+  for (; count >= 1024 && unit < last; ++unit)
+    count >>= 10;
+  char out[128];
+  snprintf(out, sizeof out, "%" PRIu64 "%sB", count, units[unit]);
+  return std::string(out);
+}
diff --git a/src/common/valgrind.h b/src/common/valgrind.h
new file mode 100644
index 000000000..1faa9cd85
--- /dev/null
+++ b/src/common/valgrind.h
@@ -0,0 +1,19 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_VALGRIND_H
+#define CEPH_VALGRIND_H
+
+#include "acconfig.h"
+
+#if defined(HAVE_VALGRIND_HELGRIND_H) && !defined(NDEBUG)
+  #include <valgrind/helgrind.h>
+#else
+  // helgrind header unavailable or NDEBUG build: the annotations become no-ops
+  #define ANNOTATE_HAPPENS_AFTER(x)             (void)0
+  #define ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(x) (void)0
+  #define ANNOTATE_HAPPENS_BEFORE(x)            (void)0
+  #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) (void)0
+#endif
+
+#endif // CEPH_VALGRIND_H
diff --git a/src/common/version.cc b/src/common/version.cc
new file mode 100644
index 000000000..96f17863e
--- /dev/null
+++ b/src/common/version.cc
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/version.h"
+
+#include <stdlib.h>
+#include <sstream>
+
+#include "ceph_ver.h"
+#include "common/ceph_strings.h"
+
+#define _STR(x) #x
+#define STRINGIFY(x) _STR(x)
+
+// Version string; the ceph_debug_version_for_testing environment variable,
+// when set, takes precedence over the compiled-in CEPH_GIT_NICE_VER.
+const char *ceph_version_to_str()
+{
+  if (const char* override_version = getenv("ceph_debug_version_for_testing")) {
+    return override_version;
+  }
+  return CEPH_GIT_NICE_VER;
+}
+
+const char *ceph_release_to_str(void)
+{
+  return ceph_release_name(CEPH_RELEASE);  // delegates to ceph_release_name() for the compiled-in CEPH_RELEASE
+}
+
+const char *git_version_to_str(void)
+{
+  return STRINGIFY(CEPH_GIT_VER);  // expansion of CEPH_GIT_VER turned into a string literal
+}
+
+std::string const pretty_version_to_str(void)
+{
+  std::ostringstream oss;  // builds "ceph version <nice ver> (<git ver>) <release name> (<release type>)"
+  oss << "ceph version " << CEPH_GIT_NICE_VER
+      << " (" << STRINGIFY(CEPH_GIT_VER) << ") "
+      << ceph_release_name(CEPH_RELEASE)
+      << " (" << CEPH_RELEASE_TYPE << ")";
+  return oss.str();
+}
+
+const char *ceph_release_type(void)
+{
+  return CEPH_RELEASE_TYPE;  // compile-time constant
+}
diff --git a/src/common/version.h b/src/common/version.h
new file mode 100644
index 000000000..ffa6a17a5
--- /dev/null
+++ b/src/common/version.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_VERSION_H
+#define CEPH_COMMON_VERSION_H
+
+#include <string>
+
+// Return the Ceph version string (env ceph_debug_version_for_testing overrides it)
+const char *ceph_version_to_str();
+
+// Return a string with the Ceph release
+const char *ceph_release_to_str(void);
+
+// Return a string describing the git version
+const char *git_version_to_str(void);
+
+// Return a formatted string describing the ceph and git versions
+std::string const pretty_version_to_str(void);
+
+// Release type ("dev", "rc", or "stable")
+const char *ceph_release_type(void);
+
+#endif
diff --git a/src/common/weighted_shuffle.h b/src/common/weighted_shuffle.h
new file mode 100644
index 000000000..10def0a01
--- /dev/null
+++ b/src/common/weighted_shuffle.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <random>
+
+// Weighted shuffle of [first, last); the weights are permuted along with it.
+template <class RandomIt, class DistIt, class URBG>
+void weighted_shuffle(RandomIt first, RandomIt last,
+		      DistIt weight_first, DistIt weight_last,
+		      URBG &&g)
+{
+  // Loop instead of self-recursion: no stack growth, no std::move() of the caller's generator.
+  for (; first != last; ++first, ++weight_first) {
+    // pick one of the remaining elements with probability ~ its weight
+    std::discrete_distribution d{weight_first, weight_last};
+    if (auto n = d(g); n > 0) {
+      std::iter_swap(first, std::next(first, n));
+      std::iter_swap(weight_first, std::next(weight_first, n));
+    }
+  }
+}
diff --git a/src/common/win32/SubProcess.cc b/src/common/win32/SubProcess.cc
new file mode 100644
index 000000000..ce6b851e0
--- /dev/null
+++ b/src/common/win32/SubProcess.cc
@@ -0,0 +1,306 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdarg.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <iostream>
+#include <iomanip>
+
+#include "common/SubProcess.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+SubProcess::SubProcess(const char *cmd_, std_fd_op stdin_op_, std_fd_op stdout_op_, std_fd_op stderr_op_) :
+  cmd(cmd_),
+  cmd_args(),
+  stdin_op(stdin_op_),
+  stdout_op(stdout_op_),
+  stderr_op(stderr_op_),
+  stdin_pipe_out_fd(-1),  // -1 marks "no descriptor"; see close(int&)
+  stdout_pipe_in_fd(-1),
+  stderr_pipe_in_fd(-1),
+  pid(0),  // NOTE(review): proc_handle is not set here - confirm the header initializes it
+  errstr() {
+}
+
+SubProcess::~SubProcess() {
+  ceph_assert(!is_spawned());  // destroying a still-spawned process is treated as a bug
+  ceph_assert(stdin_pipe_out_fd == -1);  // all three pipe descriptors must already be closed
+  ceph_assert(stdout_pipe_in_fd == -1);
+  ceph_assert(stderr_pipe_in_fd == -1);
+}
+
+void SubProcess::add_cmd_args(const char *arg, ...) {
+  ceph_assert(!is_spawned());
+
+  // Variadic list of C strings; the caller terminates it with NULL.
+  va_list ap;
+  va_start(ap, arg);
+  add_cmd_arg(arg);
+  for (const char *next = va_arg(ap, const char*); next != NULL;
+       next = va_arg(ap, const char*))
+    add_cmd_arg(next);
+  va_end(ap);
+}
+
+void SubProcess::add_cmd_arg(const char *arg) {
+  ceph_assert(!is_spawned());
+  // arguments can only be queued before the process is spawned
+  cmd_args.push_back(arg);
+}
+
+int SubProcess::get_stdin() const {
+  ceph_assert(is_spawned());
+  ceph_assert(stdin_op == PIPE);
+  // descriptor spawn() opened on the parent's end of the child's stdin pipe
+  return stdin_pipe_out_fd;
+}
+
+int SubProcess::get_stdout() const {
+  ceph_assert(is_spawned());
+  ceph_assert(stdout_op == PIPE);
+  // descriptor spawn() opened on the parent's end of the child's stdout pipe
+  return stdout_pipe_in_fd;
+}
+
+int SubProcess::get_stderr() const {
+  ceph_assert(is_spawned());
+  ceph_assert(stderr_op == PIPE);
+  // descriptor spawn() opened on the parent's end of the child's stderr pipe
+  return stderr_pipe_in_fd;
+}
+
+void SubProcess::close(int &fd) {
+  if (fd == -1)
+    return;
+  // close the descriptor and reset it to -1 so that repeated calls are no-ops
+  ::close(fd);
+  fd = -1;
+}
+
+void SubProcess::close_stdin() {
+  ceph_assert(is_spawned());
+  ceph_assert(stdin_op == PIPE);
+  // closes the parent's end of the child's stdin pipe
+  close(stdin_pipe_out_fd);
+}
+
+void SubProcess::close_stdout() {
+  ceph_assert(is_spawned());
+  ceph_assert(stdout_op == PIPE);
+  // closes the parent's end of the child's stdout pipe
+  close(stdout_pipe_in_fd);
+}
+
+void SubProcess::close_stderr() {
+  ceph_assert(is_spawned());
+  ceph_assert(stderr_op == PIPE);
+  // closes the parent's end of the child's stderr pipe
+  close(stderr_pipe_in_fd);
+}
+
+const std::string SubProcess::err() const {
+  return errstr.str();  // everything spawn()/join() appended to errstr so far
+}
+
+SubProcessTimed::SubProcessTimed(const char *cmd, std_fd_op stdin_op,
+                 std_fd_op stdout_op, std_fd_op stderr_op,
+                 int timeout_, int sigkill_) :
+  SubProcess(cmd, stdin_op, stdout_op, stderr_op),
+  timeout(timeout_),  // seconds: spawn() multiplies it by 1000 for WaitForSingleObject()
+  sigkill(sigkill_) {  // only used to build the exit code 128 + sigkill
+}
+
+static bool timedout = false;  // also set by the waiter thread in SubProcessTimed::spawn()
+void timeout_sighandler(int sig) {
+  timedout = true;  // 'sig' is unused
+}
+
+void SubProcess::close_h(HANDLE &handle) {
+  if (handle == INVALID_HANDLE_VALUE)
+    return;
+  // close the handle and invalidate it so that repeated calls are no-ops
+  CloseHandle(handle);
+  handle = INVALID_HANDLE_VALUE;
+}
+
+// Close our pipe ends, wait for the child to exit and release its handle.
+// Returns the child's exit code, or -ECHILD if it could not be obtained.
+int SubProcess::join() {
+  ceph_assert(is_spawned());
+  close(stdin_pipe_out_fd);
+  close(stdout_pipe_in_fd);
+  close(stderr_pipe_in_fd);
+  int status = 0;
+  DWORD exit_code = 0;  // GetExitCodeProcess() takes a DWORD*, not an int*
+  if (WaitForSingleObject(proc_handle, INFINITE) != WAIT_FAILED) {
+    if (!GetExitCodeProcess(proc_handle, &exit_code)) {
+      errstr << cmd << ": Could not get exit code: " << pid
+             << ". Error code: " << GetLastError();
+      status = -ECHILD;
+    } else if (exit_code) {
+      status = static_cast<int>(exit_code);
+      errstr << cmd << ": exit status: " << status;
+    }
+  } else {
+    errstr << cmd << ": Waiting for child process failed: " << pid
+           << ". Error code: " << GetLastError();
+    status = -ECHILD;
+  }
+  close_h(proc_handle);
+  pid = 0;
+  return status;
+}
+
+void SubProcess::kill(int signo) const {
+  ceph_assert(is_spawned());
+  ceph_assert(TerminateProcess(proc_handle, 128 + SIGTERM));  // 'signo' is unused: exit code is always 128 + SIGTERM
+}
+
+// Start 'cmd' with the queued arguments, wiring each standard stream according
+// to stdin_op/stdout_op/stderr_op. Returns 0 on success, -1 on failure (see err()).
+int SubProcess::spawn() {
+  std::ostringstream cmdline;
+  cmdline << cmd;
+  for (auto& arg : cmd_args) {
+    cmdline << " " << std::quoted(arg);
+  }
+
+  STARTUPINFO si = {0};
+  PROCESS_INFORMATION pi = {0};
+  SECURITY_ATTRIBUTES sa = {0};
+  sa.nLength = sizeof(SECURITY_ATTRIBUTES);
+  sa.bInheritHandle = TRUE;
+  sa.lpSecurityDescriptor = NULL;
+
+  HANDLE stdin_r = INVALID_HANDLE_VALUE, stdin_w = INVALID_HANDLE_VALUE,
+         stdout_r = INVALID_HANDLE_VALUE, stdout_w = INVALID_HANDLE_VALUE,
+         stderr_r = INVALID_HANDLE_VALUE, stderr_w = INVALID_HANDLE_VALUE;
+
+  if ((stdin_op == PIPE && !CreatePipe(&stdin_r, &stdin_w, &sa, 0)) ||
+      (stdout_op == PIPE && !CreatePipe(&stdout_r, &stdout_w, &sa, 0)) ||
+      (stderr_op == PIPE && !CreatePipe(&stderr_r, &stderr_w, &sa, 0))) {
+    errstr << cmd << ": CreatePipe failed: " << GetLastError();
+    goto fail;  // not a plain return: pipes created before the failing one must be closed
+  }
+
+  // The following handles will be used by the parent process and
+  // must be marked as non-inheritable.
+  if ((stdin_op == PIPE && !SetHandleInformation(stdin_w, HANDLE_FLAG_INHERIT, 0)) ||
+      (stdout_op == PIPE && !SetHandleInformation(stdout_r, HANDLE_FLAG_INHERIT, 0)) ||
+      (stderr_op == PIPE && !SetHandleInformation(stderr_r, HANDLE_FLAG_INHERIT, 0))) {
+    errstr << cmd << ": SetHandleInformation failed: "
+           << GetLastError();
+    goto fail;
+  }
+
+  si.cb = sizeof(STARTUPINFO);
+  si.hStdInput = stdin_op == KEEP ? GetStdHandle(STD_INPUT_HANDLE) : stdin_r;
+  si.hStdOutput = stdout_op == KEEP ? GetStdHandle(STD_OUTPUT_HANDLE) : stdout_w;
+  si.hStdError = stderr_op == KEEP ? GetStdHandle(STD_ERROR_HANDLE) : stderr_w;
+  si.dwFlags |= STARTF_USESTDHANDLES;
+
+  stdin_pipe_out_fd = stdin_op == PIPE ? _open_osfhandle((intptr_t)stdin_w, 0) : -1;
+  stdout_pipe_in_fd = stdout_op == PIPE ? _open_osfhandle((intptr_t)stdout_r, _O_RDONLY) : - 1;
+  stderr_pipe_in_fd = stderr_op == PIPE ? _open_osfhandle((intptr_t)stderr_r, _O_RDONLY) : -1;
+  // Every fd that got opened now owns its handle: forget those handles so
+  // that the failure path below cannot close them a second time.
+  if (stdin_pipe_out_fd != -1) stdin_w = INVALID_HANDLE_VALUE;
+  if (stdout_pipe_in_fd != -1) stdout_r = INVALID_HANDLE_VALUE;
+  if (stderr_pipe_in_fd != -1) stderr_r = INVALID_HANDLE_VALUE;
+
+  if (stdin_op == PIPE && stdin_pipe_out_fd == -1 ||
+      stdout_op == PIPE && stdout_pipe_in_fd == -1 ||
+      stderr_op == PIPE && stderr_pipe_in_fd == -1) {
+    errstr << cmd << ": _open_osfhandle failed: " << GetLastError();
+    goto fail;
+  }
+
+  if (!CreateProcess(
+      NULL, const_cast<char*>(cmdline.str().c_str()),
+      NULL, NULL, /* No special security attributes */
+      1, /* Inherit handles marked as inheritable */
+      0, /* No special flags */
+      NULL, /* Use the same environment variables */
+      NULL, /* use the same cwd */
+      &si, &pi)) {
+    errstr << cmd << ": CreateProcess failed: " << GetLastError();
+    goto fail;
+  }
+
+  proc_handle = pi.hProcess;
+  pid = GetProcessId(proc_handle);
+  if (!pid) {
+    errstr << cmd << ": Could not get child process id.";
+    goto fail;
+  }
+  // The following are used by the subprocess (close_h skips handles never created).
+  close_h(stdin_r);
+  close_h(stdout_w);
+  close_h(stderr_w);
+  CloseHandle(pi.hThread);
+  return 0;
+
+fail:
+  // fd copies
+  close(stdin_pipe_out_fd);
+  close(stdout_pipe_in_fd);
+  close(stderr_pipe_in_fd);
+  // the original handles
+  close_h(stdin_r);
+  close_h(stdin_w);
+  close_h(stdout_r);
+  close_h(stdout_w);
+  close_h(stderr_r);
+  close_h(stderr_w);
+  // We may consider mapping some of the Windows errors.
+  return -1;
+}
+
+void SubProcess::exec() {  // empty in this win32 implementation
+}
+
+// Spawn the child; with timeout > 0 also start a watcher thread that terminates it on expiry.
+int SubProcessTimed::spawn() {
+  if (auto ret = SubProcess::spawn(); ret < 0) {
+    return ret;
+  }
+  if (timeout > 0) {
+    waiter = std::thread([&](){  // NOTE(review): captures 'this' by reference; relies on join() joining 'waiter'
+      DWORD wait_status = WaitForSingleObject(proc_handle, timeout * 1000);
+      ceph_assert(wait_status != WAIT_FAILED);
+      if (wait_status == WAIT_TIMEOUT) {
+        // 128 + sigkill is just the return code, which is expected by
+        // the unit tests and possibly by other code. We can't pick a
+        // termination signal unless we use window events.
+        ceph_assert(TerminateProcess(proc_handle, 128 + sigkill));
+        timedout = 1;
+      }
+    });
+  }
+  return 0;
+}
+
+// Let the timeout watcher thread (if any) finish, then reap the child.
+int SubProcessTimed::join() {
+  ceph_assert(is_spawned());
+
+  if (waiter.joinable())
+    waiter.join();
+
+  return SubProcess::join();
+}
+
+void SubProcessTimed::exec() {  // empty in this win32 implementation
+}
diff --git a/src/common/win32/blkdev.cc b/src/common/win32/blkdev.cc
new file mode 100644
index 000000000..3714441e7
--- /dev/null
+++ b/src/common/win32/blkdev.cc
@@ -0,0 +1,122 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "common/blkdev.h"
+
+int get_device_by_path(const char *path, char* partition, char* device,
+               size_t max)
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: output buffers are left untouched
+}
+
+
+BlkDev::BlkDev(int f)
+  : fd(f)  // only the descriptor is recorded
+{}
+
+BlkDev::BlkDev(const std::string& devname)
+  : devname(devname)  // only the name is recorded; 'fd' is not set by this constructor
+{}
+
+int BlkDev::get_devid(dev_t *id) const
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: *id is not written
+}
+
+const char *BlkDev::sysfsdir() const {
+  assert(false);  // Should never be called on Windows
+  return "";  // reached only if the assert above is compiled out
+}
+
+int BlkDev::dev(char *dev, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: 'dev' is not written
+}
+
+int BlkDev::get_size(int64_t *psize) const
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: *psize is not written
+}
+
+bool BlkDev::support_discard() const
+{
+  return false;  // consistent with discard() below, which always fails
+}
+
+int BlkDev::discard(int64_t offset, int64_t len) const
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: nothing is discarded
+}
+
+bool BlkDev::is_rotational() const
+{
+  return false;  // hard-coded in the win32 build: the device is not inspected
+}
+
+int BlkDev::model(char *model, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: 'model' is not written
+}
+
+int BlkDev::serial(char *serial, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: 'serial' is not written
+}
+
+int BlkDev::partition(char *partition, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: 'partition' is not written
+}
+
+int BlkDev::wholedisk(char *wd, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: 'wd' is not written
+}
+
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{  // no-op in the win32 build: 'ls' is left untouched
+}
+
+void get_raw_devices(const std::string& in,
+             std::set<std::string> *ls)
+{  // no-op in the win32 build: 'ls' is left untouched
+}
+
+std::string get_device_id(const std::string& devname,
+              std::string *err)
+{
+  if (err) {  // 'err' is optional
+    *err = "not implemented";
+  }
+  return std::string();  // always an empty id in the win32 build
+}
+
+int block_device_run_smartctl(const char *device, int timeout,
+                  std::string *result)
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: nothing is run, 'result' is not written
+}
+
+int block_device_get_metrics(const std::string& devname, int timeout,
+                             json_spirit::mValue *result)
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: 'result' is not written
+}
+
+int block_device_run_nvme(const char *device, const char *vendor, int timeout,
+            std::string *result)
+{
+  return -EOPNOTSUPP;  // stub in the win32 build: nothing is run, 'result' is not written
+}
diff --git a/src/common/win32/dlfcn.cc b/src/common/win32/dlfcn.cc
new file mode 100644
index 000000000..329d14677
--- /dev/null
+++ b/src/common/win32/dlfcn.cc
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sstream>
+#include <windows.h>
+
+#include "common/errno.h"
+#include "include/dlfcn_compat.h"
+
+
+void* dlopen(const char *filename, int flags) {
+  return LoadLibrary(filename);  // 'flags' is not used by this shim
+}
+
+int dlclose(void* handle) {
+  //FreeLibrary returns 0 on error, as opposed to dlclose.
+  return !FreeLibrary(static_cast<HMODULE>(handle));  // void* -> HMODULE needs an explicit cast in C++
+}
+
+void* dlsym(void* handle, const char* symbol) {
+  return (void*)GetProcAddress(static_cast<HMODULE>(handle), symbol);  // explicit cast: void* -> HMODULE is not implicit in C++
+}
+
+dl_errmsg_t dlerror() {
+  return win32_lasterror_str();  // delegates to the project's win32_lasterror_str() helper
+}
+
diff --git a/src/common/win32/dns_resolve.cc b/src/common/win32/dns_resolve.cc
new file mode 100644
index 000000000..6901399a7
--- /dev/null
+++ b/src/common/win32/dns_resolve.cc
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/scope_guard.h"
+#include "common/dns_resolve.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_
+
+using namespace std;
+
+namespace ceph {
+
+int ResolvHWrapper::res_query(const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return -1;  // stub: always reports failure in this win32 build
+}
+
+int ResolvHWrapper::res_search(const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return -1;  // stub: always reports failure in this win32 build
+}
+
+DNSResolver::~DNSResolver()
+{
+  delete resolv_h;  // the wrapper object is released here
+}
+
+int DNSResolver::resolve_cname(CephContext *cct, const string& hostname,
+    string *cname, bool *found)
+{
+  return -ENOTSUP;  // stub: 'cname' and 'found' are not written
+}
+
+int DNSResolver::resolve_ip_addr(CephContext *cct, const string& hostname,
+    entity_addr_t *addr)
+{
+  return -ENOTSUP;  // stub: 'addr' is not written
+}
+
+int DNSResolver::resolve_srv_hosts(CephContext *cct, const string& service_name,
+    const SRV_Protocol trans_protocol,
+    map<string, DNSResolver::Record> *srv_hosts)
+{
+  return this->resolve_srv_hosts(cct, service_name, trans_protocol, "", srv_hosts);  // same lookup with an empty domain
+}
+
+int DNSResolver::resolve_srv_hosts(CephContext *cct, const string& service_name,
+    const SRV_Protocol trans_protocol, const string& domain,
+    map<string, DNSResolver::Record> *srv_hosts)
+{
+  return -ENOTSUP;  // stub: 'srv_hosts' is not written
+}
+
+}
diff --git a/src/common/win32/errno.cc b/src/common/win32/errno.cc
new file mode 100644
index 000000000..d0942fac9
--- /dev/null
+++ b/src/common/win32/errno.cc
@@ -0,0 +1,652 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "include/int_types.h"
+#include <ntdef.h>
+#include <ntstatus.h>
+
+#include "include/compat.h"
+#include "include/int_types.h"
+#include "include/types.h"
+#include "include/fs_types.h"
+
+// We're only converting errors defined in errno.h, not standard Windows
+// system error codes that are usually retrieved using GetLastError().
+// TODO: consider converting WinSock2 (WSA*) error codes, which are quite
+// similar to the errno.h ones.
+
+__u32 ceph_to_hostos_errno_unsigned(__u32 r)
+{
+  // Linux/Ceph errno -> host errno. An array as in freebsd_errno.cc might be
+  // more readable, but some of the E* values (defined by Boost) are large.
+  switch(r) {
+    case 1: return EPERM;
+    case 2: return ENOENT;
+    case 3: return ESRCH;
+    case 4: return EINTR;
+    case 5: return EIO;
+    case 6: return ENXIO;
+    case 7: return E2BIG;
+    case 8: return ENOEXEC;
+    case 9: return EBADF;
+    case 10: return ECHILD;
+    // 11 is both EAGAIN and EWOULDBLOCK on Linux
+    case 11: return EAGAIN;
+    case 12: return ENOMEM;
+    case 13: return EACCES;
+    case 14: return EFAULT;
+    case 15: return ENOTBLK;
+    case 16: return EBUSY;
+    case 17: return EEXIST;
+    case 18: return EXDEV;
+    case 19: return ENODEV;
+    case 20: return ENOTDIR;
+    case 21: return EISDIR;
+    case 22: return EINVAL;
+    case 23: return ENFILE;
+    case 24: return EMFILE;
+    case 25: return ENOTTY;
+    case 26: return ETXTBSY;
+    case 27: return EFBIG;
+    case 28: return ENOSPC;
+    case 29: return ESPIPE;
+    case 30: return EROFS;
+    case 31: return EMLINK;
+    case 32: return EPIPE;
+    case 33: return EDOM;
+    case 34: return ERANGE;
+    // 35 is EDEADLK on Linux, a synonym of EDEADLOCK
+    case 35: return EDEADLOCK;
+    case 36: return ENAMETOOLONG;
+    case 37: return ENOLCK;
+    case 38: return ENOSYS;
+    case 39: return ENOTEMPTY;
+    case 40: return ELOOP;
+    case 42: return ENOMSG;  // there is no case for 41
+    case 43: return EIDRM;
+    case 44: return ECHRNG;
+    case 45: return EL2NSYNC;
+    case 46: return EL3HLT;
+    case 47: return EL3RST;
+    case 48: return ELNRNG;
+    case 49: return EUNATCH;
+    case 50: return ENOCSI;
+    case 51: return EL2HLT;
+    case 52: return EBADE;
+    case 53: return EBADR;
+    case 54: return EXFULL;
+    case 55: return ENOANO;
+    case 56: return EBADRQC;
+    case 57: return EBADSLT;
+    case 59: return EBFONT;  // there is no case for 58
+    case 60: return ENOSTR;
+    case 61: return ENODATA;
+    case 62: return ETIME;
+    case 63: return ENOSR;
+    case 64: return ENONET;
+    case 65: return ENOPKG;
+    case 66: return EREMOTE;
+    case 67: return ENOLINK;
+    case 68: return EADV;
+    case 69: return ESRMNT;
+    case 70: return ECOMM;
+    case 71: return EPROTO;
+    case 72: return EMULTIHOP;
+    case 73: return EDOTDOT;
+    case 74: return EBADMSG;
+    case 75: return EOVERFLOW;
+    case 76: return ENOTUNIQ;
+    case 77: return EBADFD;
+    case 78: return EREMCHG;
+    case 79: return ELIBACC;
+    case 80: return ELIBBAD;
+    case 81: return ELIBSCN;
+    case 82: return ELIBMAX;
+    case 83: return ELIBEXEC;
+    case 84: return EILSEQ;
+    case 85: return ERESTART;
+    case 86: return ESTRPIPE;
+    case 87: return EUSERS;
+    case 88: return ENOTSOCK;
+    case 89: return EDESTADDRREQ;
+    case 90: return EMSGSIZE;
+    case 91: return EPROTOTYPE;
+    case 92: return ENOPROTOOPT;
+    case 93: return EPROTONOSUPPORT;
+    case 94: return ESOCKTNOSUPPORT;
+    // 95 is also ENOTSUP on Linux
+    case 95: return EOPNOTSUPP;
+    case 96: return EPFNOSUPPORT;
+    case 97: return EAFNOSUPPORT;
+    case 98: return EADDRINUSE;
+    case 99: return EADDRNOTAVAIL;
+    case 100: return ENETDOWN;
+    case 101: return ENETUNREACH;
+    case 102: return ENETRESET;
+    case 103: return ECONNABORTED;
+    case 104: return ECONNRESET;
+    case 105: return ENOBUFS;
+    case 106: return EISCONN;
+    case 107: return ENOTCONN;
+    case 108: return ESHUTDOWN;
+    case 109: return ETOOMANYREFS;
+    case 110: return ETIMEDOUT;
+    case 111: return ECONNREFUSED;
+    case 112: return EHOSTDOWN;
+    case 113: return EHOSTUNREACH;
+    case 114: return EALREADY;
+    case 115: return EINPROGRESS;
+    case 116: return ESTALE;
+    case 117: return EUCLEAN;
+    case 118: return ENOTNAM;
+    case 119: return ENAVAIL;
+    case 120: return EISNAM;
+    case 121: return EREMOTEIO;
+    case 122: return EDQUOT;
+    case 123: return ENOMEDIUM;
+    case 124: return EMEDIUMTYPE;
+    case 125: return ECANCELED;
+    case 126: return ENOKEY;
+    case 127: return EKEYEXPIRED;
+    case 128: return EKEYREVOKED;
+    case 129: return EKEYREJECTED;
+    case 130: return EOWNERDEAD;
+    case 131: return ENOTRECOVERABLE;
+    case 132: return ERFKILL;
+    case 133: return EHWPOISON;
+    default:
+      return r;  // values without a case (0 included) pass through unchanged
+  }
+}
+
+__u32 hostos_to_ceph_errno_unsigned(__u32 r) {
+  // Host (Windows) errno -> Linux errno number; takes a non-negative code.
+  switch(r) {
+    case EPERM: return 1;
+    case ENOENT: return 2;
+    case ESRCH: return 3;
+    case EINTR: return 4;
+    case EIO: return 5;
+    case ENXIO: return 6;
+    case E2BIG: return 7;
+    case ENOEXEC: return 8;
+    case EBADF: return 9;
+    case ECHILD: return 10;
+    case EAGAIN: return 11;
+    case EWOULDBLOCK: return 11;  // folded onto the same Linux value as EAGAIN
+    case ENOMEM: return 12;
+    case EACCES: return 13;
+    case EFAULT: return 14;
+    case ENOTBLK: return 15;
+    case EBUSY: return 16;
+    case EEXIST: return 17;
+    case EXDEV: return 18;
+    case ENODEV: return 19;
+    case ENOTDIR: return 20;
+    case EISDIR: return 21;
+    case EINVAL: return 22;
+    case ENFILE: return 23;
+    case EMFILE: return 24;
+    case ENOTTY: return 25;
+    case ETXTBSY: return 26;
+    case EFBIG: return 27;
+    case ENOSPC: return 28;
+    case ESPIPE: return 29;
+    case EROFS: return 30;
+    case EMLINK: return 31;
+    case EPIPE: return 32;
+    case EDOM: return 33;
+    case ERANGE: return 34;
+    // EDEADLK is the same as EDEADLOCK, so its case stays commented out:
+    // case EDEADLK: return 35;
+    case EDEADLOCK: return 35;
+    case ENAMETOOLONG: return 36;
+    case ENOLCK: return 37;
+    case ENOSYS: return 38;
+    case ENOTEMPTY: return 39;
+    case ELOOP: return 40;
+    case ENOMSG: return 42;
+    case EIDRM: return 43;
+    case ECHRNG: return 44;
+    case EL2NSYNC: return 45;
+    case EL3HLT: return 46;
+    case EL3RST: return 47;
+    case ELNRNG: return 48;
+    case EUNATCH: return 49;
+    case ENOCSI: return 50;
+    case EL2HLT: return 51;
+    case EBADE: return 52;
+    case EBADR: return 53;
+    case EXFULL: return 54;
+    case ENOANO: return 55;
+    case EBADRQC: return 56;
+    case EBADSLT: return 57;
+    case EBFONT: return 59;
+    case ENOSTR: return 60;
+    case ENODATA: return 61;
+    case ETIME: return 62;
+    case ENOSR: return 63;
+    case ENONET: return 64;
+    case ENOPKG: return 65;
+    case EREMOTE: return 66;
+    case ENOLINK: return 67;
+    case EADV: return 68;
+    case ESRMNT: return 69;
+    case ECOMM: return 70;
+    case EPROTO: return 71;
+    case EMULTIHOP: return 72;
+    case EDOTDOT: return 73;
+    case EBADMSG: return 74;
+    case EOVERFLOW: return 75;
+    case ENOTUNIQ: return 76;
+    case EBADFD: return 77;
+    case EREMCHG: return 78;
+    case ELIBACC: return 79;
+    case ELIBBAD: return 80;
+    case ELIBSCN: return 81;
+    case ELIBMAX: return 82;
+    case ELIBEXEC: return 83;
+    case EILSEQ: return 84;
+    // compat.h defines ERESTART as EINTR, so its case stays commented out:
+    // case ERESTART: return 85;
+    case ESTRPIPE: return 86;
+    case EUSERS: return 87;
+    case ENOTSOCK: return 88;
+    case EDESTADDRREQ: return 89;
+    case EMSGSIZE: return 90;
+    case EPROTOTYPE: return 91;
+    case ENOPROTOOPT: return 92;
+    case EPROTONOSUPPORT: return 93;
+    case ESOCKTNOSUPPORT: return 94;
+    case EOPNOTSUPP: return 95;
+    case ENOTSUP: return 95;  // folded onto the same Linux value as EOPNOTSUPP
+    case EPFNOSUPPORT: return 96;
+    case EAFNOSUPPORT: return 97;
+    case EADDRINUSE: return 98;
+    case EADDRNOTAVAIL: return 99;
+    case ENETDOWN: return 100;
+    case ENETUNREACH: return 101;
+    case ENETRESET: return 102;
+    case ECONNABORTED: return 103;
+    case ECONNRESET: return 104;
+    case ENOBUFS: return 105;
+    case EISCONN: return 106;
+    case ENOTCONN: return 107;
+    case ESHUTDOWN: return 108;
+    case ETOOMANYREFS: return 109;
+    case ETIMEDOUT: return 110;
+    case ECONNREFUSED: return 111;
+    case EHOSTDOWN: return 112;
+    case EHOSTUNREACH: return 113;
+    case EALREADY: return 114;
+    case EINPROGRESS: return 115;
+    case ESTALE: return 116;
+    case EUCLEAN: return 117;
+    case ENOTNAM: return 118;
+    case ENAVAIL: return 119;
+    case EISNAM: return 120;
+    case EREMOTEIO: return 121;
+    case EDQUOT: return 122;
+    case ENOMEDIUM: return 123;
+    case EMEDIUMTYPE: return 124;
+    case ECANCELED: return 125;
+    case ENOKEY: return 126;
+    case EKEYEXPIRED: return 127;
+    case EKEYREVOKED: return 128;
+    case EKEYREJECTED: return 129;
+    case EOWNERDEAD: return 130;
+    case ENOTRECOVERABLE: return 131;
+    case ERFKILL: return 132;
+    case EHWPOISON: return 133;
+    default:
+      return r;  // values without a case (0 included) pass through unchanged
+ }
+}
+
+__s32 wsae_to_errno_unsigned(__s32 r)
+{  // WinSock WSA* code -> errno.h E* value; wsae_to_errno() passes abs(r) in
+  switch(r) {
+    case WSAEINTR: return EINTR;
+    case WSAEBADF: return EBADF;
+    case WSAEACCES: return EACCES;
+    case WSAEFAULT: return EFAULT;
+    case WSAEINVAL: return EINVAL;
+    case WSAEMFILE: return EMFILE;
+    // Linux defines EWOULDBLOCK as EAGAIN, but the Windows headers do not.
+    // Since all ceph code uses EAGAIN instead of EWOULDBLOCK, we'll do
+    // the same here.
+    case WSAEWOULDBLOCK: return EAGAIN;
+    // Some functions (e.g. connect) can return WSAEWOULDBLOCK instead of
+    // EINPROGRESS.
+    case WSAEINPROGRESS: return EINPROGRESS;
+    case WSAEALREADY: return EALREADY;
+    case WSAENOTSOCK: return ENOTSOCK;
+    case WSAEDESTADDRREQ: return EDESTADDRREQ;
+    case WSAEMSGSIZE: return EMSGSIZE;
+    case WSAEPROTOTYPE: return EPROTOTYPE;
+    case WSAENOPROTOOPT: return ENOPROTOOPT;
+    case WSAEPROTONOSUPPORT: return EPROTONOSUPPORT;
+    case WSAESOCKTNOSUPPORT: return ESOCKTNOSUPPORT;
+    case WSAEOPNOTSUPP: return EOPNOTSUPP;
+    case WSAEPFNOSUPPORT: return EPFNOSUPPORT;
+    case WSAEAFNOSUPPORT: return EAFNOSUPPORT;
+    case WSAEADDRINUSE: return EADDRINUSE;
+    case WSAEADDRNOTAVAIL: return EADDRNOTAVAIL;
+    case WSAENETDOWN: return ENETDOWN;
+    case WSAENETUNREACH: return ENETUNREACH;
+    case WSAENETRESET: return ENETRESET;
+    case WSAECONNABORTED: return ECONNABORTED;
+    case WSAECONNRESET: return ECONNRESET;
+    case WSAENOBUFS: return ENOBUFS;
+    case WSAEISCONN: return EISCONN;
+    case WSAENOTCONN: return ENOTCONN;
+    case WSAESHUTDOWN: return ESHUTDOWN;
+    case WSAETOOMANYREFS: return ETOOMANYREFS;
+    case WSAETIMEDOUT: return ETIMEDOUT;
+    case WSAECONNREFUSED: return ECONNREFUSED;
+    case WSAELOOP: return ELOOP;
+    case WSAENAMETOOLONG: return ENAMETOOLONG;
+    case WSAEHOSTDOWN: return EHOSTDOWN;
+    case WSAEHOSTUNREACH: return EHOSTUNREACH;
+    case WSAENOTEMPTY: return ENOTEMPTY;
+    // case WSAEPROCLIM
+    case WSAEUSERS: return EUSERS;
+    case WSAEDQUOT: return EDQUOT;
+    case WSAESTALE: return ESTALE;
+    case WSAEREMOTE: return EREMOTE;
+    // case WSASYSNOTREADY
+    // case WSAVERNOTSUPPORTED
+    // case WSANOTINITIALISED
+    case WSAEDISCON: return ESHUTDOWN;
+    // case WSAENOMORE
+    case WSAECANCELLED: return ECANCELED;
+    // We might return EINVAL, but it's probably better if we propagate the
+    // original error code here.
+    // case WSAEINVALIDPROCTABLE
+    // case WSAEINVALIDPROVIDER
+    // case WSAEPROVIDERFAILEDINIT
+    // case WSASYSCALLFAILURE
+    // case WSASERVICE_NOT_FOUND:
+    // case WSATYPE_NOT_FOUND:
+    // case WSA_E_NO_MORE:
+    case WSA_E_CANCELLED: return ECANCELED;
+    case WSAEREFUSED: return ECONNREFUSED;
+    case WSAHOST_NOT_FOUND: return EHOSTUNREACH;
+    case WSATRY_AGAIN: return EAGAIN;
+    // case WSANO_RECOVERY
+    // case WSANO_DATA:
+    default: return r;  // the commented-out codes above end up here, unchanged
+  }
+}
+
+// Translates a Linux/Ceph errno to the host's value, keeping the sign of r.
+__s32 ceph_to_hostos_errno(__s32 r)
+{
+  const __u32 magnitude = ceph_to_hostos_errno_unsigned(abs(r));
+  return (r < 0) ? magnitude * -1 : magnitude;
+}
+
+// Translates a host OS errno to the Linux/Ceph value, keeping the sign of r.
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  const __u32 magnitude = hostos_to_ceph_errno_unsigned(abs(r));
+  return (r < 0) ? magnitude * -1 : magnitude;
+}
+
+__s32 wsae_to_errno(__s32 r)
+{  // sign-preserving wrapper around wsae_to_errno_unsigned()
+  const __s32 magnitude = wsae_to_errno_unsigned(abs(r));
+  return (r < 0) ? magnitude * -1 : magnitude;
+}
+
+__u32 errno_to_ntstatus(__s32 r) {
+  // errno -> NTSTATUS
+  // In some cases, there might be more than one applicable NTSTATUS
+  // value or there might be none. Certain values can be overridden
+  // when the caller (or whoever is supposed to handle the error) is
+  // expecting a different NTSTATUS value.
+  r = abs(r);  // negative and positive errno values are treated alike
+
+  switch(r) {
+    case 0: return 0;
+    case EPERM: return STATUS_ACCESS_DENIED;
+    case ENOENT: return STATUS_OBJECT_NAME_NOT_FOUND;
+    case ESRCH: return STATUS_NOT_FOUND;
+    case EINTR: return STATUS_RETRY;
+    case EIO: return STATUS_DATA_ERROR;
+    case ENXIO: return STATUS_NOT_FOUND;
+    case E2BIG: return STATUS_FILE_TOO_LARGE;
+    case ENOEXEC: return STATUS_ACCESS_DENIED;
+    case EBADF: return STATUS_INVALID_HANDLE;
+    case ECHILD: return STATUS_INTERNAL_ERROR;
+    case EAGAIN: return STATUS_RETRY;
+    case EWOULDBLOCK: return STATUS_RETRY;
+    case ENOMEM: return STATUS_NO_MEMORY;
+    case EACCES: return STATUS_ACCESS_DENIED;
+    case EFAULT: return STATUS_INVALID_ADDRESS;
+    case ENOTBLK: return STATUS_BAD_DEVICE_TYPE;
+    case EBUSY: return STATUS_DEVICE_BUSY;
+    case EEXIST: return STATUS_OBJECT_NAME_COLLISION;
+    case EXDEV: return STATUS_NOT_SAME_DEVICE;
+    case ENODEV: return STATUS_SYSTEM_DEVICE_NOT_FOUND;
+    case ENOTDIR: return STATUS_NOT_A_DIRECTORY;
+    case EISDIR: return STATUS_FILE_IS_A_DIRECTORY;
+    case EINVAL: return STATUS_INVALID_PARAMETER;
+    case ENFILE: return STATUS_TOO_MANY_OPENED_FILES;
+    case EMFILE: return STATUS_TOO_MANY_OPENED_FILES;
+    case ENOTTY: return STATUS_INVALID_PARAMETER;
+    case ETXTBSY: return STATUS_DEVICE_BUSY;
+    case EFBIG: return STATUS_FILE_TOO_LARGE;
+    case ENOSPC: return STATUS_DISK_FULL;
+    case ESPIPE: return STATUS_INVALID_PARAMETER;
+    case EROFS: return STATUS_MEDIA_WRITE_PROTECTED;
+    case EMLINK: return STATUS_TOO_MANY_LINKS;
+    case EPIPE: return STATUS_PIPE_BROKEN;
+    case EDOM: return STATUS_INVALID_PARAMETER;
+    case ERANGE: return STATUS_INVALID_PARAMETER;
+    // EDEADLK is the same as EDEADLOCK, so its case stays commented out:
+    // case EDEADLK: return STATUS_POSSIBLE_DEADLOCK;
+    case EDEADLOCK: return STATUS_POSSIBLE_DEADLOCK;
+    case ENAMETOOLONG: return STATUS_NAME_TOO_LONG;
+    case ENOLCK: return STATUS_NOT_LOCKED;
+    case ENOSYS: return STATUS_NOT_IMPLEMENTED;
+    case ENOTEMPTY: return STATUS_DIRECTORY_NOT_EMPTY;
+    case ELOOP: return STATUS_TOO_MANY_LINKS;
+    case ENOMSG: return STATUS_MESSAGE_NOT_FOUND;
+    case EIDRM: return STATUS_INVALID_PARAMETER;
+    case ECHRNG: return STATUS_INVALID_PARAMETER;
+    case EL2NSYNC: return STATUS_INTERNAL_ERROR;
+    case EL3HLT: return STATUS_INTERNAL_ERROR;
+    case EL3RST: return STATUS_INTERNAL_ERROR;
+    case ELNRNG: return STATUS_INTERNAL_ERROR;
+    case EUNATCH: return STATUS_INTERNAL_ERROR;
+    case ENOCSI: return STATUS_INTERNAL_ERROR;
+    case EL2HLT: return STATUS_INTERNAL_ERROR;
+    case EBADE: return STATUS_INTERNAL_ERROR;
+    case EBADR: return STATUS_INVALID_HANDLE;
+    case EXFULL: return STATUS_DISK_FULL;
+    case ENOANO: return STATUS_INTERNAL_ERROR;
+    case EBADRQC: return STATUS_INVALID_PARAMETER;
+    case EBADSLT: return STATUS_INVALID_PARAMETER;
+    case EBFONT: return STATUS_INVALID_PARAMETER;
+    case ENOSTR: return STATUS_INVALID_PARAMETER;
+    case ENODATA: return STATUS_NOT_FOUND;
+    case ETIME: return STATUS_TIMEOUT;
+    case ENOSR: return STATUS_INSUFFICIENT_RESOURCES;
+    case ENONET: return STATUS_NETWORK_UNREACHABLE;
+    case ENOPKG: return STATUS_NO_SUCH_PACKAGE;
+    case EREMOTE: return STATUS_INVALID_PARAMETER;
+    case ENOLINK: return STATUS_INTERNAL_ERROR;
+    case EADV: return STATUS_INTERNAL_ERROR;
+    case ESRMNT: return STATUS_INTERNAL_ERROR;
+    case ECOMM: return STATUS_INTERNAL_ERROR;
+    case EPROTO: return STATUS_PROTOCOL_NOT_SUPPORTED;
+    case EMULTIHOP: return STATUS_INTERNAL_ERROR;
+    case EDOTDOT: return STATUS_INTERNAL_ERROR;
+    case EBADMSG: return STATUS_INVALID_PARAMETER;
+    case EOVERFLOW: return STATUS_BUFFER_OVERFLOW;
+    case ENOTUNIQ: return STATUS_DUPLICATE_NAME;
+    case EBADFD: return STATUS_INVALID_HANDLE;
+    case EREMCHG: return STATUS_FILE_RENAMED;
+    case ELIBACC: return STATUS_DLL_NOT_FOUND;
+    case ELIBBAD: return STATUS_BAD_DLL_ENTRYPOINT;
+    case ELIBSCN: return STATUS_BAD_DLL_ENTRYPOINT;
+    case ELIBMAX: return STATUS_TOO_MANY_OPENED_FILES;
+    case ELIBEXEC: return STATUS_INVALID_PARAMETER;
+    case EILSEQ: return STATUS_INVALID_PARAMETER;
+    // compat.h defines ERESTART as EINTR, so its case stays commented out:
+    // case ERESTART: return STATUS_RETRY;
+    case ESTRPIPE: return STATUS_RETRY;
+    case EUSERS: return STATUS_TOO_MANY_SIDS;
+    case ENOTSOCK: return STATUS_INVALID_HANDLE;
+    case EDESTADDRREQ: return STATUS_INVALID_PARAMETER;
+    case EMSGSIZE: return STATUS_BUFFER_OVERFLOW;
+    case EPROTOTYPE: return STATUS_INVALID_PARAMETER;
+    case ENOPROTOOPT: return STATUS_PROTOCOL_NOT_SUPPORTED;
+    case EPROTONOSUPPORT: return STATUS_PROTOCOL_NOT_SUPPORTED;
+    case ESOCKTNOSUPPORT: return STATUS_NOT_SUPPORTED;
+    case EOPNOTSUPP: return STATUS_NOT_SUPPORTED;
+    case ENOTSUP: return STATUS_NOT_SUPPORTED;
+    case EPFNOSUPPORT: return STATUS_PROTOCOL_NOT_SUPPORTED;
+    case EAFNOSUPPORT: return STATUS_NOT_SUPPORTED;
+    case EADDRINUSE: return STATUS_ADDRESS_ALREADY_EXISTS;
+    case EADDRNOTAVAIL: return STATUS_INVALID_ADDRESS;
+    case ENETDOWN: return STATUS_NETWORK_UNREACHABLE;
+    case ENETUNREACH: return STATUS_NETWORK_UNREACHABLE;
+    case ENETRESET: return STATUS_CONNECTION_RESET;
+    case ECONNABORTED: return STATUS_CONNECTION_ABORTED;
+    case ECONNRESET: return STATUS_CONNECTION_DISCONNECTED;
+    case ENOBUFS: return STATUS_BUFFER_TOO_SMALL;
+    case EISCONN: return STATUS_CONNECTION_ACTIVE;
+    case ENOTCONN: return STATUS_CONNECTION_DISCONNECTED;
+    case ESHUTDOWN: return STATUS_SYSTEM_SHUTDOWN;
+    case ETOOMANYREFS: return STATUS_TOO_MANY_LINKS;
+    case ETIMEDOUT: return STATUS_TIMEOUT;
+    case ECONNREFUSED: return STATUS_CONNECTION_REFUSED;
+    case EHOSTDOWN: return STATUS_FILE_CLOSED;
+    case EHOSTUNREACH: return STATUS_HOST_UNREACHABLE;
+    case EALREADY: return STATUS_PENDING;
+    case EINPROGRESS: return STATUS_PENDING;
+    case ESTALE: return STATUS_INVALID_HANDLE;
+    case EUCLEAN: return STATUS_INVALID_PARAMETER;
+    case ENOTNAM: return STATUS_INVALID_PARAMETER;
+    case ENAVAIL: return STATUS_INVALID_PARAMETER;
+    case EISNAM: return STATUS_INVALID_PARAMETER;
+    case EREMOTEIO: return STATUS_DATA_ERROR;
+    case EDQUOT: return STATUS_QUOTA_EXCEEDED;
+    case ENOMEDIUM: return STATUS_NO_MEDIA;
+    case EMEDIUMTYPE: return STATUS_INVALID_PARAMETER;
+    case ECANCELED: return STATUS_REQUEST_CANCELED;
+    case ENOKEY: return STATUS_NO_USER_KEYS;
+    case EKEYEXPIRED: return STATUS_SMARTCARD_CERT_EXPIRED;
+    case EKEYREVOKED: return STATUS_IMAGE_CERT_REVOKED;
+    case EKEYREJECTED: return STATUS_ACCESS_DENIED;
+    case EOWNERDEAD: return STATUS_INTERNAL_ERROR;
+    case ENOTRECOVERABLE: return STATUS_INTERNAL_ERROR;
+    case ERFKILL: return STATUS_INTERNAL_ERROR;
+    case EHWPOISON: return STATUS_INTERNAL_ERROR;
+    default:
+      return STATUS_INTERNAL_ERROR;  // catch-all for values without a case
+ }
+}
+
+std::string win32_strerror(int err)
+{
+  // FORMAT_MESSAGE_ALLOCATE_BUFFER makes the system allocate the text, so
+  // (as opposed to dlerror messages) it has to be freed with LocalFree().
+  const DWORD fmt_flags = FORMAT_MESSAGE_ALLOCATE_BUFFER |
+                          FORMAT_MESSAGE_FROM_SYSTEM |
+                          FORMAT_MESSAGE_IGNORE_INSERTS;
+  LPSTR sys_text = NULL;
+  // err is passed as the message id, with language id 0 and no insert args.
+  const DWORD sys_text_len = ::FormatMessageA(
+    fmt_flags, NULL, err, 0, (LPSTR) &sys_text, 0, NULL);
+
+  // The numeric code always leads the result, wrapped in parentheses.
+  std::ostringstream formatted;
+  formatted << "(" << err << ") ";
+
+  if (sys_text_len != 0) {
+    formatted << sys_text;
+    ::LocalFree(sys_text);
+  } else {
+    // FormatMessageA produced no text for this code.
+    formatted << "Unknown error";
+  }
+
+  return formatted.str();
+}
+
+std::string win32_lasterror_str()
+{
+  // Formats whatever ::GetLastError() reports at the time of the call.
+  return win32_strerror(::GetLastError());
+}
+
+static const ceph::unordered_map<int,NTSTATUS> cephfs_errno_to_ntstatus = {  // CEPHFS_E* -> NTSTATUS
+  {CEPHFS_EBLOCKLISTED,    STATUS_SYSTEM_SHUTDOWN},
+  {CEPHFS_EPERM,           STATUS_ACCESS_DENIED},
+  {CEPHFS_ESTALE,          STATUS_INVALID_HANDLE},
+  {CEPHFS_ENOSPC,          STATUS_DISK_FULL},
+  {CEPHFS_ETIMEDOUT,       STATUS_TIMEOUT},
+  {CEPHFS_EIO,             STATUS_DATA_ERROR},
+  {CEPHFS_ENOTCONN,        STATUS_CONNECTION_DISCONNECTED},
+  {CEPHFS_EEXIST,          STATUS_OBJECT_NAME_COLLISION},
+  {CEPHFS_EINTR,           STATUS_RETRY},
+  {CEPHFS_EINVAL,          STATUS_INVALID_PARAMETER},
+  {CEPHFS_EBADF,           STATUS_INVALID_HANDLE},
+  {CEPHFS_EROFS,           STATUS_MEDIA_WRITE_PROTECTED},
+  {CEPHFS_EAGAIN,          STATUS_RETRY},
+  {CEPHFS_EACCES,          STATUS_ACCESS_DENIED},
+  {CEPHFS_ELOOP,           STATUS_TOO_MANY_LINKS},
+  {CEPHFS_EISDIR,          STATUS_FILE_IS_A_DIRECTORY},
+  {CEPHFS_ENOENT,          STATUS_OBJECT_NAME_NOT_FOUND},
+  {CEPHFS_ENOTDIR,         STATUS_NOT_A_DIRECTORY},
+  {CEPHFS_ENAMETOOLONG,    STATUS_NAME_TOO_LONG},
+  {CEPHFS_EBUSY,           STATUS_DEVICE_BUSY},
+  {CEPHFS_EDQUOT,          STATUS_QUOTA_EXCEEDED},
+  {CEPHFS_EFBIG,           STATUS_FILE_TOO_LARGE},
+  {CEPHFS_ERANGE,          STATUS_INVALID_PARAMETER},
+  {CEPHFS_ENXIO,           STATUS_NOT_FOUND},
+  {CEPHFS_ECANCELED,       STATUS_REQUEST_CANCELED},
+  {CEPHFS_ENODATA,         STATUS_NOT_FOUND},
+  {CEPHFS_EOPNOTSUPP,      STATUS_NOT_SUPPORTED},
+  {CEPHFS_EXDEV,           STATUS_NOT_SAME_DEVICE},
+  {CEPHFS_ENOMEM,          STATUS_NO_MEMORY},
+  {CEPHFS_ENOTRECOVERABLE, STATUS_INTERNAL_ERROR},
+  {CEPHFS_ENOSYS,          STATUS_NOT_IMPLEMENTED},
+  {CEPHFS_ENOTEMPTY,       STATUS_DIRECTORY_NOT_EMPTY},
+  {CEPHFS_EDEADLK,         STATUS_POSSIBLE_DEADLOCK},
+  {CEPHFS_EDOM,            STATUS_INVALID_PARAMETER},
+  {CEPHFS_EMLINK,          STATUS_TOO_MANY_LINKS},
+  {CEPHFS_ETIME,           STATUS_TIMEOUT},
+  {CEPHFS_EOLDSNAPC,       STATUS_DATA_ERROR}
+};  // looked up by cephfs_errno_to_ntstatus_map() using the absolute value
+
+__u32 cephfs_errno_to_ntstatus_map(int cephfs_errno)
+{
+  // The sign is dropped before the lookup; zero maps straight to 0.
+  const int code = abs(cephfs_errno);
+  if (!code)
+    return 0;
+
+  // Codes missing from the table collapse to STATUS_INTERNAL_ERROR.
+  const auto match = cephfs_errno_to_ntstatus.find(code);
+  return (match == cephfs_errno_to_ntstatus.end())
+      ? STATUS_INTERNAL_ERROR : match->second;
+}
diff --git a/src/common/win32/event_logging.mc b/src/common/win32/event_logging.mc
new file mode 100644
index 000000000..3d08889aa
--- /dev/null
+++ b/src/common/win32/event_logging.mc
@@ -0,0 +1,35 @@
+SeverityNames=(Success=0x0:STATUS_SEVERITY_SUCCESS
+               Informational=0x1:STATUS_SEVERITY_INFORMATIONAL
+               Warning=0x2:STATUS_SEVERITY_WARNING
+               Error=0x3:STATUS_SEVERITY_ERROR
+              )
+
+
+MessageId=0x0001
+Severity=Success
+SymbolicName=SUCCESS_EVENTMSG
+Language=English
+%1
+.
+
+MessageId=0x0002
+Severity=Informational
+SymbolicName=INFO_EVENTMSG
+Language=English
+%1
+.
+
+MessageId=0x0003
+Severity=Warning
+SymbolicName=WARN_EVENTMSG
+Language=English
+%1
+.
+
+MessageId=0x0004
+Severity=Error
+SymbolicName=ERROR_EVENTMSG
+Language=English
+%1
+.
+
diff --git a/src/common/win32/ifaddrs.cc b/src/common/win32/ifaddrs.cc
new file mode 100644
index 000000000..d7de4a5ff
--- /dev/null
+++ b/src/common/win32/ifaddrs.cc
@@ -0,0 +1,109 @@
+#include <errno.h>
+#include <winsock2.h>
+#include <wincrypt.h>
+#include <iphlpapi.h>
+#include <ws2tcpip.h>
+#include <ifaddrs.h>
+#include <stdio.h>
+
+#include "include/compat.h"
+
+int getifaddrs(struct ifaddrs **ifap)
+{
+  // Returns 0 and stores in *ifap a malloc'ed list with one node per IPv4/IPv6
+  // unicast address of each adapter that is up; -1 with errno set on failure.
+  int ret = -1;
+  PIP_ADAPTER_ADDRESSES adapter_addrs = NULL;
+  struct ifaddrs *out_list_head = NULL;
+  struct ifaddrs *out_list_curr;
+  DWORD size = 0;  // in/out for GetAdaptersAddresses, so it must start defined
+  DWORD res = GetAdaptersAddresses(  // NULL buffer: only query the needed size
+    AF_UNSPEC, GAA_FLAG_INCLUDE_PREFIX, NULL, NULL, &size);
+  if (res != ERROR_BUFFER_OVERFLOW) {
+    errno = ENOMEM;
+    goto out;
+  }
+  adapter_addrs = (PIP_ADAPTER_ADDRESSES)malloc(size);
+  if (adapter_addrs)
+    res = GetAdaptersAddresses(
+      AF_UNSPEC, GAA_FLAG_INCLUDE_PREFIX, NULL, adapter_addrs, &size);
+  if (!adapter_addrs || res != ERROR_SUCCESS) {
+    errno = ENOMEM;
+    goto out;
+  }
+
+  for (PIP_ADAPTER_ADDRESSES curr_addrs = adapter_addrs;
+       curr_addrs != NULL;
+       curr_addrs = curr_addrs->Next) {
+    if (curr_addrs->OperStatus != 1)  // 1 is IfOperStatusUp
+      continue;
+
+    for (PIP_ADAPTER_UNICAST_ADDRESS unicast_addrs = curr_addrs->FirstUnicastAddress;
+         unicast_addrs != NULL;
+         unicast_addrs = unicast_addrs->Next) {
+      SOCKADDR* unicast_sockaddr = unicast_addrs->Address.lpSockaddr;
+      if (unicast_sockaddr->sa_family != AF_INET &&
+          unicast_sockaddr->sa_family != AF_INET6)
+        continue;
+      // Explicit cast: C++ does not convert calloc()'s void* implicitly.
+      out_list_curr = (struct ifaddrs *) calloc(1, sizeof(*out_list_curr));
+      if (!out_list_curr) {
+        errno = ENOMEM;
+        goto out;
+      }
+
+      // Link the node right away so the failure path below can free it.
+      out_list_curr->ifa_next = out_list_head;
+      out_list_head = out_list_curr;
+
+      out_list_curr->ifa_flags = IFF_UP;
+      if (curr_addrs->IfType == IF_TYPE_SOFTWARE_LOOPBACK)
+        out_list_curr->ifa_flags |= IFF_LOOPBACK;
+
+      out_list_curr->ifa_addr = (struct sockaddr *) &out_list_curr->in_addrs;
+      out_list_curr->ifa_netmask = (struct sockaddr *) &out_list_curr->in_netmasks;
+      out_list_curr->ifa_name = out_list_curr->ad_name;
+
+      if (unicast_sockaddr->sa_family == AF_INET) {
+        ULONG subnet_mask = 0;
+        if (ConvertLengthToIpv4Mask(unicast_addrs->OnLinkPrefixLength, &subnet_mask)) {
+          errno = ENODATA;
+          goto out;
+        }
+        struct sockaddr_in *addr4 = (struct sockaddr_in *) &out_list_curr->in_addrs;
+        struct sockaddr_in *netmask4 = (struct sockaddr_in *) &out_list_curr->in_netmasks;
+        netmask4->sin_family = unicast_sockaddr->sa_family;
+        addr4->sin_family = unicast_sockaddr->sa_family;
+        netmask4->sin_addr.S_un.S_addr = subnet_mask;
+        addr4->sin_addr = ((struct sockaddr_in*) unicast_sockaddr)->sin_addr;
+      } else {
+        struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *) &out_list_curr->in_addrs;
+        (*addr6) = *(struct sockaddr_in6 *) unicast_sockaddr;
+      }
+      out_list_curr->speed = curr_addrs->TransmitLinkSpeed;
+      // TODO maybe use friendly name instead of adapter GUID
+      sprintf_s(out_list_curr->ad_name, sizeof(out_list_curr->ad_name),
+                "%s", curr_addrs->AdapterName);  // name is data, not a format
+    }
+  }
+  ret = 0;
+out:
+  free(adapter_addrs);
+  while (ret && out_list_head) {  // failure: free every node, not just the head
+    out_list_curr = out_list_head->ifa_next;
+    free(out_list_head);
+    out_list_head = out_list_curr;
+  }
+  if (ifap)
+    *ifap = out_list_head;
+  return ret;
+}
+
+void freeifaddrs(struct ifaddrs *ifa)
+{
+  // Walk the ifa_next chain and free() each node; the successor pointer is
+  // read before its owner is released. A NULL list is a no-op.
+  for (struct ifaddrs *node = ifa, *following; node != NULL; node = following) {
+    following = node->ifa_next;
+    free(node);
+  }
+}
diff --git a/src/common/win32/registry.cc b/src/common/win32/registry.cc
new file mode 100644
index 000000000..85ab80df9
--- /dev/null
+++ b/src/common/win32/registry.cc
@@ -0,0 +1,165 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/win32/registry.h"
+
+RegistryKey::RegistryKey(CephContext *cct_, HKEY hRootKey, LPCTSTR strKey,
+                         bool create_value): cct(cct_)
+{
+  // Open the key with full access; a missing key is created on request.
+  DWORD rc = RegOpenKeyEx(hRootKey, strKey, 0, KEY_ALL_ACCESS, &hKey);
+  if (create_value && rc == ERROR_FILE_NOT_FOUND) {
+    ldout(cct_, 10) << "Creating registry key: " << strKey << dendl;
+    rc = RegCreateKeyEx(hRootKey, strKey, 0, NULL, REG_OPTION_NON_VOLATILE,
+                        KEY_ALL_ACCESS, NULL, &hKey, NULL);
+  }
+
+  if (rc == ERROR_SUCCESS) {
+    return;
+  }
+
+  // A key that is still missing is only flagged; other failures are logged.
+  if (rc == ERROR_FILE_NOT_FOUND) {
+    missingKey = true;
+    return;
+  }
+  lderr(cct_) << "Error: " << win32_strerror(rc)
+              << ". Could not open registry key: " << strKey << dendl;
+}
+
+RegistryKey::~RegistryKey() {
+  // Nothing to close while hKey is unset.
+  if (hKey == NULL)
+    return;
+  const DWORD rc = RegCloseKey(hKey);
+  if (rc == ERROR_SUCCESS) {
+    hKey = NULL;  // closed; drop the stale handle value
+    return;
+  }
+  derr << "Error: " << win32_strerror(rc)
+       << ". Could not close registry key." << dendl;
+}
+
+int RegistryKey::remove(CephContext *cct_, HKEY hRootKey, LPCTSTR strKey)
+{
+  // Deleting a key that is already absent counts as success.
+  const DWORD rc = RegDeleteKeyEx(hRootKey, strKey, KEY_WOW64_64KEY, 0);
+
+  switch (rc) {
+  case ERROR_SUCCESS:
+    return 0;
+  case ERROR_FILE_NOT_FOUND:
+    ldout(cct_, 20) << "Registry key : " << strKey
+                    << " does not exist." << dendl;
+    return 0;
+  default:
+    // Every other status is logged and reported as -EINVAL.
+    lderr(cct_) << "Error: " << win32_strerror(rc)
+                << ". Could not delete registry key: "
+                << strKey << dendl;
+    return -EINVAL;
+  }
+}
+
+int RegistryKey::flush() {
+  // Calls RegFlushKey on the held handle; 0 on success, -EINVAL otherwise.
+  const DWORD rc = RegFlushKey(hKey);
+  if (rc == ERROR_SUCCESS) {
+    return 0;
+  }
+  derr << "Error: " << win32_strerror(rc)
+       << ". Could not flush registry key." << dendl;
+  return -EINVAL;
+}
+
+int RegistryKey::set(LPCTSTR lpValue, DWORD data)
+{
+  // Store the number as a REG_DWORD value named lpValue.
+  const DWORD rc = RegSetValueEx(hKey, lpValue, 0, REG_DWORD,
+                                 (LPBYTE)&data, sizeof(DWORD));
+  if (rc == ERROR_SUCCESS) {
+    return 0;
+  }
+  derr << "Error: " << win32_strerror(rc)
+       << ". Could not set registry value: " << (char*)lpValue << dendl;
+  return -EINVAL;
+}
+
+int RegistryKey::set(LPCTSTR lpValue, std::string data)
+{  // stores data as a REG_SZ value named lpValue; 0 on success, else -EINVAL
+  // REG_SZ sizes must include the terminating NUL, hence length() + 1.
+  DWORD status = RegSetValueEx(hKey, lpValue, 0, REG_SZ,
+                               (LPBYTE)data.c_str(), data.length() + 1);
+  if (ERROR_SUCCESS != status) {
+    derr << "Error: " << win32_strerror(status)
+         << ". Could not set registry value: " << (char*)lpValue << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+int RegistryKey::get(LPCTSTR lpValue, bool& value)
+{
+  // Read through the DWORD overload; value is written only on success.
+  DWORD raw = 0;
+  const int rc = get(lpValue, raw);
+  if (rc == 0)
+    value = (raw != 0);
+  return rc;
+}
+
+int RegistryKey::get(LPCTSTR lpValue, DWORD& value)
+{
+  // Query into a local first so value is only touched on success.
+  DWORD fetched;
+  DWORD fetched_size = sizeof(fetched);
+  DWORD value_type = REG_DWORD;
+  const DWORD rc = RegQueryValueEx(hKey, lpValue, NULL, &value_type,
+                                   (LPBYTE)&fetched, &fetched_size);
+  if (rc == ERROR_SUCCESS) {
+    value = fetched;
+    return 0;
+  }
+
+  derr << "Error: " << win32_strerror(rc)
+       << ". Could not get registry value: " << (char*)lpValue << dendl;
+  return -EINVAL;
+}
+
+int RegistryKey::get(LPCTSTR lpValue, std::string& value)
+{
+  // Reads the value named lpValue into value (text up to the first NUL);
+  // returns 0 on success, -EINVAL otherwise (value is then left untouched).
+  std::string data{""};
+  DWORD size = 0;
+  DWORD type = REG_SZ;
+  // The buffer is passed as &data[0]: the API writes into it, and the array
+  // returned by c_str() must not be modified.
+  DWORD status = RegQueryValueEx(hKey, lpValue, NULL, &type,
+                                 (LPBYTE)&data[0], &size);
+  if (ERROR_MORE_DATA == status) {  // size now holds the required byte count
+    data.resize(size);
+    status = RegQueryValueEx(hKey, lpValue, NULL, &type,
+                             (LPBYTE)&data[0], &size);
+  }
+  if (ERROR_SUCCESS != status) {
+    derr << "Error: " << win32_strerror(status)
+         << ". Could not get registry value: " << (char*)lpValue << dendl;
+    return -EINVAL;
+  }
+  value.assign(data.c_str());
+  return 0;
+}
diff --git a/src/common/win32/registry.h b/src/common/win32/registry.h
new file mode 100644
index 000000000..fdaf9708a
--- /dev/null
+++ b/src/common/win32/registry.h
@@ -0,0 +1,38 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/ceph_context.h"
+
+
+class RegistryKey {
+public:
+  RegistryKey(CephContext *cct_, HKEY hRootKey, LPCTSTR strKey, bool create_value);
+  ~RegistryKey();
+  /* NOTE(review): `int` assumed from the other members; confirm in registry.cc. */
+  static int remove(CephContext *cct_, HKEY hRootKey, LPCTSTR strKey);
+
+  int flush();
+  /* The set()/get() overloads return 0 on success and -EINVAL on failure. */
+  int set(LPCTSTR lpValue, DWORD data);
+  int set(LPCTSTR lpValue, std::string data);
+
+  int get(LPCTSTR lpValue, bool& value);
+  int get(LPCTSTR lpValue, DWORD& value);
+  int get(LPCTSTR lpValue, std::string& value);
+
+  HKEY hKey = NULL;
+  bool missingKey = false;
+
+private:
+  CephContext *cct;
+};
diff --git a/src/common/win32/service.cc b/src/common/win32/service.cc
new file mode 100644
index 000000000..7cf7620bf
--- /dev/null
+++ b/src/common/win32/service.cc
@@ -0,0 +1,156 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/win32/service.h"
+
+// Initialize the singleton service instance.
+ServiceBase *ServiceBase::s_service = NULL;
+
+/* Set up the initial SERVICE_STATUS; no status handle is registered yet. */
+ServiceBase::ServiceBase(CephContext *cct_): cct(cct_), hstatus(NULL) {
+  status.dwServiceType = SERVICE_WIN32_OWN_PROCESS;
+  status.dwControlsAccepted = SERVICE_ACCEPT_STOP | SERVICE_ACCEPT_SHUTDOWN;
+  status.dwCurrentState = SERVICE_START_PENDING;
+  status.dwWin32ExitCode = NO_ERROR;
+  status.dwServiceSpecificExitCode = 0;  /* no service-specific error */
+  status.dwCheckPoint = 0;
+  status.dwWaitHint = 0;  /* estimated time for a pending operation, in ms */
+}
+
+/* Record the singleton instance and hand control to the service control
+ * dispatcher, passing run() as the service main function. */
+int ServiceBase::initialize(ServiceBase *service)
+{
+  s_service = service;
+
+  SERVICE_TABLE_ENTRY dispatch_table[] = {
+    {"", (LPSERVICE_MAIN_FUNCTION)run},
+    {NULL, NULL}
+  };
+
+  /* StartServiceCtrlDispatcher blocks until the service is stopped. */
+  if (StartServiceCtrlDispatcher(dispatch_table)) {
+    return 0;
+  }
+  int err = GetLastError();
+  lderr(service->cct) << "StartServiceCtrlDispatcher error: " << err << dendl;
+  return -EINVAL;
+}
+
+/* Service main function handed to the dispatcher by initialize(). */
+void WINAPI ServiceBase::run()
+{
+  assert(s_service != NULL);
+  ServiceBase *svc = s_service;
+  /* The control handler registered here is what the service manager calls
+   * to stop the service. The service name passed in does not have to be
+   * valid because the service type is SERVICE_WIN32_OWN_PROCESS. */
+  svc->hstatus = RegisterServiceCtrlHandler(
+    "", (LPHANDLER_FUNCTION)control_handler);
+  if (!svc->hstatus) {
+    lderr(svc->cct) << "Could not initialize service control handler. "
+                    << "Error: " << GetLastError() << dendl;
+    return;
+  }
+
+  svc->set_status(SERVICE_START_PENDING);
+
+  // TODO: should we expect exceptions?
+  ldout(svc->cct, 0) << "Starting service." << dendl;
+  const int err = svc->run_hook();
+  if (!err) {
+    ldout(svc->cct, 0) << "Successfully started service." << dendl;
+    svc->set_status(SERVICE_RUNNING);
+    return;
+  }
+  lderr(svc->cct) << "Failed to start service. Error code: " << err << dendl;
+  svc->shutdown(true);
+}
+
+/* Run the shutdown hook. On failure the service is still marked stopped when
+ * ignore_errors is set; otherwise the state seen on entry is restored. */
+void ServiceBase::shutdown(bool ignore_errors) {
+  const DWORD prev_state = status.dwCurrentState;
+  set_status(SERVICE_STOP_PENDING);
+  const int err = shutdown_hook();
+  if (!err) {
+    dout(0) << "Shutdown hook completed." << dendl;
+    set_status(SERVICE_STOPPED);
+    return;
+  }
+  derr << "Shutdown service hook failed. Error code: " << err << dendl;
+  if (!ignore_errors) {
+    derr << "Reverting to original service state." << dendl;
+    set_status(prev_state);
+    return;
+  }
+  derr << "Ignoring shutdown hook failure, marking the service as stopped."
+       << dendl;
+  set_status(SERVICE_STOPPED);
+}
+
+/* Run the stop hook; mark the service stopped on success, otherwise restore
+ * the state the service was in when the request arrived. */
+void ServiceBase::stop() {
+  const DWORD prev_state = status.dwCurrentState;
+  set_status(SERVICE_STOP_PENDING);
+  const int err = stop_hook();
+  if (!err) {
+    dout(0) << "Successfully stopped service." << dendl;
+    set_status(SERVICE_STOPPED);
+    return;
+  }
+  derr << "Service stop hook failed. Error code: " << err << dendl;
+  set_status(prev_state);
+}
+
+/* Control handler registered with the Windows service manager through
+ * RegisterServiceCtrlHandler(). The service manager invokes it
+ * asynchronously; only stop and shutdown requests are acted upon. */
+void ServiceBase::control_handler(DWORD request)
+{
+  if (request == SERVICE_CONTROL_STOP) {
+    /* Explicit request to stop the service. */
+    s_service->stop();
+    return;
+  }
+
+  if (request == SERVICE_CONTROL_SHUTDOWN) {
+    /* Hook failures are not ignored here (default argument of shutdown). */
+    s_service->shutdown();
+  }
+}
+
+/* Record the new state and exit code, then report them if possible. */
+void ServiceBase::set_status(DWORD current_state, DWORD exit_code) {
+  static DWORD dwCheckPoint = 1;
+  /* The checkpoint must be zero unless an operation is still pending. */
+  bool settled = (current_state == SERVICE_RUNNING ||
+                  current_state == SERVICE_STOPPED);
+  status.dwCheckPoint = settled ? 0 : dwCheckPoint++;
+  status.dwCurrentState = current_state;
+  status.dwWin32ExitCode = exit_code;
+  if (hstatus) {
+    dout(5) << "Updating service status (" << current_state
+            << ") and exit code(" << exit_code << ")." << dendl;
+    ::SetServiceStatus(hstatus, &status);
+  } else {
+    derr << "Service control handler not initialized. Cannot "
+         << "update service status (" << current_state
+         << ") and exit code(" << exit_code << ")." << dendl;
+  }
+}
diff --git a/src/common/win32/service.h b/src/common/win32/service.h
new file mode 100644
index 000000000..20d37a876
--- /dev/null
+++ b/src/common/win32/service.h
@@ -0,0 +1,49 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/ceph_context.h"
+
+class ServiceBase {
+/* Base class for a Windows service; subclasses provide the hooks below. */
+public:
+  ServiceBase(CephContext *cct_);
+  virtual ~ServiceBase() {};
+  /* Stores the singleton and blocks in the service control dispatcher. */
+  static int initialize(ServiceBase *service);
+protected:
+  static void run();
+  static void control_handler(DWORD request);
+  /* Both report SERVICE_STOP_PENDING before invoking their hook. */
+  void shutdown(bool ignore_errors = false);
+  void stop();
+  /* Updates `status` and reports it when a status handle is available. */
+  void set_status(DWORD current_state, DWORD exit_code = NO_ERROR);
+  /* Hooks implemented by subclasses; a nonzero return signals failure. */
+  /* Invoked from run() to start the service. */
+  virtual int run_hook() = 0;
+  /* Invoked when the service is requested to stop. */
+  virtual int stop_hook() = 0;
+  /* Invoked when the system is shutting down. */
+  virtual int shutdown_hook() = 0;
+
+  CephContext *cct;
+
+private:
+  /* A handle used when reporting the current status. */
+  SERVICE_STATUS_HANDLE hstatus;
+  /* The current service status. */
+  SERVICE_STATUS status;
+
+  /* singleton service instance */
+  static ServiceBase *s_service;
+};
diff --git a/src/common/win32/syslog.cc b/src/common/win32/syslog.cc
new file mode 100644
index 000000000..7a1a90c9b
--- /dev/null
+++ b/src/common/win32/syslog.cc
@@ -0,0 +1,77 @@
+#include <windows.h>
+#include <syslog.h>
+#include "event_logging.h"
+#include "common/code_environment.h"
+
+static HANDLE g_event_source = NULL;
+/* Lazily register the process-wide event source; false if that fails. */
+bool get_event_source()
+{
+  if (g_event_source) {
+    return true;
+  }
+  HANDLE candidate = RegisterEventSourceA(NULL, get_process_name_cpp().c_str());
+  if (!candidate) {
+    return false;
+  }
+  // If an event source had already been published, keep that one and
+  // release the handle registered above.
+  if (InterlockedCompareExchangePointer(&g_event_source, candidate, NULL)) {
+    DeregisterEventSource(candidate);
+  }
+  return true;
+}
+
+/* Write msg to the Windows event log, mapping the syslog level to an event
+ * type and message id. Returns silently if no event source is available. */
+void write_event_log_entry(int level, const char* msg)
+{
+  if (!get_event_source()) {
+    return;
+  }
+
+  // Any level not matched below is reported as an error.
+  WORD type = EVENTLOG_ERROR_TYPE;
+  DWORD event_id = ERROR_EVENTMSG;
+
+  if (level == LOG_DEBUG) {
+    type = EVENTLOG_SUCCESS;
+    event_id = SUCCESS_EVENTMSG;
+  } else if (level == LOG_INFO || level == LOG_NOTICE) {
+    type = EVENTLOG_INFORMATION_TYPE;
+    event_id = INFO_EVENTMSG;
+  } else if (level == LOG_WARNING) {
+    type = EVENTLOG_WARNING_TYPE;
+    event_id = WARN_EVENTMSG;
+  }
+
+  ReportEventA(g_event_source,
+               type,
+               0,         // category
+               event_id,
+               NULL,      // user SID
+               1,         // number of strings
+               0,         // raw data size
+               &msg,
+               NULL);     // raw data
+}
+
+/* syslog() replacement: formats the message and writes it to the event log. */
+void syslog(int priority, const char* format, ...) {
+  va_list args, args_copy;
+  va_start(args, format);
+  // A va_list may only be consumed once, so measure using a copy.
+  va_copy(args_copy, args);
+  const int msg_len = _vscprintf(format, args_copy);
+  va_end(args_copy);
+  // _vscprintf() reports failures with a negative length.
+  char* buffer = (msg_len < 0) ? NULL : (char*) malloc((size_t)msg_len + 1);
+  if (NULL == buffer) {
+    va_end(args);
+    return;
+  }
+  vsnprintf_s(buffer, (size_t)msg_len + 1, (size_t)msg_len, format, args);
+  va_end(args);
+  write_event_log_entry(LOG_PRI(priority), buffer);
+  free(buffer);
+}
diff --git a/src/common/win32/wstring.cc b/src/common/win32/wstring.cc
new file mode 100644
index 000000000..1f9b49a58
--- /dev/null
+++ b/src/common/win32/wstring.cc
@@ -0,0 +1,27 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Cloudbase Solutions
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "wstring.h"
+
+#include <boost/locale/encoding_utf.hpp>
+
+using boost::locale::conv::utf_to_utf;
+
+// UTF conversion from narrow (char) to wide (wchar_t) characters.
+std::wstring to_wstring(const std::string& str) {
+  return utf_to_utf<wchar_t>(str.data(), str.data() + str.size());
+}
+
+// UTF conversion from wide (wchar_t) to narrow (char) characters.
+std::string to_string(const std::wstring& str) {
+  return utf_to_utf<char>(str.data(), str.data() + str.size());
+}
diff --git a/src/common/win32/wstring.h b/src/common/win32/wstring.h
new file mode 100644
index 000000000..cb308c70d
--- /dev/null
+++ b/src/common/win32/wstring.h
@@ -0,0 +1,18 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Cloudbase Solutions
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <string>
+
+std::wstring to_wstring(const std::string& str);  // narrow -> wide
+std::string to_string(const std::wstring& wstr);  // wide -> narrow
diff --git a/src/common/zipkin_trace.h b/src/common/zipkin_trace.h
new file mode 100644
index 000000000..5a960610d
--- /dev/null
+++ b/src/common/zipkin_trace.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef COMMON_ZIPKIN_TRACE_H
+#define COMMON_ZIPKIN_TRACE_H
+
+#include "acconfig.h"
+#include "include/encoding.h"
+
+#ifdef WITH_BLKIN
+
+#include <ztracer.hpp>
+
+#else // !WITH_BLKIN
+
+// add stubs for noop Trace and Endpoint
+
+// match the "real" struct (stand-in used when WITH_BLKIN is not defined)
+struct blkin_trace_info {
+    int64_t trace_id;
+    int64_t span_id;
+    int64_t parent_span_id;
+};
+
+namespace ZTracer
+{
+static inline int ztrace_init() { return 0; }  // stub: nothing to set up
+// Stub Endpoint: every constructor and setter ignores its arguments.
+class Endpoint {
+ public:
+  Endpoint(const char *name) {}
+  Endpoint(const char *ip, int port, const char *name) {}
+
+  void copy_ip(const std::string &newip) {}
+  void copy_name(const std::string &newname) {}
+  void copy_address_from(const Endpoint *endpoint) {}
+  void share_address_from(const Endpoint *endpoint) {}
+  void set_port(int p) {}
+};
+// Stub Trace: never valid, records nothing and carries no trace info.
+class Trace {
+ public:
+  Trace() {}
+  Trace(const char *name, const Endpoint *ep, const Trace *parent = NULL) {}
+  Trace(const char *name, const Endpoint *ep,
+        const blkin_trace_info *i, bool child=false) {}
+
+  bool valid() const { return false; }
+  operator bool() const { return false; }
+
+  int init(const char *name, const Endpoint *ep, const Trace *parent = NULL) {
+    return 0;
+  }
+  int init(const char *name, const Endpoint *ep,
+           const blkin_trace_info *i, bool child=false) {
+    return 0;
+  }
+
+  void copy_name(const std::string &newname) {}
+
+  const blkin_trace_info* get_info() const { return NULL; }
+  void set_info(const blkin_trace_info *i) {}
+
+  void keyval(const char *key, const char *val) const {}
+  void keyval(const char *key, int64_t val) const {}
+  void keyval(const char *key, const char *val, const Endpoint *ep) const {}
+  void keyval(const char *key, int64_t val, const Endpoint *ep) const {}
+
+  void event(const char *event) const {}
+  void event(const char *event, const Endpoint *ep) const {}
+};
+} // namespace ZTracer
+
+#endif // !WITH_BLKIN
+// Append the trace, span and parent span ids of b to bl, in that order.
+static inline void encode(const blkin_trace_info& b, ceph::buffer::list& bl)
+{
+  using ceph::encode;
+  encode(b.trace_id, bl);
+  encode(b.span_id, bl);
+  encode(b.parent_span_id, bl);
+}
+// Read the trace, span and parent span ids from p into b, in that order.
+static inline void decode(blkin_trace_info& b, ceph::buffer::list::const_iterator& p)
+{
+  using ceph::decode;
+  decode(b.trace_id, p);
+  decode(b.span_id, p);
+  decode(b.parent_span_id, p);
+}
+
+
+
+#endif // COMMON_ZIPKIN_TRACE_H