Adding upstream version 14.2.21.upstream/14.2.21 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
commit: 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree: e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/common
parent: Initial commit. (diff)
download: ceph-upstream.tar.xz
ceph-upstream.zip
279 files changed, 69430 insertions, 0 deletions
diff --git a/src/common/AsyncOpTracker.cc b/src/common/AsyncOpTracker.cc
new file mode 100644
index 00000000..fb6439d3
--- /dev/null
+++ b/src/common/AsyncOpTracker.cc
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/AsyncOpTracker.h"
+#include "include/Context.h"
+
+// Nothing to set up here: the members are initialized where they are
+// declared in the class.
+AsyncOpTracker::AsyncOpTracker() = default;
+
+// A tracker must not be destroyed while operations are still pending;
+// the counter is checked under the lock.
+AsyncOpTracker::~AsyncOpTracker()
+{
+  std::lock_guard guard(m_lock);
+  ceph_assert(0 == m_pending_ops);
+}
+
+// Account for one more operation in flight.
+void AsyncOpTracker::start_op()
+{
+  std::lock_guard guard(m_lock);
+  m_pending_ops += 1;
+}
+
+// Mark one operation as finished.  When the pending count drops to zero
+// the registered waiter (if any) is detached under the lock and then
+// completed with result 0 after the lock has been released.
+void AsyncOpTracker::finish_op()
+{
+  Context *waiter = nullptr;
+  {
+    std::lock_guard guard(m_lock);
+    ceph_assert(m_pending_ops > 0);
+    m_pending_ops -= 1;
+    if (m_pending_ops == 0) {
+      waiter = m_on_finish;
+      m_on_finish = nullptr;
+    }
+  }
+
+  if (waiter) {
+    waiter->complete(0);
+  }
+}
+
+// Arrange for on_finish to be completed (with 0) once no operations are
+// pending.  If the tracker is already idle the context is completed
+// right away, outside the lock.  Only one waiter may be registered at a
+// time.
+void AsyncOpTracker::wait_for_ops(Context *on_finish)
+{
+  bool idle = false;
+  {
+    std::lock_guard guard(m_lock);
+    ceph_assert(m_on_finish == nullptr);
+    idle = (m_pending_ops == 0);
+    if (!idle) {
+      m_on_finish = on_finish;
+    }
+  }
+  if (idle) {
+    on_finish->complete(0);
+  }
+}
+
+// Report whether no operations are pending at the moment of the call.
+bool AsyncOpTracker::empty()
+{
+  std::lock_guard guard(m_lock);
+  const bool no_pending = (0 == m_pending_ops);
+  return no_pending;
+}
+
diff --git a/src/common/AsyncOpTracker.h b/src/common/AsyncOpTracker.h
new file mode 100644
index 00000000..d913032a
--- /dev/null
+++ b/src/common/AsyncOpTracker.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_ASYNC_OP_TRACKER_H
+#define CEPH_ASYNC_OP_TRACKER_H
+
+#include "common/ceph_mutex.h"
+
+struct Context;
+
+class AsyncOpTracker {  // counts in-flight async ops and can complete a Context once the count is zero
+public:
+  AsyncOpTracker();
+  ~AsyncOpTracker();  // asserts that no ops are pending
+
+  void start_op();   // increment the pending-op count
+  void finish_op();  // decrement the count; completes the registered waiter when it reaches zero
+
+  void wait_for_ops(Context *on_finish);  // complete on_finish(0) now if idle, otherwise when the last op finishes
+
+  bool empty();  // true when no ops are pending
+
+private:
+  ceph::mutex m_lock = ceph::make_mutex("AsyncOpTracker::m_lock");  // taken by every member function
+  uint32_t m_pending_ops = 0;      // ops started but not yet finished
+  Context *m_on_finish = nullptr;  // waiter stored by wait_for_ops(), if any
+
+};
+
+#endif // CEPH_ASYNC_OP_TRACKER_H
diff --git a/src/common/AsyncReserver.h b/src/common/AsyncReserver.h
new file mode 100644
index 00000000..8632a5f0
--- /dev/null
+++ b/src/common/AsyncReserver.h
@@ -0,0 +1,321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef ASYNC_RESERVER_H
+#define ASYNC_RESERVER_H
+
+#include "common/Finisher.h"
+#include "common/Formatter.h"
+
+#define rdout(x) lgeneric_subdout(cct,reserver,x)
+
+/**
+ * Manages a configurable number of asynchronous reservations.
+ *
+ * Memory usage is linear with the number of items queued and
+ * linear with respect to the total number of priorities used
+ * over all time.
+ */
+template <typename T>
+class AsyncReserver {
+  CephContext *cct;
+  Finisher *f;
+  unsigned max_allowed;
+  unsigned min_priority;
+  ceph::mutex lock = ceph::make_mutex("AsyncReserver::lock");
+
+  /// One reservation request: the key, its priority and the callbacks
+  /// to run when it is granted or preempted.
+  struct Reservation {
+    T item;
+    unsigned prio = 0;
+    Context *grant = nullptr;
+    Context *preempt = nullptr;
+
+    Reservation() {}
+    Reservation(T i, unsigned pr, Context *g, Context *p = nullptr)
+      : item(i),
+        prio(pr),
+        grant(g),
+        preempt(p)
+    {}
+
+    /// Emit item, priority and whether a preempt callback is set.
+    void dump(Formatter *f) const {
+      f->dump_stream("item") << item;
+      f->dump_unsigned("prio", prio);
+      const bool can_preempt = (preempt != nullptr);
+      f->dump_bool("can_preempt", can_preempt);
+    }
+
+    friend ostream& operator<<(ostream& out, const Reservation& r) {
+      out << r.item << "(prio " << r.prio << " grant " << r.grant
+          << " preempt " << r.preempt << ")";
+      return out;
+    }
+  };
+
+  map<unsigned, list<Reservation>> queues;
+  map<T, pair<unsigned, typename list<Reservation>::iterator>> queue_pointers;
+  map<T,Reservation> in_progress;
+  set<pair<unsigned,T>> preempt_by_prio;  ///< in_progress that can be preempted
+
+  /// Preempt the in-progress reservation with the lowest priority among
+  /// those that can be preempted: queue its preempt callback on the
+  /// finisher and drop it from the in-progress bookkeeping.
+  void preempt_one() {
+    ceph_assert(!preempt_by_prio.empty());
+    auto lowest = preempt_by_prio.begin();
+    auto q = in_progress.find(lowest->second);
+    ceph_assert(q != in_progress.end());
+    // work on a copy: the map entry is erased below
+    Reservation victim = q->second;
+    rdout(10) << __func__ << " preempt " << victim << dendl;
+    f->queue(victim.preempt);
+    in_progress.erase(q);
+    preempt_by_prio.erase(lowest);
+  }
+
+  void do_queues() {  // preempt and grant until the limits are met; every caller in this class holds `lock`
+    rdout(20) << __func__ << ":\n";  // NOTE(review): _dout used below is presumably declared by this macro's expansion -- confirm
+    JSONFormatter jf(true);
+    jf.open_object_section("queue");
+    _dump(&jf);
+    jf.close_section();
+    jf.flush(*_dout);  // append the JSON dump of the current state to the same log entry
+    *_dout << dendl;
+
+    // Preempt while we are over max_allowed, or while the lowest-priority
+    // preemptible reservation is below min_priority (in case min_priority
+    // was adjusted up or max_allowed was adjusted down).
+    while (!preempt_by_prio.empty() &&
+	   (in_progress.size() > max_allowed ||
+	    preempt_by_prio.begin()->first < min_priority)) {
+      preempt_one();
+    }
+
+    while (!queues.empty()) {
+      // choose highest priority queue (the map is ordered by priority, so it is the last entry)
+      auto it = queues.end();
+      --it;
+      ceph_assert(!it->second.empty());
+      if (it->first < min_priority) {
+	break;  // nothing waiting is allowed to run
+      }
+      if (in_progress.size() >= max_allowed &&
+	  !preempt_by_prio.empty() &&
+	  it->first > preempt_by_prio.begin()->first) {
+	preempt_one();  // make room by evicting a strictly lower-priority reservation
+      }
+      if (in_progress.size() >= max_allowed) {
+	break; // no room
+      }
+      // grant: move the head of the queue into in_progress
+      Reservation p = it->second.front();
+      rdout(10) << __func__ << " grant " << p << dendl;
+      queue_pointers.erase(p.item);
+      it->second.pop_front();
+      if (it->second.empty()) {
+	queues.erase(it);  // empty queues are never kept (see the assert above)
+      }
+      f->queue(p.grant);
+      p.grant = nullptr;  // callback was handed to the finisher; the stored copy must not keep it
+      in_progress[p.item] = p;
+      if (p.preempt) {
+	preempt_by_prio.insert(make_pair(p.prio, p.item));
+      }
+    }
+  }
+public:
+  /// Build a reserver that keeps at most @p max_allowed reservations in
+  /// progress, ignores queued requests below @p min_priority and queues
+  /// its callbacks on @p f.
+  AsyncReserver(CephContext *cct, Finisher *f, unsigned max_allowed,
+                unsigned min_priority = 0)
+    : cct(cct), f(f), max_allowed(max_allowed), min_priority(min_priority)
+  {
+  }
+
+  /// Change the number of reservations allowed in progress and
+  /// re-evaluate the queues under the new limit.
+  void set_max(unsigned max)
+  {
+    std::lock_guard guard(lock);
+    max_allowed = max;
+    do_queues();
+  }
+
+  /// Change the minimum priority that may run and re-evaluate the
+  /// queues under the new threshold.
+  void set_min_priority(unsigned min)
+  {
+    std::lock_guard guard(lock);
+    min_priority = min;
+    do_queues();
+  }
+
+  /**
+   * Update the priority of a reservation
+   *
+   * Note, on_reserved may be called following update_priority.  Thus,
+   * the callback must be safe in that case.  Callback will be called
+   * with no locks held.  cancel_reservation must be called to release the
+   * reservation slot.
+   *
+   * Cases
+   * 1. Item is queued: re-queue it with the new priority
+   * 2. Item is queued: re-queue it, and do_queues() preempts if the new
+   *    priority is higher than that of an in-progress item
+   * 3. Item is in progress: just adjust the priority if nothing of higher
+   *    priority is waiting
+   * 4. Item is in progress: adjust the priority; if higher priority items
+   *    are waiting, do_queues() may preempt the item
+   *
+   * An unknown item is only logged.
+   */
+  void update_priority(T item, unsigned newprio) {
+    std::lock_guard l(lock);
+    auto i = queue_pointers.find(item);
+    if (i != queue_pointers.end()) {
+      unsigned prio = i->second.first;
+      if (newprio == prio)
+        return;
+      Reservation r = *i->second.second;  // copy before the list node is erased
+      rdout(10) << __func__ << " update " << r << " (was queued)" << dendl;
+      // Like cancel_reservation() without preempting (and without deleting the callbacks)
+      queues[prio].erase(i->second.second);
+      if (queues[prio].empty()) {
+	queues.erase(prio);
+      }
+      queue_pointers.erase(i);
+
+      // Like request_reservation() to re-queue it but with new priority
+      ceph_assert(!queue_pointers.count(item) &&
+	   !in_progress.count(item));
+      r.prio = newprio;
+      queues[newprio].push_back(r);
+      queue_pointers.insert(make_pair(item,
+				    make_pair(newprio,--(queues[newprio]).end())));
+    } else {
+      auto p = in_progress.find(item);
+      if (p != in_progress.end()) {
+        if (p->second.prio == newprio)
+          return;
+	rdout(10) << __func__ << " update " << p->second
+		  << " (in progress)" << dendl;
+        // We want to preempt if the priority goes down
+        // and is smaller than the highest priority waiting
+	if (p->second.preempt) {
+	  if (newprio < p->second.prio && !queues.empty()) {
+            // choose highest priority queue
+            auto it = queues.end();
+            --it;
+            ceph_assert(!it->second.empty());
+            if (it->first > newprio) {  // this branch only logs; do_queues() below makes the decision
+	      rdout(10) << __func__ << " update " << p->second
+		        << " lowered priority let do_queues() preempt it" << dendl;
+            }
+          }
+	  preempt_by_prio.erase(make_pair(p->second.prio, p->second.item));  // the set is keyed by (prio, item), so re-key it
+          p->second.prio = newprio;
+	  preempt_by_prio.insert(make_pair(p->second.prio, p->second.item));
+	} else {
+          p->second.prio = newprio;
+        }
+      } else {
+	rdout(10) << __func__ << " update " << item << " (not found)" << dendl;
+      }
+    }
+    do_queues();
+    return;
+  }
+
+  /// Locked wrapper around _dump().
+  void dump(Formatter *f)
+  {
+    std::lock_guard guard(lock);
+    _dump(f);
+  }
+  /// Write the limits, every priority queue with its waiting items, and
+  /// the in-progress reservations to @p f.  Takes no lock itself.
+  void _dump(Formatter *f)
+  {
+    f->dump_unsigned("max_allowed", max_allowed);
+    f->dump_unsigned("min_priority", min_priority);
+
+    f->open_array_section("queues");
+    for (auto& [priority, waiting] : queues) {
+      f->open_object_section("queue");
+      f->dump_unsigned("priority", priority);
+      f->open_array_section("items");
+      for (auto& res : waiting) {
+        f->dump_object("item", res);
+      }
+      f->close_section();  // items
+      f->close_section();  // queue
+    }
+    f->close_section();  // queues
+
+    f->open_array_section("in_progress");
+    for (auto& entry : in_progress) {
+      f->dump_object("item", entry.second);
+    }
+    f->close_section();  // in_progress
+  }
+
+  /**
+   * Requests a reservation
+   *
+   * The item must not already be queued or in progress.
+   *
+   * Note, on_reserved may still be called after cancel_reservation, so
+   * the callback must be safe in that case.  Callbacks are called with
+   * no locks held.  cancel_reservation must be called to release the
+   * reservation slot.
+   */
+  void request_reservation(
+    T item,                   ///< [in] reservation key
+    Context *on_reserved,     ///< [in] callback to be called on reservation
+    unsigned prio,            ///< [in] priority
+    Context *on_preempt = 0   ///< [in] callback to be called if we are preempted (optional)
+    ) {
+    std::lock_guard guard(lock);
+    Reservation r(item, prio, on_reserved, on_preempt);
+    rdout(10) << __func__ << " queue " << r << dendl;
+    ceph_assert(!queue_pointers.count(item) &&
+                !in_progress.count(item));
+    // append to the queue for this priority and remember where it landed
+    auto& waiting = queues[prio];
+    auto pos = waiting.insert(waiting.end(), r);
+    queue_pointers.insert(make_pair(item, make_pair(prio, pos)));
+    do_queues();
+  }
+
+  /**
+   * Cancels reservation
+   *
+   * Frees the reservation under key for use.  A queued request has both
+   * of its callbacks deleted; an in-progress one has its preempt
+   * callback deleted.  An unknown key is only logged.
+   * Note, after cancel_reservation, the reservation_callback may or
+   * may not still be called.
+   */
+  void cancel_reservation(
+    T item                   ///< [in] key for reservation to cancel
+    ) {
+    std::lock_guard guard(lock);
+    auto queued = queue_pointers.find(item);
+    if (queued != queue_pointers.end()) {
+      const unsigned prio = queued->second.first;
+      auto pos = queued->second.second;
+      const Reservation& r = *pos;
+      rdout(10) << __func__ << " cancel " << r << " (was queued)" << dendl;
+      delete r.grant;
+      delete r.preempt;
+      auto& waiting = queues[prio];
+      waiting.erase(pos);
+      if (waiting.empty()) {
+        queues.erase(prio);
+      }
+      queue_pointers.erase(queued);
+    } else if (auto p = in_progress.find(item); p != in_progress.end()) {
+      rdout(10) << __func__ << " cancel " << p->second
+                << " (was in progress)" << dendl;
+      if (p->second.preempt) {
+        preempt_by_prio.erase(make_pair(p->second.prio, p->second.item));
+        delete p->second.preempt;
+      }
+      in_progress.erase(p);
+    } else {
+      rdout(10) << __func__ << " cancel " << item << " (not found)" << dendl;
+    }
+    do_queues();
+  }
+
+  /**
+   * Has reservations
+   *
+   * Return true if at least one reservation is currently in progress
+   * (queued requests do not count).
+   */
+  bool has_reservation()
+  {
+    std::lock_guard guard(lock);
+    const bool any_in_progress = !in_progress.empty();
+    return any_in_progress;
+  }
+  static const unsigned MAX_PRIORITY = (unsigned)-1;
+};
+
+#undef rdout
+#endif
diff --git a/src/common/BackTrace.cc b/src/common/BackTrace.cc
new file mode 100644
index 00000000..90b83df3
--- /dev/null
+++ b/src/common/BackTrace.cc
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <ostream>
+#include <cxxabi.h>
+#include <string.h>
+
+#include "BackTrace.h"
+#include "common/version.h"
+#include "common/Formatter.h"
+
+#define _STR(x) #x
+#define STRINGIFY(x) _STR(x)
+
+namespace ceph {
+
+/**
+ * Print the version banner followed by one line per captured frame.
+ *
+ * Each backtrace_symbols() line is scanned for the symbol that sits
+ * between the opening delimiter and the "+offset" suffix.  C++ mangled
+ * names are demangled; anything else is shown as "name()".  Lines
+ * without such a symbol are printed verbatim.
+ *
+ * @param out stream to write to
+ */
+void BackTrace::print(std::ostream& out) const
+{
+  out << " " << pretty_version_to_str() << std::endl;
+  // backtrace_symbols() returns NULL on failure; print no frames then
+  const size_t nframes = strings ? size : 0;
+  for (size_t i = skip; i < nframes; i++) {
+    //      out << " " << (i-skip+1) << ": " << strings[i] << std::endl;
+
+    size_t sz = 1024; // just a guess, template names will go much wider
+    char *function = (char *)malloc(sz);
+    if (!function)
+      return;
+    char *begin = 0, *end = 0;
+
+    // find the parentheses and address offset surrounding the mangled name
+#ifdef __FreeBSD__
+    static constexpr char OPEN = '<';
+#else
+    static constexpr char OPEN = '(';
+#endif
+    for (char *j = strings[i]; *j; ++j) {
+      if (*j == OPEN)
+	begin = j+1;
+      else if (*j == '+')
+	end = j;
+    }
+    // The last '+' can sit before the delimiter (e.g. a "libstdc++" path
+    // with no "+offset" in the parentheses).  That would give a negative
+    // length, so such a line is treated as having no symbol.
+    if (begin && end && end >= begin) {
+      int len = end - begin;
+      char *foo = (char *)malloc(len+1);
+      if (!foo) {
+	free(function);
+	return;
+      }
+      memcpy(foo, begin, len);
+      foo[len] = 0;
+
+      int status;
+      char *ret = nullptr;
+      // only demangle a C++ mangled name
+      if (foo[0] == '_' && foo[1] == 'Z')
+	ret = abi::__cxa_demangle(foo, function, &sz, &status);
+      if (ret) {
+	// return value may be a realloc() of the input
+	function = ret;
+      }
+      else {
+	// demangling failed, just pretend it's a C function with no args.
+	// strncpy() does not terminate on truncation and strncat()'s bound
+	// is the number of bytes to append, so terminate first and only
+	// append into the space that is left.
+	strncpy(function, foo, sz - 1);
+	function[sz-1] = 0;
+	strncat(function, "()", sz - strlen(function) - 1);
+      }
+      out << " " << (i-skip+1) << ": " << OPEN << function << end << std::endl;
+      //fprintf(out, "    %s:%s\n", stack.strings[i], function);
+      free(foo);
+    } else {
+      // didn't find the mangled name, just print the whole line
+      out << " " << (i-skip+1) << ": " << strings[i] << std::endl;
+    }
+    free(function);
+  }
+}
+
+/**
+ * Dump the captured frames as a "backtrace" array of "frame" entries.
+ *
+ * Frames are decoded the same way as in print(): the symbol between the
+ * opening delimiter and the "+offset" suffix is demangled when it is a
+ * C++ name, shown as "name()" otherwise, and lines without such a
+ * symbol are emitted verbatim.  The array section is always closed,
+ * even when an allocation fails part way through.
+ *
+ * @param f formatter to write to
+ */
+void BackTrace::dump(Formatter *f) const
+{
+  f->open_array_section("backtrace");
+  // backtrace_symbols() returns NULL on failure; emit an empty array then
+  const size_t nframes = strings ? size : 0;
+  for (size_t i = skip; i < nframes; i++) {
+    //      out << " " << (i-skip+1) << ": " << strings[i] << std::endl;
+
+    size_t sz = 1024; // just a guess, template names will go much wider
+    char *function = (char *)malloc(sz);
+    if (!function)
+      break;  // out of memory: stop, but still close the section below
+    char *begin = 0, *end = 0;
+
+    // find the parentheses and address offset surrounding the mangled name
+#ifdef __FreeBSD__
+    static constexpr char OPEN = '<';
+#else
+    static constexpr char OPEN = '(';
+#endif
+    for (char *j = strings[i]; *j; ++j) {
+      if (*j == OPEN)
+	begin = j+1;
+      else if (*j == '+')
+	end = j;
+    }
+    // The last '+' can sit before the delimiter (e.g. a "libstdc++" path
+    // with no "+offset" in the parentheses).  That would give a negative
+    // length, so such a line is treated as having no symbol.
+    if (begin && end && end >= begin) {
+      int len = end - begin;
+      char *foo = (char *)malloc(len+1);
+      if (!foo) {
+	free(function);
+	break;  // out of memory: stop, but still close the section below
+      }
+      memcpy(foo, begin, len);
+      foo[len] = 0;
+
+      int status;
+      char *ret = nullptr;
+      // only demangle a C++ mangled name
+      if (foo[0] == '_' && foo[1] == 'Z')
+	ret = abi::__cxa_demangle(foo, function, &sz, &status);
+      if (ret) {
+	// return value may be a realloc() of the input
+	function = ret;
+      }
+      else {
+	// demangling failed, just pretend it's a C function with no args.
+	// strncpy() does not terminate on truncation and strncat()'s bound
+	// is the number of bytes to append, so terminate first and only
+	// append into the space that is left.
+	strncpy(function, foo, sz - 1);
+	function[sz-1] = 0;
+	strncat(function, "()", sz - strlen(function) - 1);
+      }
+      f->dump_stream("frame") << OPEN << function << end;
+      //fprintf(out, "    %s:%s\n", stack.strings[i], function);
+      free(foo);
+    } else {
+      // didn't find the mangled name, just print the whole line
+      //out << " " << (i-skip+1) << ": " << strings[i] << std::endl;
+      f->dump_string("frame", strings[i]);
+    }
+    free(function);
+  }
+  f->close_section();
+}
+
+}
diff --git a/src/common/BackTrace.h b/src/common/BackTrace.h
new file mode 100644
index 00000000..5cb73d47
--- /dev/null
+++ b/src/common/BackTrace.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_BACKTRACE_H
+#define CEPH_BACKTRACE_H
+
+#include "acconfig.h"
+#include <iosfwd>
+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif
+#include <stdlib.h>
+
+namespace ceph {
+
+class Formatter;
+
+/**
+ * Snapshot of the caller's stack, taken when the object is constructed.
+ *
+ * Up to `max` return addresses are stored in `array`; `strings` holds
+ * the matching backtrace_symbols() text and is freed by the destructor.
+ * Without <execinfo.h> the snapshot is empty.  `skip` is the number of
+ * leading frames that print() and dump() leave out.
+ */
+struct BackTrace {
+  const static int max = 100;
+
+  int skip;
+  void *array[max]{};
+  size_t size;
+  char **strings;
+
+  explicit BackTrace(int s) : skip(s) {
+#ifdef HAVE_EXECINFO_H
+    size = backtrace(array, max);
+    strings = backtrace_symbols(array, size);
+#else
+    skip = 0;
+    size = 0;
+    strings = nullptr;
+#endif
+  }
+  ~BackTrace() {
+    free(strings);
+  }
+
+  // `strings` is freed by the destructor, so a memberwise copy would
+  // free it twice.  Copying is rejected at compile time.
+  BackTrace(const BackTrace& other) = delete;
+  const BackTrace& operator=(const BackTrace& other) = delete;
+
+  void print(std::ostream& out) const;
+  void dump(Formatter *f) const;
+};
+
+// Stream insertion for a backtrace: forwards to BackTrace::print().
+inline std::ostream& operator<<(std::ostream& os, const BackTrace& trace)
+{
+  trace.print(os);
+  return os;
+}
+
+}
+
+#endif
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
new file mode 100644
index 00000000..b77159f4
--- /dev/null
+++ b/src/common/CMakeLists.txt
@@ -0,0 +1,185 @@
+add_library(common_buffer_obj OBJECT
+  buffer.cc)
+
+add_library(common_texttable_obj OBJECT
+  TextTable.cc)
+
+add_library(common_prioritycache_obj OBJECT
+  PriorityCache.cc)
+
+set(common_srcs
+  AsyncOpTracker.cc
+  BackTrace.cc
+  ConfUtils.cc
+  Cycles.cc
+  DecayCounter.cc
+  Finisher.cc
+  Formatter.cc
+  Graylog.cc
+  HTMLFormatter.cc
+  HeartbeatMap.cc
+  LogClient.cc
+  LogEntry.cc
+  Mutex.cc
+  OutputDataSocket.cc
+  PluginRegistry.cc
+  Readahead.cc
+  SloppyCRCMap.cc
+  SubProcess.cc
+  Thread.cc
+  Throttle.cc
+  Timer.cc
+  TracepointProvider.cc
+  TrackedOp.cc
+  WorkQueue.cc
+  address_helper.cc
+  admin_socket.cc
+  admin_socket_client.cc
+  assert.cc
+  bit_str.cc
+  blkdev.cc
+  bloom_filter.cc
+  ceph_argparse.cc
+  ceph_context.cc
+  ceph_crypto.cc
+  ceph_crypto_cms.cc
+  ceph_frag.cc
+  ceph_fs.cc
+  ceph_hash.cc
+  ceph_json.cc
+  ceph_strings.cc
+  ceph_time.cc
+  cmdparse.cc
+  code_environment.cc
+  common_init.cc
+  compat.cc
+  condition_variable_debug.cc
+  config.cc
+  config_values.cc
+  dns_resolve.cc
+  dout.cc
+  entity_name.cc
+  environment.cc
+  errno.cc
+  escape.cc
+  fd.cc
+  fs_types.cc
+  hex.cc
+  histogram.cc
+  hobject.cc
+  hostname.cc
+  ipaddr.cc
+  iso_8601.cc
+  linux_version.c
+  lockdep.cc
+  mempool.cc
+  mime.c
+  mutex_debug.cc
+  numa.cc
+  options.cc
+  page.cc
+  perf_counters.cc
+  perf_counters_collection.cc
+  perf_histogram.cc
+  pick_address.cc
+  random_string.cc
+  reverse.c
+  run_cmd.cc
+  scrub_types.cc
+  shared_mutex_debug.cc
+  signal.cc
+  snap_types.cc
+  str_list.cc
+  str_map.cc
+  strtol.cc
+  types.cc
+  url_escape.cc
+  utf8.c
+  util.cc
+  version.cc
+  xattr.c)
+
+set_source_files_properties(${CMAKE_SOURCE_DIR}/src/common/version.cc
+  APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/include/ceph_ver.h)
+
+if(HAS_VTA)
+  set_source_files_properties(
+    config.cc
+    options.cc
+    PROPERTIES COMPILE_FLAGS -fno-var-tracking-assignments)
+endif()
+
+if(FREEBSD)
+  list(APPEND common_srcs freebsd_errno.cc)
+elseif(APPLE)
+  list(APPEND common_srcs darwin_errno.cc)
+elseif(SUN)
+  list(APPEND common_srcs solaris_errno.cc)
+elseif(AIX)
+  list(APPEND common_srcs aix_errno.cc)
+endif()
+
+if(WITH_LTTNG AND WITH_EVENTTRACE)
+  message(STATUS " Using EventTrace class.")
+  add_definitions("-DWITH_EVENTTRACE")
+  list(APPEND common_srcs EventTrace.cc)
+endif()
+
+add_library(common-common-objs OBJECT
+  ${common_srcs})
+# Install locations passed in as quoted string macros; they are needed
+# by options.cc.
+target_compile_definitions(common-common-objs PRIVATE
+  "CEPH_LIBDIR=\"${CMAKE_INSTALL_FULL_LIBDIR}\""
+  "CEPH_PKGLIBDIR=\"${CEPH_INSTALL_FULL_PKGLIBDIR}\""
+  "CEPH_DATADIR=\"${CEPH_INSTALL_DATADIR}\"")
+
+set(common_mountcephfs_srcs
+  armor.c
+  safe_io.c
+  module.c
+  addr_parsing.c)
+add_library(common_mountcephfs_objs OBJECT
+  ${common_mountcephfs_srcs})
+
+
+# Sources for the crc32 library: three files that are always built, plus
+# the files selected below by the detected CPU feature (at most one of
+# the HAVE_INTEL / HAVE_POWER8 / HAVE_ARMV8_CRC branches applies).
+set(crc32_srcs
+  crc32c.cc
+  crc32c_intel_baseline.c
+  sctp_crc32.c)
+
+if(HAVE_INTEL)
+  list(APPEND crc32_srcs
+    crc32c_intel_fast.c)
+  # the assembly files are only added when HAVE_GOOD_YASM_ELF64 is set
+  if(HAVE_GOOD_YASM_ELF64)
+    list(APPEND crc32_srcs
+      crc32c_intel_fast_asm.s
+      crc32c_intel_fast_zero_asm.s)
+  endif(HAVE_GOOD_YASM_ELF64)
+elseif(HAVE_POWER8)
+  list(APPEND crc32_srcs
+    crc32c_ppc.c)
+  # the assembly files are only added when HAVE_PPC64LE is set
+  if(HAVE_PPC64LE)
+    list(APPEND crc32_srcs
+      crc32c_ppc_asm.S
+      crc32c_ppc_fast_zero_asm.S)
+  endif(HAVE_PPC64LE)
+elseif(HAVE_ARMV8_CRC)
+  list(APPEND crc32_srcs
+    crc32c_aarch64.c)
+endif(HAVE_INTEL)
+
+add_library(crc32 ${crc32_srcs})
+if(HAVE_ARMV8_CRC)
+  set_target_properties(crc32 PROPERTIES
+    COMPILE_FLAGS "${CMAKE_C_FLAGS} ${ARMV8_CRC_COMPILE_FLAGS}")
+endif()
+target_link_libraries(crc32
+  arch)
+
+add_library(common_utf8 STATIC utf8.c)
+
+if(HAVE_KEYUTILS)
+  set(parse_secret_srcs
+    secret.c)
+  add_library(parse_secret_objs OBJECT ${parse_secret_srcs})
+endif()
diff --git a/src/common/Checksummer.h b/src/common/Checksummer.h
new file mode 100644
index 00000000..ceb551bc
--- /dev/null
+++ b/src/common/Checksummer.h
@@ -0,0 +1,271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_CHECKSUMMER
+#define CEPH_OS_BLUESTORE_CHECKSUMMER
+
+#include "xxHash/xxhash.h"
+#include "include/byteorder.h"
+
+class Checksummer {
+public:
+  enum CSumType {
+    CSUM_NONE = 1,	// intentionally 1 rather than 0, to align with OSDMonitor's pool_opts_t handling: it treats 0 as "unset", while we need to distinguish "none" from "unset"
+    CSUM_XXHASH32 = 2,
+    CSUM_XXHASH64 = 3,
+    CSUM_CRC32C = 4,
+    CSUM_CRC32C_16 = 5, // low 16 bits of crc32c
+    CSUM_CRC32C_8 = 6,  // low 8 bits of crc32c
+    CSUM_MAX,           // one past the last valid type
+  };
+  /// Name of checksum type @p t, or "???" when it is not a known type.
+  static const char *get_csum_type_string(unsigned t) {
+    if (t == CSUM_NONE)
+      return "none";
+    if (t == CSUM_XXHASH32)
+      return "xxhash32";
+    if (t == CSUM_XXHASH64)
+      return "xxhash64";
+    if (t == CSUM_CRC32C)
+      return "crc32c";
+    if (t == CSUM_CRC32C_16)
+      return "crc32c_16";
+    if (t == CSUM_CRC32C_8)
+      return "crc32c_8";
+    return "???";
+  }
+  /// Checksum type named by @p s, or -EINVAL when the name is unknown.
+  static int get_csum_string_type(const std::string &s) {
+    static const struct {
+      const char *name;
+      int type;
+    } known[] = {
+      {"none", CSUM_NONE},
+      {"xxhash32", CSUM_XXHASH32},
+      {"xxhash64", CSUM_XXHASH64},
+      {"crc32c", CSUM_CRC32C},
+      {"crc32c_16", CSUM_CRC32C_16},
+      {"crc32c_8", CSUM_CRC32C_8},
+    };
+    for (const auto& entry : known) {
+      if (s == entry.name)
+        return entry.type;
+    }
+    return -EINVAL;
+  }
+
+  /// Size in bytes of the initial value taken by the given algorithm;
+  /// 0 for CSUM_NONE and for unknown types.
+  static size_t get_csum_init_value_size(int csum_type) {
+    if (csum_type == CSUM_XXHASH32)
+      return sizeof(xxhash32::init_value_t);
+    if (csum_type == CSUM_XXHASH64)
+      return sizeof(xxhash64::init_value_t);
+    if (csum_type == CSUM_CRC32C)
+      return sizeof(crc32c::init_value_t);
+    if (csum_type == CSUM_CRC32C_16)
+      return sizeof(crc32c_16::init_value_t);
+    if (csum_type == CSUM_CRC32C_8)
+      return sizeof(crc32c_8::init_value_t);
+    return 0;
+  }
+  /// Size in bytes of one stored checksum value for the given algorithm;
+  /// 0 for CSUM_NONE and for unknown types.
+  static size_t get_csum_value_size(int csum_type) {
+    if (csum_type == CSUM_XXHASH64)
+      return 8;
+    if (csum_type == CSUM_XXHASH32 || csum_type == CSUM_CRC32C)
+      return 4;
+    if (csum_type == CSUM_CRC32C_16)
+      return 2;
+    if (csum_type == CSUM_CRC32C_8)
+      return 1;
+    return 0;
+  }
+
+  /// Full 32-bit crc32c of each block.
+  struct crc32c {
+    using init_value_t = uint32_t;
+    using value_t = ceph_le32;
+
+    // no execution context is needed: state_t is a placeholder and
+    // init()/fini() do nothing
+    using state_t = int;
+    static void init(state_t *) {}
+    static void fini(state_t *) {}
+
+    /// crc32c over the next @p len bytes at @p p, seeded with @p init_value.
+    static init_value_t calc(state_t, init_value_t init_value, size_t len,
+                             bufferlist::const_iterator& p) {
+      const init_value_t crc = p.crc32c(len, init_value);
+      return crc;
+    }
+  };
+
+  /// crc32c truncated to its low 16 bits.
+  struct crc32c_16 {
+    using init_value_t = uint32_t;
+    using value_t = ceph_le16;
+
+    // no execution context is needed: state_t is a placeholder and
+    // init()/fini() do nothing
+    using state_t = int;
+    static void init(state_t *) {}
+    static void fini(state_t *) {}
+
+    /// Low 16 bits of the crc32c over the next @p len bytes at @p p.
+    static init_value_t calc(state_t, init_value_t init_value, size_t len,
+                             bufferlist::const_iterator& p) {
+      const auto full = p.crc32c(len, init_value);
+      return full & 0xffff;
+    }
+  };
+
+  /// crc32c truncated to its low 8 bits.
+  struct crc32c_8 {
+    using init_value_t = uint32_t;
+    using value_t = __u8;
+
+    // no execution context is needed: state_t is a placeholder and
+    // init()/fini() do nothing
+    using state_t = int;
+    static void init(state_t *) {}
+    static void fini(state_t *) {}
+
+    /// Low 8 bits of the crc32c over the next @p len bytes at @p p.
+    static init_value_t calc(state_t, init_value_t init_value, size_t len,
+                             bufferlist::const_iterator& p) {
+      const auto full = p.crc32c(len, init_value);
+      return full & 0xff;
+    }
+  };
+
+  /// 32-bit xxHash of each block.
+  struct xxhash32 {
+    using init_value_t = uint32_t;
+    using value_t = ceph_le32;
+
+    // state object obtained from XXH32_createState() in init() and
+    // released with XXH32_freeState() in fini()
+    using state_t = XXH32_state_t *;
+    static void init(state_t *s) { *s = XXH32_createState(); }
+    static void fini(state_t *s) { XXH32_freeState(*s); }
+
+    /// Hash the next @p len bytes at @p p, seeded with @p init_value,
+    /// feeding the data to xxHash one contiguous chunk at a time.
+    static init_value_t calc(state_t state, init_value_t init_value,
+                             size_t len, bufferlist::const_iterator& p) {
+      XXH32_reset(state, init_value);
+      for (size_t remaining = len; remaining > 0; ) {
+        const char *chunk;
+        const size_t got = p.get_ptr_and_advance(remaining, &chunk);
+        XXH32_update(state, chunk, got);
+        remaining -= got;
+      }
+      return XXH32_digest(state);
+    }
+  };
+
+  /// 64-bit xxHash of each block.
+  struct xxhash64 {
+    using init_value_t = uint64_t;
+    using value_t = ceph_le64;
+
+    // state object obtained from XXH64_createState() in init() and
+    // released with XXH64_freeState() in fini()
+    using state_t = XXH64_state_t *;
+    static void init(state_t *s) { *s = XXH64_createState(); }
+    static void fini(state_t *s) { XXH64_freeState(*s); }
+
+    /// Hash the next @p len bytes at @p p, seeded with @p init_value,
+    /// feeding the data to xxHash one contiguous chunk at a time.
+    static init_value_t calc(state_t state, init_value_t init_value,
+                             size_t len, bufferlist::const_iterator& p) {
+      XXH64_reset(state, init_value);
+      for (size_t remaining = len; remaining > 0; ) {
+        const char *chunk;
+        const size_t got = p.get_ptr_and_advance(remaining, &chunk);
+        XXH64_update(state, chunk, got);
+        remaining -= got;
+      }
+      return XXH64_digest(state);
+    }
+  };
+
+  /// Convenience overload of calculate() that seeds every block with an
+  /// initial value of -1 (converted to the algorithm's init_value_t).
+  template<class Alg>
+  static int calculate(size_t csum_block_size, size_t offset, size_t length,
+                       const bufferlist &bl, bufferptr* csum_data)
+  {
+    const int r =
+      calculate<Alg>(-1, csum_block_size, offset, length, bl, csum_data);
+    return r;
+  }
+
+  /**
+   * Checksum the first @p length bytes of @p bl in blocks of
+   * @p csum_block_size and store one value per block into @p csum_data.
+   *
+   * @param init_value      seed passed to the algorithm for every block
+   * @param csum_block_size bytes covered by one checksum value
+   * @param offset          byte offset of this data within the checksummed
+   *                        extent; selects the first slot written
+   *                        (offset / csum_block_size)
+   * @param length          bytes to checksum; must be a multiple of
+   *                        csum_block_size and no more than bl.length()
+   * @param bl              data, read from its beginning
+   * @param csum_data       destination array of Alg::value_t, which must be
+   *                        large enough for all slots up to offset + length
+   * @return 0
+   */
+  template<class Alg>
+  static int calculate(
+      typename Alg::init_value_t init_value,
+      size_t csum_block_size,
+      size_t offset,
+      size_t length,
+      const bufferlist &bl,
+      bufferptr* csum_data) {
+    ceph_assert(length % csum_block_size == 0);
+    size_t blocks = length / csum_block_size;
+    bufferlist::const_iterator p = bl.begin();
+    ceph_assert(bl.length() >= length);
+
+    // Value-initialize: the crc32c family has a no-op init(), and the
+    // state is passed to calc() by value, so it must not be indeterminate.
+    typename Alg::state_t state{};
+    Alg::init(&state);
+
+    ceph_assert(csum_data->length() >= (offset + length) / csum_block_size *
+	   sizeof(typename Alg::value_t));
+
+    typename Alg::value_t *pv =
+      reinterpret_cast<typename Alg::value_t*>(csum_data->c_str());
+    pv += offset / csum_block_size;
+    while (blocks--) {
+      // calc() advances p by one block
+      *pv = Alg::calc(state, init_value, csum_block_size, p);
+      ++pv;
+    }
+    Alg::fini(&state);
+    return 0;
+  }
+
+  /**
+   * Recompute the per-block checksums of the first @p length bytes of
+   * @p bl (seeded with -1) and compare them with the values stored in
+   * @p csum_data.
+   *
+   * @param csum_block_size bytes covered by one checksum value
+   * @param offset          byte offset of this data within the checksummed
+   *                        extent; selects the first slot compared
+   *                        (offset / csum_block_size)
+   * @param length          bytes to verify; must be a multiple of
+   *                        csum_block_size and no more than bl.length()
+   * @param bl              data, read from its beginning
+   * @param csum_data       array of Alg::value_t holding the expected values
+   * @param bad_csum        if non-null, receives the computed checksum of
+   *                        the first mismatching block
+   * @return -1 when every block matches, otherwise the byte position
+   *         (starting from @p offset) of the first mismatching block
+   */
+  template<class Alg>
+  static int verify(
+    size_t csum_block_size,
+    size_t offset,
+    size_t length,
+    const bufferlist &bl,
+    const bufferptr& csum_data,
+    uint64_t *bad_csum=0
+    ) {
+    ceph_assert(length % csum_block_size == 0);
+    bufferlist::const_iterator p = bl.begin();
+    ceph_assert(bl.length() >= length);
+
+    // Value-initialize: the crc32c family has a no-op init(), and the
+    // state is passed to calc() by value, so it must not be indeterminate.
+    typename Alg::state_t state{};
+    Alg::init(&state);
+
+    const typename Alg::value_t *pv =
+      reinterpret_cast<const typename Alg::value_t*>(csum_data.c_str());
+    pv += offset / csum_block_size;
+    size_t pos = offset;
+    while (length > 0) {
+      // calc() advances p by one block
+      typename Alg::init_value_t v = Alg::calc(state, -1, csum_block_size, p);
+      if (*pv != v) {
+	if (bad_csum) {
+	  *bad_csum = v;
+	}
+	Alg::fini(&state);
+	return pos;
+      }
+      ++pv;
+      pos += csum_block_size;
+      length -= csum_block_size;
+    }
+    Alg::fini(&state);
+    return -1;  // no errors
+  }
+};
+
+#endif
diff --git a/src/common/Clock.h b/src/common/Clock.h
new file mode 100644
index 00000000..b47954ad
--- /dev/null
+++ b/src/common/Clock.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CLOCK_H
+#define CEPH_CLOCK_H
+
+#include "include/utime.h"
+
+#include <time.h>
+
+// Current wall-clock time as a utime_t: CLOCK_REALTIME on Linux,
+// gettimeofday() elsewhere.
+static inline utime_t ceph_clock_now()
+{
+#if defined(__linux__)
+  struct timespec ts;
+  clock_gettime(CLOCK_REALTIME, &ts);
+  return utime_t(ts);
+#else
+  struct timeval now;
+  gettimeofday(&now, nullptr);
+  return utime_t(&now);
+#endif
+}
+
+#endif
diff --git a/src/common/CommandTable.h b/src/common/CommandTable.h
new file mode 100644
index 00000000..3fea9a03
--- /dev/null
+++ b/src/common/CommandTable.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef COMMAND_TABLE_H_
+#define COMMAND_TABLE_H_
+
+#include "messages/MCommand.h"
+
+// State of one command: the connection, its transaction id, the
+// command words and input data, plus where to deliver the outcome.
+class CommandOp
+{
+  public:
+  ConnectionRef con;
+  ceph_tid_t tid = 0;
+
+  std::vector<std::string> cmd;
+  bufferlist    inbl;
+  Context      *on_finish = nullptr;
+  bufferlist   *outbl = nullptr;
+  std::string  *outs = nullptr;
+
+  // Build the MCommand carrying this op's command words, input data
+  // and tid.
+  MCommand::ref get_message(const uuid_d &fsid) const
+  {
+    auto msg = MCommand::create(fsid);
+    msg->cmd = cmd;
+    msg->set_data(inbl);
+    msg->set_tid(tid);
+    return msg;
+  }
+
+  CommandOp(const ceph_tid_t t) : tid(t) {}
+  CommandOp() {}
+};
+
+/**
+ * Client-side bookkeeping for the commands currently in flight to a
+ * remote service, keyed by transaction id.  The table must be empty
+ * when it is destroyed.
+ */
+template<typename T>
+class CommandTable
+{
+protected:
+  ceph_tid_t last_tid = 0;
+  std::map<ceph_tid_t, T> commands;
+
+public:
+
+  CommandTable() {}
+
+  ~CommandTable()
+  {
+    ceph_assert(commands.empty());
+  }
+
+  // Allocate the next tid, create its entry and return a reference to it.
+  T& start_command()
+  {
+    const ceph_tid_t tid = last_tid++;
+    auto result = commands.insert(std::make_pair(tid, T(tid)));
+    return result.first->second;
+  }
+
+  const std::map<ceph_tid_t, T> &get_commands() const
+  {
+    return commands;
+  }
+
+  bool exists(ceph_tid_t tid) const
+  {
+    return commands.find(tid) != commands.end();
+  }
+
+  // Look up an entry; std::map::at() throws if the tid is unknown.
+  T& get_command(ceph_tid_t tid)
+  {
+    return commands.at(tid);
+  }
+
+  void erase(ceph_tid_t tid)
+  {
+    commands.erase(tid);
+  }
+
+  void clear()
+  {
+    commands.clear();
+  }
+};
+
+#endif
+
diff --git a/src/common/Cond.h b/src/common/Cond.h
new file mode 100644
index 00000000..77b3882a
--- /dev/null
+++ b/src/common/Cond.h
@@ -0,0 +1,213 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_COND_H
+#define CEPH_COND_H
+
+#include "common/Clock.h"
+#include "include/Context.h"
+
+/**
+ * Thin wrapper around a pthread condition variable that is meant to be
+ * used with a single Mutex.  The first wait records the mutex; later
+ * waits assert that the same mutex is passed again.
+ */
+class Cond {
+  // my bits
+  pthread_cond_t _c;
+
+  // mutex recorded by the first wait; NULL until then
+  Mutex *waiter_mutex;
+
+  // don't allow copying.
+  void operator=(Cond &C);
+  Cond(const Cond &C);
+
+  /// Checks shared by every wait variant: the cond is paired with one
+  /// mutex only, and that mutex is locked on entry.
+  void _check_waiter(Mutex &mutex) {
+    // make sure this cond is used with one mutex only
+    ceph_assert(waiter_mutex == NULL || waiter_mutex == &mutex);
+    waiter_mutex = &mutex;
+
+    ceph_assert(mutex.is_locked());
+  }
+
+ public:
+  Cond() : waiter_mutex(NULL) {
+    int r = pthread_cond_init(&_c,NULL);
+    ceph_assert(r == 0);
+  }
+  virtual ~Cond() { 
+    pthread_cond_destroy(&_c); 
+  }
+
+  /// Block on the cond; returns the pthread_cond_wait() result.
+  int Wait(Mutex &mutex)  { 
+    _check_waiter(mutex);
+
+    mutex._pre_unlock();
+    int r = pthread_cond_wait(&_c, &mutex._m);
+    mutex._post_lock();
+    return r;
+  }
+
+  /// Block until signalled or until the absolute time @p when; returns
+  /// the pthread_cond_timedwait() result.
+  int WaitUntil(Mutex &mutex, utime_t when) {
+    _check_waiter(mutex);
+
+    struct timespec ts;
+    when.to_timespec(&ts);
+
+    mutex._pre_unlock();
+    int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
+    mutex._post_lock();
+
+    return r;
+  }
+
+  /// Block until signalled or until @p interval from now has elapsed.
+  int WaitInterval(Mutex &mutex, utime_t interval) {
+    utime_t when = ceph_clock_now();
+    when += interval;
+    return WaitUntil(mutex, when);
+  }
+
+  /// Same as above for a chrono-style duration.  Performs the same
+  /// mutex checks as the other wait variants.
+  template<typename Duration>
+  int WaitInterval(Mutex &mutex, Duration interval) {
+    _check_waiter(mutex);
+
+    ceph::real_time when(ceph::real_clock::now());
+    when += interval;
+
+    struct timespec ts = ceph::real_clock::to_timespec(when);
+
+    mutex._pre_unlock();
+    int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
+    mutex._post_lock();
+
+    return r;
+  }
+
+  /// Broadcast without checking the state of the waiter's mutex.
+  int SloppySignal() { 
+    int r = pthread_cond_broadcast(&_c);
+    return r;
+  }
+  /// Broadcast to all waiters (same behaviour as SignalAll()).
+  int Signal() { 
+    // make sure signaler is holding the waiter's lock.
+    ceph_assert(waiter_mutex == NULL ||
+	   waiter_mutex->is_locked());
+
+    int r = pthread_cond_broadcast(&_c);
+    return r;
+  }
+  /// Wake a single waiter.
+  int SignalOne() { 
+    // make sure signaler is holding the waiter's lock.
+    ceph_assert(waiter_mutex == NULL ||
+	   waiter_mutex->is_locked());
+
+    int r = pthread_cond_signal(&_c);
+    return r;
+  }
+  /// Broadcast to all waiters.
+  int SignalAll() { 
+    // make sure signaler is holding the waiter's lock.
+    ceph_assert(waiter_mutex == NULL ||
+	   waiter_mutex->is_locked());
+
+    int r = pthread_cond_broadcast(&_c);
+    return r;
+  }
+};
+
+/**
+ * Context that signals a Cond.
+ *
+ * On finish() it sets the caller's done flag, stores the return value
+ * and signals the cond.  No lock is taken here; we assume the caller is
+ * holding the appropriate lock.
+ */
+class C_Cond : public Context {
+public:
+  C_Cond(Cond *c, bool *d, int *r)
+    : cond(c), done(d), rval(r)
+  {
+    *done = false;
+  }
+
+  void finish(int r) override
+  {
+    *done = true;
+    *rval = r;
+    cond->Signal();
+  }
+
+private:
+  Cond *cond;   ///< Cond to signal
+  bool *done;   ///< set to true by finish()
+  int *rval;    ///< receives the value passed to finish()
+};
+
+/**
+ * Context that signals a Cond while holding a given lock.
+ *
+ * finish() acquires the lock itself, so whoever completes this context
+ * must not already hold it.
+ */
+class C_SafeCond : public Context {
+public:
+  C_SafeCond(Mutex *l, Cond *c, bool *d, int *r = nullptr)
+    : lock(l), cond(c), done(d), rval(r)
+  {
+    *done = false;
+  }
+
+  void finish(int r) override
+  {
+    // held for the whole update and released on scope exit
+    std::lock_guard guard(*lock);
+    if (rval != nullptr) {
+      *rval = r;
+    }
+    *done = true;
+    cond->Signal();
+  }
+
+private:
+  Mutex *lock;    ///< Mutex to take
+  Cond *cond;     ///< Cond to signal
+  bool *done;     ///< set to true by finish()
+  int *rval;      ///< receives the finish() value (optional, may be null)
+};
+
+/**
+ * Context providing a simple wait() mechanism to wait for completion
+ *
+ * The context will not be deleted as part of complete and must live
+ * until wait() returns.
+ */
+class C_SaferCond : public Context {
+  Mutex lock;    ///< Mutex to take
+  Cond cond;     ///< Cond to signal
+  bool done;     ///< true after finish() has been called
+  int rval;      ///< return value
+public:
+  C_SaferCond() : lock("C_SaferCond"), done(false), rval(0) {}
+  explicit C_SaferCond(const std::string &name) : lock(name), done(false), rval(0) {}
+  void finish(int r) override { complete(r); }
+
+  /// We overload complete in order to not delete the context
+  void complete(int r) override {
+    std::lock_guard l(lock);
+    done = true;
+    rval = r;
+    cond.Signal();
+  }
+
+  /// Returns rval once the Context is called
+  int wait() {
+    std::lock_guard l(lock);
+    while (!done)
+      cond.Wait(lock);
+    return rval;
+  }
+
+  /// Wait until \c secs seconds have elapsed or \c complete() is called.
+  /// Returns rval when completed, ETIMEDOUT otherwise.
+  int wait_for(double secs) {
+    utime_t interval;
+    interval.set_from_double(secs);
+    std::lock_guard l{lock};
+    utime_t deadline = ceph_clock_now();
+    deadline += interval;
+    // Re-check done after every wakeup against one fixed deadline, so a
+    // wakeup that was not caused by complete() neither ends the wait
+    // early nor restarts the full interval.
+    while (!done) {
+      if (cond.WaitUntil(lock, deadline) != 0)
+	break;  // timed out, or the wait itself failed
+    }
+    return done ? rval : ETIMEDOUT;
+  }
+};
+
+#endif
diff --git a/src/common/ConfUtils.cc b/src/common/ConfUtils.cc
new file mode 100644
index 00000000..7c8f1cc7
--- /dev/null
+++ b/src/common/ConfUtils.cc
@@ -0,0 +1,594 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <map>
+#include <sstream>
+#include <sys/stat.h>
+#include <iostream>
+
+#include "include/buffer.h"
+#include "common/errno.h"
+#include "common/utf8.h"
+#include "common/ConfUtils.h"
+
+using std::ostringstream;
+using std::pair;
+using std::string;
+
+#define MAX_CONFIG_FILE_SZ 0x40000000
+
+////////////////////////////// ConfLine //////////////////////////////
+ConfLine::ConfLine(const std::string &key_, const std::string &val_,
+		   const std::string &newsection_,
+		   const std::string &comment_, int line_no_)
+  : key(key_),
+    val(val_),
+    newsection(newsection_)
+{
+  // comment_ and line_no_ are accepted but not stored.  They would have to
+  // be kept here if writable ConfFile support were ever implemented.
+}
+
+bool ConfLine::operator<(const ConfLine &rhs) const
+{
+  // Ordering looks at the key alone, so two lines with the same key are
+  // equivalent inside a section's set; the loader keeps the last one seen.
+  return key < rhs.key;
+}
+
+// Debug representation of a ConfLine: key, value and new-section name.
+std::ostream &operator<<(std::ostream& oss, const ConfLine &l)
+{
+  oss << "ConfLine(key = '" << l.key;
+  oss << "', val='" << l.val;
+  oss << "', newsection='" << l.newsection << "')";
+  return oss;
+}
+///////////////////////// ConfFile //////////////////////////
+// Construction and destruction need no work beyond the members' own.
+ConfFile::ConfFile() = default;
+
+ConfFile::~ConfFile() = default;
+
+// Forget every section (and with it every key/value) parsed so far.
+void ConfFile::clear()
+{
+  sections.clear();
+}
+
+/* We load the whole file into memory and then parse it.  Although this is not
+ * the optimal approach, it does mean that most of this code can be shared with
+ * the bufferlist loading function. Since bufferlists are always in-memory, the
+ * load_from_buffer interface works well for them.
+ * In general, configuration files should be a few kilobytes at maximum, so
+ * loading the whole configuration into memory shouldn't be a problem.
+ *
+ * Any previously parsed content is discarded first.  Returns 0 once the file
+ * has been read and handed to load_from_buffer() (parse problems are reported
+ * through *errors only), or a negative errno value if the file could not be
+ * opened, stat'ed, allocated for or read; files larger than
+ * MAX_CONFIG_FILE_SZ yield -EINVAL.
+ */
+int ConfFile::
+parse_file(const std::string &fname, std::deque<std::string> *errors,
+	   std::ostream *warnings)
+{
+  clear();
+
+  int ret = 0;
+  size_t sz;
+  char *buf = NULL;
+  FILE *fp = fopen(fname.c_str(), "r");
+  if (!fp) {
+    // Capture errno before formatting: the stream and container operations
+    // below are free to overwrite it.
+    const int err = errno;
+    ostringstream oss;
+    oss << __func__ << ": cannot open " << fname << ": " << cpp_strerror(err);
+    errors->push_back(oss.str());
+    return -err;
+  }
+
+  struct stat st_buf;
+  if (fstat(fileno(fp), &st_buf)) {
+    ret = -errno;
+    ostringstream oss;
+    oss << __func__ << ": failed to fstat '" << fname << "': " << cpp_strerror(ret);
+    errors->push_back(oss.str());
+    goto done;
+  }
+
+  if (st_buf.st_size > MAX_CONFIG_FILE_SZ) {
+    ostringstream oss;
+    oss << __func__ << ": config file '" << fname << "' is " << st_buf.st_size
+	<< " bytes, but the maximum is " << MAX_CONFIG_FILE_SZ;
+    errors->push_back(oss.str());
+    ret = -EINVAL;
+    goto done;
+  }
+
+  sz = (size_t)st_buf.st_size;
+  // malloc(0) is allowed to return NULL; always request at least one byte so
+  // that an empty file is not mistaken for an allocation failure.
+  buf = (char*)malloc(sz > 0 ? sz : 1);
+  if (!buf) {
+    ostringstream oss;
+    oss << __func__ << ": failed to allocate " << sz << " bytes to read '"
+	<< fname << "'";
+    errors->push_back(oss.str());
+    ret = -ENOMEM;
+    goto done;
+  }
+
+  if (fread(buf, 1, sz, fp) != sz) {
+    if (ferror(fp)) {
+      ret = -errno;
+      ostringstream oss;
+      oss << __func__ << ": fread error while reading '" << fname << "': "
+	  << cpp_strerror(ret);
+      errors->push_back(oss.str());
+      goto done;
+    }
+    else {
+      ostringstream oss;
+      oss << __func__ << ": unexpected EOF while reading '" << fname << "': "
+	  << "possible concurrent modification?";
+      errors->push_back(oss.str());
+      ret = -EIO;
+      goto done;
+    }
+  }
+
+  load_from_buffer(buf, sz, errors, warnings);
+  ret = 0;
+
+done:
+  // free(NULL) is a no-op, so every failure path can share this cleanup.
+  free(buf);
+  fclose(fp);
+  return ret;
+}
+
+// Discard any earlier content, then parse the bytes held by *bl.  Always
+// returns 0; parse problems are reported through *errors.
+int ConfFile::
+parse_bufferlist(ceph::bufferlist *bl, std::deque<std::string> *errors,
+		 std::ostream *warnings)
+{
+  clear();
+
+  const char *data = bl->c_str();
+  const size_t len = bl->length();
+  load_from_buffer(data, len, errors, warnings);
+
+  return 0;
+}
+
+// Look up @key (after key-name normalization) in @section.  On success the
+// value is copied into @val and 0 is returned; -ENOENT is returned when
+// either the section or the key is missing, and @val is left untouched.
+int ConfFile::
+read(const std::string &section, const std::string &key, std::string &val) const
+{
+  const string normalized(normalize_key_name(key));
+
+  const auto sec = sections.find(section);
+  if (sec == sections.end()) {
+    return -ENOENT;
+  }
+
+  // Lines compare by key only, so a probe carrying just the key suffices.
+  const auto &lines = sec->second.lines;
+  const auto found = lines.find(ConfLine(normalized, "", "", "", 0));
+  if (found == lines.end()) {
+    return -ENOENT;
+  }
+
+  val = found->val;
+  return 0;
+}
+
+// Read-only iteration over the parsed sections, in map (name) order.
+ConfFile::const_section_iter_t ConfFile::sections_begin() const
+{
+  return sections.cbegin();
+}
+
+ConfFile::const_section_iter_t ConfFile::sections_end() const
+{
+  return sections.cend();
+}
+
+// Strip leading and trailing whitespace from @str in place.  When
+// @strip_internal is set, each run of whitespace inside the string is also
+// collapsed to its first character.
+//
+// Only the C-string content of @str is considered: anything following an
+// embedded NUL is dropped.
+void ConfFile::
+trim_whitespace(std::string &str, bool strip_internal)
+{
+  // isspace() is only defined for values representable as unsigned char;
+  // plain char may be negative for non-ASCII (e.g. UTF-8) bytes.
+  const auto is_ws = [](char c) {
+    return isspace(static_cast<unsigned char>(c)) != 0;
+  };
+
+  const char *first = str.c_str();
+  const char *last = first + strlen(first);
+
+  // strip preceding
+  while (first != last && is_ws(*first))
+    ++first;
+
+  // strip trailing
+  while (last != first && is_ws(last[-1]))
+    --last;
+
+  // copy what is left, optionally collapsing internal whitespace runs
+  std::string result;
+  result.reserve(last - first);
+  bool prev_was_space = false;
+  for (const char *p = first; p != last; ++p) {
+    const bool space = is_ws(*p);
+    if (!(strip_internal && space && prev_was_space))
+      result += *p;
+    prev_was_space = space;
+  }
+  str.swap(result);
+}
+
+/* Normalize a key name.
+ *
+ * Keys that contain none of the characters checked below are returned
+ * unchanged.  Otherwise the key is trimmed, internal whitespace runs are
+ * collapsed to one character, and every ' ' is replaced by '_'.  The main
+ * reason for selecting this normal form is so that in common/config.cc, we
+ * can use a macro to stringify the field names of md_config_t and get a key
+ * in normal form.
+ */
+std::string ConfFile::
+normalize_key_name(const std::string &key)
+{
+  const bool needs_work =
+    key.find_first_of(" \t\r\n\f\v\xa0") != string::npos;
+  if (!needs_work) {
+    return key;
+  }
+
+  string normalized(key);
+  ConfFile::trim_whitespace(normalized, true);
+  std::replace(normalized.begin(), normalized.end(), ' ', '_');
+  return normalized;
+}
+
+// Dump every section as "[name]" followed by one tab-indented
+// key = "value" line per entry; entries with an empty key are skipped.
+std::ostream &operator<<(std::ostream &oss, const ConfFile &cf)
+{
+  for (auto sec = cf.sections_begin(); sec != cf.sections_end(); ++sec) {
+    oss << "[" << sec->first << "]\n";
+    for (const ConfLine &line : sec->second.lines) {
+      if (line.key.empty())
+	continue;
+      oss << "\t" << line.key << " = \"" << line.val << "\"\n";
+    }
+  }
+  return oss;
+}
+
+// Parse @sz bytes at @buf as newline-terminated config lines and fill
+// `sections`.  *errors is cleared first and then receives one message per
+// rejected line; *warnings (may be NULL) receives a note whenever a key is
+// redefined within a section.  Lines ending in a backslash are joined with
+// the following line before being handed to process_line().
+void ConfFile::
+load_from_buffer(const char *buf, size_t sz, std::deque<std::string> *errors,
+		 std::ostream *warnings)
+{
+  errors->clear();
+
+  // Lines seen before any [section] header go into "global".  The assert
+  // requires that no "global" section exists yet when we are called.
+  section_iter_t::value_type vt("global", ConfSection());
+  pair < section_iter_t, bool > vr(sections.insert(vt));
+  ceph_assert(vr.second);
+  section_iter_t cur_section = vr.first;
+  // text accumulated across backslash-continued lines
+  std::string acc;
+
+  const char *b = buf;
+  int line_no = 0;
+  // Starts at (size_t)-1 so that the first "line_len + 1" below is 0 and the
+  // cursor does not move before the first line.
+  size_t line_len = -1;
+  size_t rem = sz;
+  while (1) {
+    // step over the previous line and its newline
+    b += line_len + 1;
+    if ((line_len + 1) > rem)
+      break;
+    rem -= line_len + 1;
+    if (rem == 0)
+      break;
+    line_no++;
+
+    // look for the next newline
+    const char *end = (const char*)memchr(b, '\n', rem);
+    if (!end) {
+      ostringstream oss;
+      oss << "read_conf: ignoring line " << line_no << " because it doesn't "
+	  << "end with a newline! Please end the config file with a newline.";
+      errors->push_back(oss.str());
+      break;
+    }
+
+    // find length of line, and search for NULLs
+    line_len = 0;
+    bool found_null = false;
+    for (const char *tmp = b; tmp != end; ++tmp) {
+      line_len++;
+      if (*tmp == '\0') {
+	found_null = true;
+      }
+    }
+
+    if (found_null) {
+      ostringstream oss;
+      oss << "read_conf: ignoring line " << line_no << " because it has "
+	  << "an embedded null.";
+      errors->push_back(oss.str());
+      // a rejected line also discards any pending continuation text
+      acc.clear();
+      continue;
+    }
+
+    // a non-zero result from check_utf8() is treated as invalid input
+    if (check_utf8(b, line_len)) {
+      ostringstream oss;
+      oss << "read_conf: ignoring line " << line_no << " because it is not "
+	  << "valid UTF8.";
+      errors->push_back(oss.str());
+      acc.clear();
+      continue;
+    }
+
+    if ((line_len >= 1) && (b[line_len-1] == '\\')) {
+      // A backslash at the end of a line serves as a line continuation marker.
+      // Combine the next line with this one.
+      // Remove the backslash itself from the text.
+      acc.append(b, line_len - 1);
+      continue;
+    }
+
+    acc.append(b, line_len);
+
+    //cerr << "acc = '" << acc << "'" << std::endl;
+    // process_line() returns a heap-allocated ConfLine (deleted below), or
+    // NULL for lines that produce nothing.
+    ConfLine *cline = process_line(line_no, acc.c_str(), errors);
+    acc.clear();
+    if (!cline)
+      continue;
+    const std::string &csection(cline->newsection);
+    if (!csection.empty()) {
+      // section header: switch to that section, creating it if it is new
+      std::map <std::string, ConfSection>::value_type nt(csection, ConfSection());
+      pair < section_iter_t, bool > nr(sections.insert(nt));
+      cur_section = nr.first;
+    }
+    else {
+      if (cur_section->second.lines.count(*cline)) {
+	// replace an existing key/line in this section, so that
+	//  [mysection]
+	//    foo = 1
+	//    foo = 2
+	// will result in foo = 2.
+	cur_section->second.lines.erase(*cline);
+	if (cline->key.length() && warnings)
+	  *warnings << "warning: line " << line_no << ": '" << cline->key << "' in section '"
+		    << cur_section->first << "' redefined " << std::endl;
+      }
+      // add line to current section
+      //std::cerr << "cur_section = " << cur_section->first << ", " << *cline << std::endl;
+      cur_section->second.lines.insert(*cline);
+    }
+    delete cline;
+  }
+
+  // leftover text means the input ended in the middle of a continuation
+  if (!acc.empty()) {
+    ostringstream oss;
+    oss << "read_conf: don't end with lines that end in backslashes!";
+    errors->push_back(oss.str());
+  }
+}
+
+/*
+ * A simple state-machine based parser.
+ * This probably could/should be rewritten with something like boost::spirit
+ * or yacc if the grammar ever gets more complex.
+ *
+ * Parses one NUL-terminated logical line.  Returns a newly allocated
+ * ConfLine (owned by the caller) for a section header or a key/value line,
+ * or NULL for a blank line, a comment-only line, or a malformed line; in
+ * the malformed case a message is appended to *errors.
+ *
+ * Characters are passed to isspace() as unsigned char: the input may hold
+ * multi-byte UTF-8 sequences, whose bytes are negative as plain char on
+ * many platforms, and isspace() is undefined for such values.
+ */
+ConfLine* ConfFile::
+process_line(int line_no, const char *line, std::deque<std::string> *errors)
+{
+  enum acceptor_state_t {
+    ACCEPT_INIT,
+    ACCEPT_SECTION_NAME,
+    ACCEPT_KEY,
+    ACCEPT_VAL_START,
+    ACCEPT_UNQUOTED_VAL,
+    ACCEPT_QUOTED_VAL,
+    ACCEPT_COMMENT_START,
+    ACCEPT_COMMENT_TEXT,
+  };
+  const char *l = line;
+  acceptor_state_t state = ACCEPT_INIT;
+  string key, val, newsection, comment;
+  // true right after an unescaped backslash; the next character is then
+  // taken literally
+  bool escaping = false;
+  while (true) {
+    char c = *l++;
+    switch (state) {
+      case ACCEPT_INIT:
+	if (c == '\0')
+	  return NULL; // blank line. Not an error, but not interesting either.
+	else if (c == '[')
+	  state = ACCEPT_SECTION_NAME;
+	else if ((c == '#') || (c == ';'))
+	  state = ACCEPT_COMMENT_TEXT;
+	else if (c == ']') {
+	  ostringstream oss;
+	  oss << "unexpected right bracket at char " << (l - line)
+	      << ", line " << line_no;
+	  errors->push_back(oss.str());
+	  return NULL;
+	}
+	else if (isspace(static_cast<unsigned char>(c))) {
+	  // ignore whitespace here
+	}
+	else {
+	  // try to accept this character as a key
+	  state = ACCEPT_KEY;
+	  --l;
+	}
+	break;
+      case ACCEPT_SECTION_NAME:
+	if (c == '\0') {
+	  ostringstream oss;
+	  oss << "error parsing new section name: expected right bracket "
+	      << "at char " << (l - line) << ", line " << line_no;
+	  errors->push_back(oss.str());
+	  return NULL;
+	}
+	else if ((c == ']') && (!escaping)) {
+	  trim_whitespace(newsection, true);
+	  if (newsection.empty()) {
+	    ostringstream oss;
+	    oss << "error parsing new section name: no section name found? "
+	        << "at char " << (l - line) << ", line " << line_no;
+	    errors->push_back(oss.str());
+	    return NULL;
+	  }
+	  state = ACCEPT_COMMENT_START;
+	}
+	else if (((c == '#') || (c == ';')) && (!escaping)) {
+	  ostringstream oss;
+	  oss << "unexpected comment marker while parsing new section name, at "
+	      << "char " << (l - line) << ", line " << line_no;
+	  errors->push_back(oss.str());
+	  return NULL;
+	}
+	else if ((c == '\\') && (!escaping)) {
+	  escaping = true;
+	}
+	else {
+	  escaping = false;
+	  newsection += c;
+	}
+	break;
+      case ACCEPT_KEY:
+	if ((((c == '#') || (c == ';')) && (!escaping)) || (c == '\0')) {
+	  ostringstream oss;
+	  if (c == '\0') {
+	    oss << "end of key=val line " << line_no
+	        << " reached, no \"=val\" found...missing =?";
+	  } else {
+	    oss << "unexpected character while parsing putative key value, "
+		<< "at char " << (l - line) << ", line " << line_no;
+	  }
+	  errors->push_back(oss.str());
+	  return NULL;
+	}
+	else if ((c == '=') && (!escaping)) {
+	  key = normalize_key_name(key);
+	  if (key.empty()) {
+	    ostringstream oss;
+	    oss << "error parsing key name: no key name found? "
+	        << "at char " << (l - line) << ", line " << line_no;
+	    errors->push_back(oss.str());
+	    return NULL;
+	  }
+	  state = ACCEPT_VAL_START;
+	}
+	else if ((c == '\\') && (!escaping)) {
+	  escaping = true;
+	}
+	else {
+	  escaping = false;
+	  key += c;
+	}
+	break;
+      case ACCEPT_VAL_START:
+	if (c == '\0')
+	  return new ConfLine(key, val, newsection, comment, line_no);
+	else if ((c == '#') || (c == ';'))
+	  state = ACCEPT_COMMENT_TEXT;
+	else if (c == '"')
+	  state = ACCEPT_QUOTED_VAL;
+	else if (isspace(static_cast<unsigned char>(c))) {
+	  // ignore whitespace
+	}
+	else {
+	  // try to accept character as a val
+	  state = ACCEPT_UNQUOTED_VAL;
+	  --l;
+	}
+	break;
+      case ACCEPT_UNQUOTED_VAL:
+	if (c == '\0') {
+	  if (escaping) {
+	    ostringstream oss;
+	    oss << "error parsing value name: unterminated escape sequence "
+	        << "at char " << (l - line) << ", line " << line_no;
+	    errors->push_back(oss.str());
+	    return NULL;
+	  }
+	  trim_whitespace(val, false);
+	  return new ConfLine(key, val, newsection, comment, line_no);
+	}
+	else if (((c == '#') || (c == ';')) && (!escaping)) {
+	  trim_whitespace(val, false);
+	  state = ACCEPT_COMMENT_TEXT;
+	}
+	else if ((c == '\\') && (!escaping)) {
+	  escaping = true;
+	}
+	else {
+	  escaping = false;
+	  val += c;
+	}
+	break;
+      case ACCEPT_QUOTED_VAL:
+	if (c == '\0') {
+	  ostringstream oss;
+	  oss << "found opening quote for value, but not the closing quote. "
+	      << "line " << line_no;
+	  errors->push_back(oss.str());
+	  return NULL;
+	}
+	else if ((c == '"') && (!escaping)) {
+	  state = ACCEPT_COMMENT_START;
+	}
+	else if ((c == '\\') && (!escaping)) {
+	  escaping = true;
+	}
+	else {
+	  escaping = false;
+	  // Add anything, including whitespace.
+	  val += c;
+	}
+	break;
+      case ACCEPT_COMMENT_START:
+	if (c == '\0') {
+	  return new ConfLine(key, val, newsection, comment, line_no);
+	}
+	else if ((c == '#') || (c == ';')) {
+	  state = ACCEPT_COMMENT_TEXT;
+	}
+	else if (isspace(static_cast<unsigned char>(c))) {
+	  // ignore whitespace
+	}
+	else {
+	  ostringstream oss;
+	  oss << "unexpected character at char " << (l - line) << " of line "
+	      << line_no;
+	  errors->push_back(oss.str());
+	  return NULL;
+	}
+	break;
+      case ACCEPT_COMMENT_TEXT:
+	if (c == '\0')
+	  return new ConfLine(key, val, newsection, comment, line_no);
+	else
+	  comment += c;
+	break;
+      default:
+	ceph_abort();
+	break;
+    }
+    ceph_assert(c != '\0'); // We better not go past the end of the input string.
+  }
+}
diff --git a/src/common/ConfUtils.h b/src/common/ConfUtils.h
new file mode 100644
index 00000000..19ec188a
--- /dev/null
+++ b/src/common/ConfUtils.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFUTILS_H
+#define CEPH_CONFUTILS_H
+
+#include <deque>
+#include <map>
+#include <set>
+#include <string>
+
+#include "include/buffer_fwd.h"
+
+/*
+ * Ceph configuration file support.
+ *
+ * This class loads an INI-style configuration from a file or bufferlist, and
+ * holds it in memory. In general, an INI configuration file is composed of
+ * sections, which contain key/value pairs. You can put comments on the end of
+ * lines by using either a hash mark (#) or the semicolon (;).
+ *
+ * You can get information out of ConfFile by calling read() or by iterating
+ * over the sections via sections_begin()/sections_end().
+ *
+ * This class could be extended to support modifying configuration files and
+ * writing them back out without too much difficulty. Currently, this is not
+ * implemented, and the file is read-only.
+ */
+// One parsed line: either a key/value pair or the name of a new section.
+// Lines are ordered by key only (see operator<).
+class ConfLine {
+public:
+  // comment_ and line_no_ are accepted but not stored.
+  ConfLine(const std::string &key_,
+	   const std::string &val_,
+	   const std::string &newsection_,
+	   const std::string &comment_,
+	   int line_no_);
+
+  bool operator<(const ConfLine &rhs) const;
+  friend std::ostream &operator<<(std::ostream& oss, const ConfLine &l);
+
+  std::string key;
+  std::string val;
+  std::string newsection;
+};
+
+// The lines of one section, kept in a set ordered by key.
+class ConfSection {
+public:
+  using const_line_iter_t = std::set<ConfLine>::const_iterator;
+
+  std::set<ConfLine> lines;
+};
+
+// In-memory, read-only view of a parsed configuration: a map from section
+// name to ConfSection.
+class ConfFile {
+public:
+  using section_iter_t = std::map<std::string, ConfSection>::iterator;
+  using const_section_iter_t =
+    std::map<std::string, ConfSection>::const_iterator;
+
+  ConfFile();
+  ~ConfFile();
+
+  // Drop everything parsed so far.
+  void clear();
+
+  // Both parsers discard previous content first and report parse problems
+  // through *errors.
+  int parse_file(const std::string &fname,
+		 std::deque<std::string> *errors,
+		 std::ostream *warnings);
+  int parse_bufferlist(ceph::bufferlist *bl,
+		       std::deque<std::string> *errors,
+		       std::ostream *warnings);
+
+  // Copy the value of key in section into val; 0 on success, -ENOENT if the
+  // section or key does not exist.
+  int read(const std::string &section, const std::string &key,
+	   std::string &val) const;
+
+  const_section_iter_t sections_begin() const;
+  const_section_iter_t sections_end() const;
+
+  static void trim_whitespace(std::string &str, bool strip_internal);
+  static std::string normalize_key_name(const std::string &key);
+  friend std::ostream &operator<<(std::ostream &oss, const ConfFile &cf);
+
+private:
+  void load_from_buffer(const char *buf, size_t sz,
+			std::deque<std::string> *errors,
+			std::ostream *warnings);
+  static ConfLine* process_line(int line_no, const char *line,
+				std::deque<std::string> *errors);
+
+  std::map<std::string, ConfSection> sections;
+};
+
+#endif
diff --git a/src/common/ContextCompletion.cc b/src/common/ContextCompletion.cc
new file mode 100644
index 00000000..a4f81683
--- /dev/null
+++ b/src/common/ContextCompletion.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "common/ContextCompletion.h"
+
+namespace ceph
+{
+
+// Starts in the "building" state with no ops in flight and a zero result.
+ContextCompletion::ContextCompletion(Context *ctx, bool ignore_enoent)
+  : m_ctx(ctx),
+    m_ignore_enoent(ignore_enoent),
+    m_ret(0),
+    m_building(true),
+    m_current_ops(0)
+{
+}
+
+// Leave the building state.  If no op is outstanding at that point, the
+// wrapped context is completed with the recorded result and this object
+// deletes itself.
+void ContextCompletion::finish_adding_requests() {
+  bool fire = false;
+  {
+    std::lock_guard guard(m_lock);
+    m_building = false;
+    fire = (m_current_ops == 0);
+  }
+
+  if (!fire) {
+    return;
+  }
+  m_ctx->complete(m_ret);
+  delete this;
+}
+
+// Account for one more outstanding op.
+void ContextCompletion::start_op()
+{
+  std::lock_guard guard(m_lock);
+  m_current_ops += 1;
+}
+
+// Record the result of one op and retire it.  The first negative result is
+// kept as the overall result (-ENOENT is skipped when m_ignore_enoent is
+// set).  Once the last op has finished and building is over, the wrapped
+// context is completed and this object deletes itself.
+void ContextCompletion::finish_op(int r) {
+  bool complete;
+  {
+    std::lock_guard l(m_lock);
+    if (r < 0 && m_ret == 0 && (!m_ignore_enoent || r != -ENOENT)) {
+      m_ret = r;
+    }
+
+    // m_current_ops is unsigned: a finish_op() without a matching start_op()
+    // would wrap it around and the completion would silently never fire.
+    ceph_assert(m_current_ops > 0);
+    --m_current_ops;
+    complete = (m_current_ops == 0 && !m_building);
+  }
+  if (complete) {
+    m_ctx->complete(m_ret);
+    delete this;
+  }
+}
+
+} // namespace ceph
diff --git a/src/common/ContextCompletion.h b/src/common/ContextCompletion.h
new file mode 100644
index 00000000..86c51b2b
--- /dev/null
+++ b/src/common/ContextCompletion.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_ASYNC_COMPLETION_H
+#define CEPH_ASYNC_COMPLETION_H
+
+#include "include/Context.h"
+
+namespace ceph {
+
+// Completes a wrapped Context once every op started against it has
+// finished and finish_adding_requests() has been called; the object then
+// deletes itself.
+class ContextCompletion {
+private:
+  ceph::mutex m_lock = ceph::make_mutex("ContextCompletion::m_lock");
+  Context *m_ctx;           // context completed at the end
+  bool m_ignore_enoent;     // when set, -ENOENT results are not recorded
+  int m_ret;                // first recorded negative result, else 0
+  bool m_building;          // true until finish_adding_requests()
+  uint64_t m_current_ops;   // ops started but not yet finished
+
+public:
+  ContextCompletion(Context *ctx, bool ignore_enoent);
+
+  void finish_adding_requests();
+
+  void start_op();
+  void finish_op(int r);
+};
+
+// Context tied to a ContextCompletion: constructing it starts an op, and
+// finishing it reports that op's result.
+class C_ContextCompletion : public Context {
+private:
+  ContextCompletion &m_context_completion;
+
+public:
+  C_ContextCompletion(ContextCompletion &context_completion)
+    : m_context_completion(context_completion)
+  {
+    m_context_completion.start_op();
+  }
+
+  void finish(int r) override
+  {
+    m_context_completion.finish_op(r);
+  }
+};
+
+} // namespace ceph
+
+#endif // CEPH_ASYNC_COMPLETION_H
diff --git a/src/common/Continuation.h b/src/common/Continuation.h
new file mode 100644
index 00000000..966b63d0
--- /dev/null
+++ b/src/common/Continuation.h
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/Context.h"
+
+/**
+ * The Continuation interface is designed to help easily create multi-step
+ * operations that share data without having to pass it around or create
+ * custom Context classes for each step. To write a Continuation:
+ * 1) create a child class with a function for each stage.
+ * 2) Put all your shared data members into the class.
+ * 3) In the constructor, register each function stage with set_callback().
+ * 4) Whenever you need to provide a Context callback that activates the next
+ * stage, call get_callback(stage_number). If you need to proceed to another
+ * stage immediately, call immediate(stage, retcode) and return its result.
+ *
+ * To use a class:
+ * 1) Construct the child class on the heap.
+ * 2) Call begin().
+ * 3) The destructor will be called once one of your functions returns true to
+ * indicate it is done.
+ *
+ * Please note that while you can skip stages and get multiple Callback
+ * objects at once, you *cannot* have any stage report that the Continuation
+ * is completed while any other stage Callbacks are outstanding. It's best to
+ * be serial unless you want to maintain your own metadata about which stages
+ * are still pending.
+ *
+ * In fact, there are only two situations in which a stage should return
+ * true while others are running:
+ * 1) A Callback was issued and completed in the same thread,
+ * 2) you called immediate(stage) and it is returning true.
+ */
+
+class Continuation {
+  std::set<int> stages_in_flight;   ///< stages with a callback issued or running
+  std::set<int> stages_processing;  ///< stages whose function is executing now
+  int rval;                         ///< value handed to on_finish
+  Context *on_finish;               ///< completed by _done(), then set to NULL
+  bool reported_done;               ///< set once any stage has returned true
+
+  /// Context that runs the given stage of its Continuation when finished.
+  class Callback : public Context {
+    Continuation *continuation;
+    int stage_to_activate;
+  public:
+    Callback(Continuation *c, int stage) :
+      continuation(c),
+      stage_to_activate(stage) {}
+    void finish(int r) override {
+      continuation->continue_function(r, stage_to_activate);
+    }
+  };
+
+protected:
+  typedef bool (Continuation::*stagePtr)(int r);
+  /**
+   * Continue immediately to the given stage. It will be executed
+   * immediately, in the given thread.
+   * @pre You are in a callback function.
+   * @param stage The stage to execute
+   * @param r The return code that will be provided to the next stage
+   */
+  bool immediate(int stage, int r) {
+    ceph_assert(!stages_in_flight.count(stage));
+    ceph_assert(!stages_processing.count(stage));
+    stages_in_flight.insert(stage);
+    stages_processing.insert(stage);
+    return _continue_function(r, stage);
+  }
+
+  /**
+   * Obtain a Context * that when complete()ed calls back into the given stage.
+   * @pre You are in a callback function.
+   * @param stage The stage this Context should activate
+   */
+  Context *get_callback(int stage) {
+    stages_in_flight.insert(stage);
+    return new Callback(this, stage);
+  }
+
+  /**
+   * Set the return code that is passed to the finally-activated Context.
+   * @param new_rval The return code to use.
+   */
+  void set_rval(int new_rval) { rval = new_rval; }
+  int get_rval() { return rval; }
+
+  /**
+   * Register member functions as associated with a given stage. Start
+   * your stage IDs at 0 and make that one the setup phase.
+   * @pre There are no other functions associated with the stage.
+   * @param stage The stage to associate this function with
+   * @param func The function to use
+   */
+  void set_callback(int stage, stagePtr func) {
+    ceph_assert(callbacks.find(stage) == callbacks.end());
+    callbacks[stage] = func;
+  }
+
+  /**
+   * Called when the Continuation is done, as determined by a stage returning
+   * true and us having finished all the currently-processing ones.
+   */
+  virtual void _done() {
+    on_finish->complete(rval);
+    on_finish = NULL;
+    return;
+  }
+
+private:
+  std::map<int, Continuation::stagePtr> callbacks;
+
+  /**
+   * Run the function registered for stage @p n with return code @p r and
+   * clear the stage's in-flight/processing marks afterwards.
+   * @return what the stage function returned (true means "done")
+   */
+  bool _continue_function(int r, int n) {
+    // Names are std::-qualified so this header does not depend on
+    // using-declarations made by whoever includes it.
+    std::set<int>::iterator stage_iter = stages_in_flight.find(n);
+    ceph_assert(stage_iter != stages_in_flight.end());
+    ceph_assert(callbacks.count(n));
+    stagePtr p = callbacks[n];
+
+    // immediate() has already marked the stage as processing; insert() then
+    // simply returns the existing element.
+    std::pair<std::set<int>::iterator,bool> insert_r = stages_processing.insert(n);
+
+    bool done = (this->*p)(r);
+    if (done)
+      reported_done = true;
+
+    stages_processing.erase(insert_r.first);
+    stages_in_flight.erase(stage_iter);
+    return done;
+  }
+
+  /**
+   * Entry point used by Callback and begin(): run the stage, then finish and
+   * delete the Continuation if it reported done, or if an earlier stage did
+   * and nothing is processing any more.
+   */
+  void continue_function(int r, int stage) {
+    bool done = _continue_function(r, stage);
+
+    // Same assertion macro as the rest of the class: reporting done while
+    // other callbacks are outstanding is followed by "delete this" below.
+    ceph_assert(!done ||
+		stages_in_flight.size() == stages_processing.size());
+
+    if (done ||
+        (reported_done && stages_processing.empty())) {
+      _done();
+      delete this;
+    }
+  }
+
+
+
+public:
+  /**
+   * Construct a new Continuation object. Call this from your child class,
+   * obviously.
+   *
+   * @param c The Context which should be complete()ed when this Continuation
+   * is done.
+   */
+  Continuation(Context *c) :
+    rval(0), on_finish(c), reported_done(false) {}
+  /**
+   * Clean up.
+   */
+  virtual ~Continuation() { ceph_assert(on_finish == NULL); }
+  /**
+   * Begin running the Continuation.
+   */
+  void begin() { stages_in_flight.insert(0); continue_function(0, 0); }
+};
diff --git a/src/common/Cycles.cc b/src/common/Cycles.cc
new file mode 100644
index 00000000..2ebd2469
--- /dev/null
+++ b/src/common/Cycles.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+/* Copyright (c) 2011-2014 Stanford University
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "debug.h"
+#include "Cycles.h"
+
+double Cycles::cycles_per_sec = 0;
+
+/**
+ * Perform once-only overall initialization for the Cycles class, such
+ * as calibrating the clock frequency.  This method must be called
+ * before using the Cycles module.
+ *
+ * It is not initialized by default because the timing loops cause
+ * general process startup times to balloon
+ * (http://tracker.ceph.com/issues/15225).
+ */
+void Cycles::init()
+{
+  // Already calibrated: nothing to do.
+  if (cycles_per_sec != 0)
+    return;
+
+  // Skip initialization if rdtsc is not implemented (rdtsc() returns 0 on
+  // architectures without a supported counter).
+  if (rdtsc() == 0)
+    return;
+
+  // Compute the frequency of the fine-grained CPU timer: to do this,
+  // take parallel time readings using both rdtsc and gettimeofday.
+  // After 10ms have elapsed, take the ratio between these readings.
+
+  struct timeval start_time, stop_time;
+  uint64_t micros;
+  double old_cycles;
+
+  // There is one tricky aspect, which is that we could get interrupted
+  // between calling gettimeofday and reading the cycle counter, in which
+  // case we won't have corresponding readings.  To handle this (unlikely)
+  // case, compute the overall result repeatedly, and wait until we get
+  // two successive calculations that are within 0.1% of each other.
+  old_cycles = 0;
+  while (1) {
+    if (gettimeofday(&start_time, NULL) != 0) {
+      ceph_abort_msg("couldn't read clock");
+    }
+    uint64_t start_cycles = rdtsc();
+    // Poll both clocks until more than 10ms (10000us) of wall time has passed.
+    while (1) {
+      if (gettimeofday(&stop_time, NULL) != 0) {
+        ceph_abort_msg("couldn't read clock");
+      }
+      uint64_t stop_cycles = rdtsc();
+      micros = (stop_time.tv_usec - start_time.tv_usec) +
+          (stop_time.tv_sec - start_time.tv_sec)*1000000;
+      if (micros > 10000) {
+        // Scale the cycles observed over `micros` microseconds to one second.
+        cycles_per_sec = static_cast<double>(stop_cycles - start_cycles);
+        cycles_per_sec = 1000000.0*cycles_per_sec/ static_cast<double>(micros);
+        break;
+      }
+    }
+    // Accept the measurement once it lies within 0.1% of the previous one.
+    double delta = cycles_per_sec/1000.0;
+    if ((old_cycles > (cycles_per_sec - delta)) &&
+        (old_cycles < (cycles_per_sec + delta))) {
+      return;
+    }
+    old_cycles = cycles_per_sec;
+  }
+}
+
+/**
+ * Return the number of CPU cycles per second.
+ */
+double Cycles::per_second()
+{
+  // Expose the stored cycles-per-second conversion factor.
+  const double rate = get_cycles_per_sec();
+  return rate;
+}
+
+/**
+ * Given an elapsed time measured in cycles, return a floating-point number
+ * giving the corresponding time in seconds.
+ * \param cycles
+ *      Difference between the results of two calls to rdtsc.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The time in seconds corresponding to cycles.
+ */
+double Cycles::to_seconds(uint64_t cycles, double cycles_per_sec)
+{
+  // A rate of 0 selects the locally computed counter frequency.
+  const double rate =
+    (cycles_per_sec == 0) ? get_cycles_per_sec() : cycles_per_sec;
+  return static_cast<double>(cycles) / rate;
+}
+
+/**
+ * Given a time in seconds, return the number of cycles that it
+ * corresponds to.
+ * \param seconds
+ *      Time in seconds.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The approximate number of cycles corresponding to #seconds.
+ */
+uint64_t Cycles::from_seconds(double seconds, double cycles_per_sec)
+{
+  // A rate of 0 selects the locally computed counter frequency.
+  const double rate =
+    (cycles_per_sec == 0) ? get_cycles_per_sec() : cycles_per_sec;
+  // Adding 0.5 before the truncating conversion rounds to the nearest cycle.
+  return static_cast<uint64_t>(seconds * rate + 0.5);
+}
+
+/**
+ * Given an elapsed time measured in cycles, return an integer
+ * giving the corresponding time in microseconds. Note: to_seconds()
+ * is faster than this method.
+ * \param cycles
+ *      Difference between the results of two calls to rdtsc.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The time in microseconds corresponding to cycles (the rounded
+ *      nanosecond value, truncated to whole microseconds).
+ */
+uint64_t Cycles::to_microseconds(uint64_t cycles, double cycles_per_sec)
+{
+  // Convert via nanoseconds; the integer division drops any fractional
+  // microsecond.
+  const uint64_t nanos = to_nanoseconds(cycles, cycles_per_sec);
+  return nanos / 1000;
+}
+
+/**
+ * Given an elapsed time measured in cycles, return an integer
+ * giving the corresponding time in nanoseconds. Note: to_seconds()
+ * is faster than this method.
+ * \param cycles
+ *      Difference between the results of two calls to rdtsc.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The time in nanoseconds corresponding to cycles (rounded).
+ */
+uint64_t Cycles::to_nanoseconds(uint64_t cycles, double cycles_per_sec)
+{
+  // A rate of 0 selects the locally computed counter frequency.
+  const double rate =
+    (cycles_per_sec == 0) ? get_cycles_per_sec() : cycles_per_sec;
+  // Adding 0.5 before the truncating conversion rounds to the nearest ns.
+  return static_cast<uint64_t>(1e09 * static_cast<double>(cycles) / rate + 0.5);
+}
+
+/**
+ * Given a number of nanoseconds, return an approximate number of
+ * cycles for an equivalent time length.
+ * \param ns
+ *      Number of nanoseconds.
+ * \param cycles_per_sec
+ *      Optional parameter to specify the frequency of the counter that #cycles
+ *      was taken from. Useful when converting a remote machine's tick counter
+ *      to seconds. The default value of 0 will use the local processor's
+ *      computed counter frequency.
+ * \return
+ *      The approximate number of cycles for the same time length.
+ */
+uint64_t
+Cycles::from_nanoseconds(uint64_t ns, double cycles_per_sec)
+{
+  // A rate of 0 selects the locally computed counter frequency.
+  const double rate =
+    (cycles_per_sec == 0) ? get_cycles_per_sec() : cycles_per_sec;
+  // Adding 0.5 before the truncating conversion rounds to the nearest cycle.
+  return static_cast<uint64_t>(static_cast<double>(ns) * rate / 1e09 + 0.5);
+}
+
+/**
+ * Busy wait for a given number of microseconds.
+ * Callers should use this method in most reasonable cases as opposed to
+ * usleep for accurate measurements. Calling usleep may put the processor
+ * in a low power mode/sleep state which reduces the clock frequency.
+ * So, each time the process/thread wakes up from usleep, it takes some time
+ * to ramp up to maximum frequency. Thus measurements often incur higher
+ * latencies.
+ * \param us
+ *      Number of microseconds.
+ */
+void
+Cycles::sleep(uint64_t us)
+{
+  // Spin until the cycle counter reaches the computed deadline.
+  const uint64_t deadline = Cycles::rdtsc() + Cycles::from_nanoseconds(1000*us);
+  while (Cycles::rdtsc() < deadline) {
+    // busy-wait
+  }
+}
diff --git a/src/common/Cycles.h b/src/common/Cycles.h
new file mode 100644
index 00000000..bb47d5cb
--- /dev/null
+++ b/src/common/Cycles.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+/* Copyright (c) 2011-2014 Stanford University
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#ifndef CEPH_CYCLES_H
+#define CEPH_CYCLES_H
+
+/**
+ * This class provides static methods that read the fine-grain CPU
+ * cycle counter and translate between cycle-level times and absolute
+ * times.
+ */
+class Cycles {
+ public:
+  /// Calibrate cycles_per_sec; see the definition for details.
+  static void init();
+
+  /**
+   * Return the current value of the fine-grain CPU cycle counter
+   * (RDTSC on x86, the virtual counter register on aarch64, the time base
+   * on PowerPC). Returns 0 on architectures with no supported counter.
+   */
+  static __inline __attribute__((always_inline)) uint64_t rdtsc() {
+#if defined(__i386__)
+    // "=A" binds the 64-bit result to the edx:eax register pair.
+    int64_t ret;
+    __asm__ volatile ("rdtsc" : "=A" (ret) );
+    return ret;
+#elif defined(__x86_64__) || defined(__amd64__)
+    // The low half is read from eax and the high half from edx.
+    uint32_t lo, hi;
+    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
+    return (((uint64_t)hi << 32) | lo);
+#elif defined(__aarch64__)
+    //
+    // arch/arm64/include/asm/arch_timer.h
+    //
+    // static inline u64 arch_counter_get_cntvct(void)
+    // {
+    //         u64 cval;
+    //
+    //         isb();
+    //         asm volatile("mrs %0, cntvct_el0" : "=r" (cval));
+    //
+    //         return cval;
+    // }
+    //
+    // https://github.com/cloudius-systems/osv/blob/master/arch/aarch64/arm-clock.cc
+    uint64_t cntvct;
+    asm volatile ("isb; mrs %0, cntvct_el0; isb; " : "=r" (cntvct) :: "memory");
+    return cntvct;
+#elif defined(__powerpc__) || defined (__powerpc64__)
+    // Based on:
+    // https://github.com/randombit/botan/blob/net.randombit.botan/src/lib/entropy/hres_timer/hres_timer.cpp
+    uint32_t lo = 0, hi = 0;
+    asm volatile("mftbu %0; mftb %1" : "=r" (hi), "=r" (lo));
+    return (((uint64_t)hi << 32) | lo);
+#else
+#warning No high-precision counter available for your OS/arch
+    return 0;
+#endif
+  }
+
+  // Conversions between cycle counts and wall-clock units. In each of them a
+  // cycles_per_sec of 0 selects the locally stored counter frequency.
+  static double per_second();
+  static double to_seconds(uint64_t cycles, double cycles_per_sec = 0);
+  static uint64_t from_seconds(double seconds, double cycles_per_sec = 0);
+  static uint64_t to_microseconds(uint64_t cycles, double cycles_per_sec = 0);
+  static uint64_t to_nanoseconds(uint64_t cycles, double cycles_per_sec = 0);
+  static uint64_t from_nanoseconds(uint64_t ns, double cycles_per_sec = 0);
+  static void sleep(uint64_t us);
+
+private:
+  // Private constructor; every member of this class is static.
+  Cycles();
+
+  /// Conversion factor between cycles and the seconds; computed by
+  /// Cycles::init.
+  static double cycles_per_sec;
+
+  /**
+   * Returns the conversion factor between cycles and seconds, i.e. the
+   * current value of cycles_per_sec.
+   */
+  static __inline __attribute__((always_inline)) double get_cycles_per_sec() {
+    return cycles_per_sec;
+  }
+};
+
+#endif  // CEPH_CYCLES_H
diff --git a/src/common/DecayCounter.cc b/src/common/DecayCounter.cc
new file mode 100644
index 00000000..bdc75011
--- /dev/null
+++ b/src/common/DecayCounter.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "DecayCounter.h"
+#include "Formatter.h"
+
+#include "include/encoding.h"
+
+// Serialize the counter into bl. The value is decayed up to the current
+// time first, so the encoded number is current as of the call.
+void DecayCounter::encode(bufferlist& bl) const
+{
+  decay();
+  // NOTE(review): presumably struct version 5 with compat version 4 --
+  // confirm against the ENCODE_START macro definition.
+  ENCODE_START(5, 4, bl);
+  encode(val, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize the counter from p, accepting older encodings: fields that
+// only exist in earlier struct versions are read and discarded or folded
+// into val.
+void DecayCounter::decode(bufferlist::const_iterator &p)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, p);
+  // Versions before 2 carry an extra double that is no longer used.
+  if (struct_v < 2) {
+    double k = 0.0;
+    decode(k, p);
+  }
+  // Versions before 3 carry a further unused double.
+  if (struct_v < 3) {
+    double k = 0.0;
+    decode(k, p);
+  }
+  decode(val, p);
+  // Versions before 5 follow the value with a delta and a velocity: the
+  // delta is added to val, the velocity is read and dropped.
+  if (struct_v < 5) {
+    double delta, _;
+    decode(delta, p);
+    val += delta;
+    decode(_, p); /* velocity */
+  }
+  // The decay clock restarts at decode time.
+  last_decay = clock::now();
+  DECODE_FINISH(p);
+}
+
+void DecayCounter::dump(Formatter *f) const
+{
+  // Bring the value up to date before reporting it.
+  decay();
+  const double current = val;
+  f->dump_float("value", current);
+  const double halflife = rate.get_halflife();
+  f->dump_float("halflife", halflife);
+}
+
+void DecayCounter::generate_test_instances(std::list<DecayCounter*>& ls)
+{
+  // First instance: a non-default value and a half-life of 2.0.
+  DecayCounter *with_state = new DecayCounter();
+  with_state->val = 3.0;
+  with_state->rate = DecayRate(2.0);
+  ls.push_back(with_state);
+
+  // Second instance: left entirely default-constructed.
+  ls.push_back(new DecayCounter());
+}
+
+void DecayCounter::decay(double delta) const
+{
+  const auto now = clock::now();
+  const double elapsed = std::chrono::duration<double>(now - last_decay).count();
+
+  // Decay the stored value exponentially over the elapsed time, then add
+  // the new contribution.
+  double decayed = val * exp(elapsed * rate.k) + delta;
+  // Snap very small results to exactly zero.
+  if (decayed < .01) {
+    decayed = 0.0;
+  }
+
+  last_decay = now;
+  val = decayed;
+}
diff --git a/src/common/DecayCounter.h b/src/common/DecayCounter.h
new file mode 100644
index 00000000..b9545b15
--- /dev/null
+++ b/src/common/DecayCounter.h
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_DECAYCOUNTER_H
+#define CEPH_DECAYCOUNTER_H
+
+#include "include/buffer.h"
+#include "common/Formatter.h"
+#include "common/ceph_time.h"
+
+#include <cmath>
+#include <list>
+#include <sstream>
+
+/**
+ *
+ * TODO: normalize value based on some function of half_life, 
+ *  so that it can be interpreted as an approximation of a
+ *  moving average of N seconds.  currently, changing half-life
+ *  skews the scale of the value, even at steady state.  
+ *
+ */
+
+/**
+ * Exponential decay rate, stored as k = ln(.5) / half_life.
+ */
+class DecayRate {
+public:
+  friend class DecayCounter;
+
+  /// Construct with k = 0, i.e. no half-life configured yet.
+  DecayRate() = default;
+  /// Construct from a half-life; implicit conversion from double is allowed.
+  // cppcheck-suppress noExplicitConstructor
+  DecayRate(double hl) { set_halflife(hl); }
+  // Copying is a plain member-wise copy. Both operations are defaulted
+  // together: a user-provided copy constructor on its own makes the
+  // implicitly generated copy assignment deprecated.
+  DecayRate(const DecayRate &dr) = default;
+  DecayRate& operator=(const DecayRate &dr) = default;
+
+  /// Set the rate from a half-life. A zero hl yields a non-finite k.
+  void set_halflife(double hl) {
+    k = log(.5) / hl;
+  }
+  /// Recover the half-life from k.
+  /// NOTE(review): divides by k, so the result is not finite while k is
+  /// still 0 (a default-constructed rate).
+  double get_halflife() const {
+    return log(.5) / k;
+  }
+
+private:
+  double k = 0;             // k = ln(.5)/half_life
+};
+
+/**
+ * A counter whose value decays exponentially over time at the rate given
+ * by a DecayRate. Decay is applied lazily, whenever the value is read or
+ * adjusted through a method that calls decay().
+ */
+class DecayCounter {
+public:
+  using time = ceph::coarse_mono_time;
+  using clock = ceph::coarse_mono_clock;
+
+  /// Construct with a default DecayRate.
+  DecayCounter() : DecayCounter(DecayRate()) {}
+  /// Construct with the given rate; the decay clock starts now.
+  explicit DecayCounter(const DecayRate &rate) : last_decay(clock::now()), rate(rate) {}
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<DecayCounter*>& ls);
+
+  /**
+   * reading
+   */
+
+  /// Decay up to the current time and return the resulting value.
+  double get() const {
+    decay();
+    return val;
+  }
+
+  /// Return the stored value without applying any decay.
+  double get_last() const {
+    return val;
+  }
+
+  /// Return the time at which the value was last decayed.
+  time get_last_decay() const {
+    return last_decay;
+  }
+
+  /**
+   * adjusting
+   */
+
+  /// Decay, add v, and return the new value.
+  double hit(double v = 1.0) {
+    decay(v);
+    return val;
+  }
+  /// Decay and add v, without returning the result.
+  void adjust(double v = 1.0) {
+    decay(v);
+  }
+
+  /// Multiply the stored value by f; no decay is applied first.
+  void scale(double f) {
+    val *= f;
+  }
+
+  /**
+   * decay etc.
+   */
+
+  /// Zero the value and restart the decay clock.
+  void reset() {
+    last_decay = clock::now();
+    val = 0;
+  }
+
+protected:
+  /// Decay the value up to the current time, then add delta.
+  void decay(double delta) const;
+  void decay() const {decay(0.0);}
+
+private:
+  // val and last_decay are mutable because the const decay() updates them.
+  mutable double val = 0.0;           // value
+  mutable time last_decay = clock::zero();   // time of last decay
+  DecayRate rate;
+};
+
+// Free-function form of DecayCounter::encode, forwarding to the member.
+inline void encode(const DecayCounter &c, bufferlist &bl) {
+  c.encode(bl);
+}
+// Free-function form of DecayCounter::decode, forwarding to the member.
+inline void decode(DecayCounter &c, bufferlist::const_iterator &p) {
+  c.decode(p);
+}
+
+// Print the counter as "[C <value>]", with the decayed value in scientific
+// notation at precision 2.
+inline std::ostream& operator<<(std::ostream& out, const DecayCounter& d) {
+  // Format into a scratch stream so the precision and float format set here
+  // are not applied to the caller's stream.
+  std::ostringstream scratch;
+  scratch.precision(2);
+  const double current = d.get();
+  scratch << "[C " << std::scientific << current << "]";
+  out << scratch.str();
+  return out;
+}
+
+#endif
diff --git a/src/common/EventTrace.cc b/src/common/EventTrace.cc
new file mode 100644
index 00000000..04c61e93
--- /dev/null
+++ b/src/common/EventTrace.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Anjaneya Chagam <anjaneya.chagam@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/EventTrace.h"
+#include "common/TracepointProvider.h"
+#include "messages/MOSDOpReply.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/eventtrace.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+TracepointProvider::Traits event_tracepoint_traits("libeventtrace_tp.so", "event_tracing");
+bool EventTrace::tpinit = false;
+
+// Initialize the event tracepoint provider the first time this is called
+// with a non-null context; later calls are no-ops.
+void EventTrace::init_tp(CephContext *_ctx)
+{
+  if (unlikely(!_ctx))
+    return;
+
+  // tpinit is a plain static flag with no locking here.
+  if (unlikely(!tpinit)) {
+    TracepointProvider::initialize<event_tracepoint_traits>(_ctx);
+    tpinit = true;
+  }
+}
+
+// Fill oid and context from an OSD op or op-reply message. For a null
+// message, or any other message type, both strings are left untouched.
+void EventTrace::set_message_attrs(const Message *m, string& oid, string& context, bool incl_oid)
+{
+  if (!m)
+    return;
+
+  const bool is_op = (m->get_type() == CEPH_MSG_OSD_OP);
+  const bool is_reply = (m->get_type() == CEPH_MSG_OSD_OPREPLY);
+  if (!is_op && !is_reply)
+    return;
+
+  // The object name is only looked up when the caller asks for it.
+  if (incl_oid) {
+    if (is_op)
+      oid = ((MOSDOp *)m)->get_oid().name;
+    else
+      oid = ((MOSDOpReply *)m)->get_oid().name;
+  }
+
+  // context = source!source_addr!tid!sequence!type
+  ostringstream formatted;
+  formatted << m->get_source() << "!" << m->get_source_addr() << "!"
+            << m->get_tid() << "!" << m->get_seq() << "!" << m->get_type();
+  context = formatted.str();
+}
+
+// Record the source location, and when a context is available take the
+// starting timestamp and emit the function-entry log line and tracepoint.
+EventTrace::EventTrace(CephContext *_ctx, const char *_file, const char *_func, int _line) :
+  ctx(_ctx),
+  file(_file),
+  func(_func),
+  line(_line)
+{
+  // Without a context nothing is logged or traced.
+  if (unlikely(!ctx))
+    return;
+  last_ts = ceph_clock_now();
+  init_tp(ctx);
+
+  lsubdout(ctx, eventtrace, LOG_LEVEL) << "ENTRY (" <<  func << ") " << file << ":" << line << dendl;
+  tracepoint(eventtrace, func_enter, file.c_str(), func.c_str(), line);
+}
+
+// Emit the function-exit log line and tracepoint, unless the object was
+// constructed without a context.
+EventTrace::~EventTrace()
+{
+  if (unlikely(!ctx))
+    return;
+  lsubdout(ctx, eventtrace, LOG_LEVEL) << "EXIT (" << func << ") " << file << dendl;
+  tracepoint(eventtrace, func_exit, file.c_str(), func.c_str());
+}
+
+// Report the time elapsed since the previous timestamp (taken in the
+// constructor or by the last call to this method) as an elapsed-time event
+// named `event`, then restart the interval.
+void EventTrace::log_event_latency(const char *event)
+{
+  utime_t now = ceph_clock_now();
+  // Take the difference as a signed nanosecond count and divide in floating
+  // point. An integer division by 1000 would drop the sub-microsecond part
+  // before the value is widened to double, and an unsigned difference would
+  // wrap to a huge value if the clock ever stepped backwards.
+  const int64_t delta_ns =
+    static_cast<int64_t>(now.to_nsec() - last_ts.to_nsec());
+  double usecs = static_cast<double>(delta_ns) / 1000.0;
+  OID_ELAPSED("", usecs, event);
+  last_ts = now;
+}
+
+// Emit an oid_event tracepoint with the given strings and source location.
+// Does nothing while the global context is not set.
+void EventTrace::trace_oid_event(const char *oid, const char *event, const char *context,
+  const char *file, const char *func, int line)
+{
+  if (unlikely(!g_ceph_context))
+    return;
+  init_tp(g_ceph_context);
+  tracepoint(eventtrace, oid_event, oid, event, context, file, func, line);
+}
+
+// Message-based overload: derive the oid and context strings from m, then
+// defer to the string-based overload.
+void EventTrace::trace_oid_event(const Message *m, const char *event, const char *file,
+  const char *func, int line, bool incl_oid)
+{
+  string oid;
+  string context;
+  set_message_attrs(m, oid, context, incl_oid);
+
+  const char *oid_cstr = oid.c_str();
+  const char *context_cstr = context.c_str();
+  trace_oid_event(oid_cstr, event, context_cstr, file, func, line);
+}
+
+// Emit an oid_elapsed tracepoint with the given strings, elapsed value and
+// source location. Does nothing while the global context is not set.
+void EventTrace::trace_oid_elapsed(const char *oid, const char *event, const char *context,
+  double elapsed, const char *file, const char *func, int line)
+{
+  if (unlikely(!g_ceph_context))
+    return;
+  init_tp(g_ceph_context);
+  tracepoint(eventtrace, oid_elapsed, oid, event, context, elapsed, file, func, line);
+}
+
+// Message-based overload: derive the oid and context strings from m, then
+// defer to the string-based overload.
+void EventTrace::trace_oid_elapsed(const Message *m, const char *event, double elapsed,
+  const char *file, const char *func, int line, bool incl_oid)
+{
+  string oid;
+  string context;
+  set_message_attrs(m, oid, context, incl_oid);
+
+  const char *oid_cstr = oid.c_str();
+  const char *context_cstr = context.c_str();
+  trace_oid_elapsed(oid_cstr, event, context_cstr, elapsed, file, func, line);
+}
diff --git a/src/common/EventTrace.h b/src/common/EventTrace.h
new file mode 100644
index 00000000..c97ff51d
--- /dev/null
+++ b/src/common/EventTrace.h
@@ -0,0 +1,79 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Intel Corporation.
+ * All rights reserved.
+ *
+ * Author: Anjaneya Chagam <anjaneya.chagam@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef _EventTrace_h_
+#define _EventTrace_h_
+
+#include "msg/Message.h"
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+
+#define OID_EVENT_TRACE(oid, event) \
+  EventTrace::trace_oid_event(oid, event, "", __FILE__, __func__, __LINE__)
+#define OID_EVENT_TRACE_WITH_MSG(msg, event, incl_oid) \
+  EventTrace::trace_oid_event(msg, event, __FILE__, __func__, __LINE__, incl_oid)
+#define OID_ELAPSED(oid, elapsed, event) \
+  EventTrace::trace_oid_elapsed(oid, event, "", elapsed, __FILE__, __func__, __LINE__)
+#define OID_ELAPSED_WITH_MSG(m, elapsed, event, incl_oid) \
+  EventTrace::trace_oid_elapsed(m, event, elapsed, __FILE__, __func__, __LINE__, incl_oid)
+#define FUNCTRACE(cct) EventTrace _t1(cct, __FILE__, __func__, __LINE__)
+#define OID_ELAPSED_FUNC_EVENT(event) _t1.log_event_latency(event)
+
+#else
+
+#define OID_EVENT_TRACE(oid, event)
+#define OID_EVENT_TRACE_WITH_MSG(msg, event, incl_oid)
+#define OID_ELAPSED(oid, elapsed, event)
+#define OID_ELAPSED_WITH_MSG(m, elapsed, event, incl_oid)
+#define FUNCTRACE(cct)
+#define OID_ELAPSED_FUNC_EVENT(event)
+
+#endif
+
+#define LOG_LEVEL 1
+
+// Scoped function tracer plus static helpers for emitting object-id events.
+// An instance logs function entry on construction and function exit on
+// destruction; the static methods emit one-off tracepoints.
+class EventTrace {
+private:
+  CephContext *ctx;   // context used for logging; tracing is skipped if null
+  string file;        // source file passed to the constructor
+  string func;        // function name passed to the constructor
+  int line;           // source line passed to the constructor
+  utime_t last_ts;    // set at construction and by each log_event_latency()
+
+  static bool tpinit; // true once the tracepoint provider was initialized
+
+  static void init_tp(CephContext *_ctx);
+  static void set_message_attrs(const Message *m, string& oid, string& context, bool incl_oid);
+
+public:
+
+  EventTrace(CephContext *_ctx, const char *_file, const char *_func, int line);
+  ~EventTrace();
+  // Report the time since the previous timestamp under the given event name.
+  void log_event_latency(const char *tag);
+
+  // Emit an oid_event tracepoint, either from explicit strings or from a
+  // message whose attributes are extracted with set_message_attrs().
+  static void trace_oid_event(const char *oid, const char *event, const char *context,
+    const char *file, const char *func, int line);
+  static void trace_oid_event(const Message *m, const char *event, const char *file,
+    const char *func, int line, bool incl_oid);
+
+  // Emit an oid_elapsed tracepoint carrying an elapsed value, in the same
+  // two flavours.
+  static void trace_oid_elapsed(const char *oid, const char *event, const char *context,
+    double elapsed, const char *file, const char *func, int line);
+  static void trace_oid_elapsed(const Message *m, const char *event, double elapsed,
+    const char *file, const char *func, int line, bool incl_oid);
+
+};
+#endif
diff --git a/src/common/Finisher.cc b/src/common/Finisher.cc
new file mode 100644
index 00000000..277fe06f
--- /dev/null
+++ b/src/common/Finisher.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Finisher.h"
+
+#define dout_subsys ceph_subsys_finisher
+#undef dout_prefix
+#define dout_prefix *_dout << "finisher(" << this << ") "
+
+void Finisher::start()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  // Launch the worker thread under the configured thread name.
+  const char *name = thread_name.c_str();
+  finisher_thread.create(name);
+}
+
+void Finisher::stop()
+{
+  ldout(cct, 10) << __func__ << dendl;
+  {
+    // Raise the stop flag under the lock and wake the worker. There is no
+    // new work for it, but it has to wake up to process the stop condition.
+    std::unique_lock ul(finisher_lock);
+    finisher_stop = true;
+    finisher_cond.notify_all();
+  }
+  // Wait until the worker exits completely.
+  finisher_thread.join();
+  ldout(cct, 10) << __func__ << " finish" << dendl;
+}
+
+void Finisher::wait_for_empty()
+{
+  std::unique_lock ul(finisher_lock);
+  // Sleep until the queue is drained and no batch is being executed.
+  for (;;) {
+    const bool idle = finisher_queue.empty() && !finisher_running;
+    if (idle) {
+      break;
+    }
+    ldout(cct, 10) << "wait_for_empty waiting" << dendl;
+    // Tell the worker that somebody wants to be notified when it runs dry.
+    finisher_empty_wait = true;
+    finisher_empty_cond.wait(ul);
+  }
+  ldout(cct, 10) << "wait_for_empty empty" << dendl;
+  finisher_empty_wait = false;
+}
+
+// Body of the worker thread: repeatedly drain the queue, completing each
+// context with its stored value, and sleep on finisher_cond in between,
+// until finisher_stop is set.
+void *Finisher::finisher_thread_entry()
+{
+  std::unique_lock ul(finisher_lock);
+  ldout(cct, 10) << "finisher_thread start" << dendl;
+
+  utime_t start;
+  uint64_t count = 0;
+  while (!finisher_stop) {
+    /// Every time we are woken up, we process the queue until it is empty.
+    while (!finisher_queue.empty()) {
+      // To reduce lock contention, we swap out the queue to process.
+      // This way other threads can submit new contexts to complete
+      // while we are working.
+      vector<pair<Context*,int>> ls;
+      ls.swap(finisher_queue);
+      // Set while still holding the lock, so wait_for_empty() never sees
+      // an empty queue while this batch is still pending.
+      finisher_running = true;
+      ul.unlock();
+      ldout(cct, 10) << "finisher_thread doing " << ls << dendl;
+
+      if (logger) {
+	start = ceph_clock_now();
+	count = ls.size();
+      }
+
+      // Now actually process the contexts.
+      for (auto p : ls) {
+	p.first->complete(p.second);
+      }
+      ldout(cct, 10) << "finisher_thread done with " << ls << dendl;
+      ls.clear();
+      if (logger) {
+	logger->dec(l_finisher_queue_len, count);
+	logger->tinc(l_finisher_complete_lat, ceph_clock_now() - start);
+      }
+
+      ul.lock();
+      finisher_running = false;
+    }
+    ldout(cct, 10) << "finisher_thread empty" << dendl;
+    // Only notify when wait_for_empty() has flagged that it is waiting.
+    if (unlikely(finisher_empty_wait))
+      finisher_empty_cond.notify_all();
+    if (finisher_stop)
+      break;
+
+    ldout(cct, 10) << "finisher_thread sleeping" << dendl;
+    finisher_cond.wait(ul);
+  }
+  // If we are exiting, wake any thread still blocked in wait_for_empty();
+  // otherwise it would never unblock.
+  finisher_empty_cond.notify_all();
+
+  ldout(cct, 10) << "finisher_thread stop" << dendl;
+  // NOTE(review): presumably reset so the Finisher can be started again --
+  // confirm against callers.
+  finisher_stop = false;
+  return 0;
+}
+
diff --git a/src/common/Finisher.h b/src/common/Finisher.h
new file mode 100644
index 00000000..cca3f81c
--- /dev/null
+++ b/src/common/Finisher.h
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_FINISHER_H
+#define CEPH_FINISHER_H
+
+#include "include/Context.h"
+#include "common/Thread.h"
+#include "common/ceph_mutex.h"
+#include "common/perf_counters.h"
+#include "common/Cond.h"
+
+class CephContext;
+
+/// Finisher performance counter IDs. l_finisher_first and l_finisher_last
+/// bound the range handed to PerfCountersBuilder.
+enum {
+  l_finisher_first = 997082,
+  l_finisher_queue_len,     ///< registered as "queue_len"
+  l_finisher_complete_lat,  ///< registered as "complete_latency"
+  l_finisher_last
+};
+
+/** @brief Asynchronous cleanup class.
+ * Finisher asynchronously completes Contexts, which are simple classes
+ * representing callbacks, in a dedicated worker thread. Enqueuing
+ * contexts to complete is thread-safe.
+ */
+class Finisher {
+  CephContext *cct;
+  ceph::mutex finisher_lock; ///< Protects access to queues and finisher_running.
+  ceph::condition_variable finisher_cond; ///< Signaled when there is something to process.
+  ceph::condition_variable finisher_empty_cond; ///< Signaled when the finisher has nothing more to process.
+  bool         finisher_stop; ///< Set when the finisher should stop.
+  bool         finisher_running; ///< True when the finisher is currently executing contexts.
+  bool         finisher_empty_wait; ///< True while someone is waiting for the finisher to become empty.
+
+  /// Queue of contexts, each paired with the value passed to its complete().
+  vector<pair<Context*,int>> finisher_queue;
+
+  string thread_name;
+
+  /// Performance counter for the finisher's queue length.
+  /// Only active for named finishers.
+  PerfCounters *logger;
+
+  void *finisher_thread_entry();
+
+  struct FinisherThread : public Thread {
+    Finisher *fin;
+    explicit FinisherThread(Finisher *f) : fin(f) {}
+    void* entry() override { return fin->finisher_thread_entry(); }
+  } finisher_thread;
+
+  /// Shared implementation of the container overloads of queue(): append
+  /// every context in ls with a completion value of 0, then empty ls.
+  template <typename Container>
+  void queue_all(Container& ls) {
+    {
+      std::unique_lock ul(finisher_lock);
+      // Wake the worker only on the empty -> non-empty transition. The lock
+      // is held until after the push, so notifying first is safe.
+      if (finisher_queue.empty()) {
+	finisher_cond.notify_all();
+      }
+      for (auto i : ls) {
+	finisher_queue.push_back(make_pair(i, 0));
+      }
+      if (logger)
+	logger->inc(l_finisher_queue_len, ls.size());
+    }
+    ls.clear();
+  }
+
+ public:
+  /// Add a context to complete, optionally specifying a parameter for the complete function.
+  void queue(Context *c, int r = 0) {
+    std::unique_lock ul(finisher_lock);
+    // Wake the worker only on the empty -> non-empty transition.
+    if (finisher_queue.empty()) {
+      finisher_cond.notify_all();
+    }
+    finisher_queue.push_back(make_pair(c, r));
+    if (logger)
+      logger->inc(l_finisher_queue_len);
+  }
+
+  /// Add every context in ls (each completed with 0) and clear ls.
+  void queue(list<Context*>& ls) {
+    queue_all(ls);
+  }
+  /// Add every context in ls (each completed with 0) and clear ls.
+  void queue(deque<Context*>& ls) {
+    queue_all(ls);
+  }
+  /// Add every context in ls (each completed with 0) and clear ls.
+  void queue(vector<Context*>& ls) {
+    queue_all(ls);
+  }
+
+  /// Start the worker thread.
+  void start();
+
+  /** @brief Stop the worker thread.
+   *
+   * Does not wait until all outstanding contexts are completed.
+   * To ensure that everything finishes, you should first shut down
+   * all sources that can add contexts to this finisher and call
+   * wait_for_empty() before calling stop(). */
+  void stop();
+
+  /** @brief Blocks until the finisher has nothing left to process.
+   * This function will also return when a concurrent call to stop()
+   * finishes, but this class should never be used in this way. */
+  void wait_for_empty();
+
+  /// Construct an anonymous Finisher.
+  /// Anonymous finishers do not log their queue length.
+  explicit Finisher(CephContext *cct_) :
+    cct(cct_), finisher_lock(ceph::make_mutex("Finisher::finisher_lock")),
+    finisher_stop(false), finisher_running(false), finisher_empty_wait(false),
+    thread_name("fn_anonymous"), logger(0),
+    finisher_thread(this) {}
+
+  /// Construct a named Finisher that logs its queue length.
+  Finisher(CephContext *cct_, string name, string tn) :
+    cct(cct_), finisher_lock(ceph::make_mutex("Finisher::" + name)),
+    finisher_stop(false), finisher_running(false), finisher_empty_wait(false),
+    thread_name(tn), logger(0),
+    finisher_thread(this) {
+    // Register the "finisher-<name>" counters and start them at zero.
+    PerfCountersBuilder b(cct, string("finisher-") + name,
+			  l_finisher_first, l_finisher_last);
+    b.add_u64(l_finisher_queue_len, "queue_len");
+    b.add_time_avg(l_finisher_complete_lat, "complete_latency");
+    logger = b.create_perf_counters();
+    cct->get_perfcounters_collection()->add(logger);
+    logger->set(l_finisher_queue_len, 0);
+    logger->set(l_finisher_complete_lat, 0);
+  }
+
+  /// Unregister and free the perf counters, if any were created.
+  ~Finisher() {
+    if (logger && cct) {
+      cct->get_perfcounters_collection()->remove(logger);
+      delete logger;
+    }
+  }
+};
+
+/// Context that is completed asynchronously on the supplied finisher.
+class C_OnFinisher : public Context {
+  Context *con;   // wrapped context; owned until it is handed to the finisher
+  Finisher *fin;  // finisher the wrapped context is queued on
+public:
+  C_OnFinisher(Context *c, Finisher *f) : con(c), fin(f) {
+    ceph_assert(fin != nullptr);
+    ceph_assert(con != nullptr);
+  }
+
+  // If finish() never ran we still own the wrapped context, so free it.
+  ~C_OnFinisher() override {
+    delete con;  // deleting a null pointer is a no-op
+    con = nullptr;
+  }
+
+  // Queue the wrapped context on the finisher with result r and give up
+  // ownership of it.
+  void finish(int r) override {
+    fin->queue(con, r);
+    con = nullptr;
+  }
+};
+
+/// A list of contexts guarded by its own mutex. A caller-supplied condition
+/// variable is notified whenever the list goes from empty to non-empty.
+class ContextQueue {
+  list<Context *> q;
+  std::mutex q_mutex;              // guards q
+  ceph::mutex& mutex;              // caller-owned lock held while notifying cond
+  ceph::condition_variable& cond;  // notified when q was empty before a queue()
+public:
+  ContextQueue(ceph::mutex& mut,
+	       ceph::condition_variable& con)
+    : mutex(mut), cond(con) {}
+
+  /// Move every context from ls to the back of the queue, leaving ls empty.
+  void queue(list<Context *>& ls) {
+    bool empty = false;
+    {
+      std::scoped_lock l(q_mutex);
+      if (q.empty()) {
+	q.swap(ls);
+	empty = true;
+      } else {
+	// Relink the nodes in O(1) instead of copying each element and
+	// then discarding the originals.
+	q.splice(q.end(), ls);
+      }
+    }
+
+    // Notify outside q_mutex; only the empty -> non-empty transition matters.
+    if (empty) {
+      std::scoped_lock l{mutex};
+      cond.notify_all();
+    }
+
+    ls.clear();
+  }
+
+  /// Replace the contents of ls with everything currently queued, leaving
+  /// the queue empty.
+  void swap(list<Context *>& ls) {
+    ls.clear();
+    std::scoped_lock l(q_mutex);
+    if (!q.empty()) {
+      q.swap(ls);
+    }
+  }
+
+  /// Return true if nothing is queued.
+  bool empty() {
+    std::scoped_lock l(q_mutex);
+    return q.empty();
+  }
+};
+
+#endif
diff --git a/src/common/Formatter.cc b/src/common/Formatter.cc
new file mode 100644
index 00000000..786272c1
--- /dev/null
+++ b/src/common/Formatter.cc
@@ -0,0 +1,951 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#define LARGE_SIZE 1024
+
+#include "HTMLFormatter.h"
+#include "common/escape.h"
+#include "include/buffer.h"
+
+#include <set>
+#include <limits>
+#include <boost/format.hpp>
+
+// -----------------------
+namespace ceph {
+
+// Render num as a decimal string with a '.' inserted `scale` digits from
+// the right.  The digits are zero-padded to at least scale + 1 characters so
+// there is always one digit before the point.
+std::string
+fixed_u_to_string(uint64_t num, int scale)
+{
+  std::ostringstream padded;
+  padded.width(scale + 1);
+  padded.fill('0');
+  padded << num;
+
+  const std::string digits = padded.str();
+  int len = digits.size();
+  std::string out = digits.substr(0, len - scale);
+  out += ".";
+  out += digits.substr(len - scale);
+  return out;
+}
+
+// Signed variant of fixed_u_to_string: render num as a decimal string with
+// a '.' inserted `scale` digits from the right, prefixed with '-' when num
+// is negative.
+std::string
+fixed_to_string(int64_t num, int scale)
+{
+  const bool neg = num < 0;
+
+  // Take the magnitude in unsigned arithmetic.  Negating the most negative
+  // int64_t as a signed value overflows, which is undefined behaviour;
+  // unsigned negation is well defined and yields the correct magnitude.
+  uint64_t magnitude = static_cast<uint64_t>(num);
+  if (neg)
+    magnitude = 0 - magnitude;
+
+  std::ostringstream t;
+  t.fill('0');
+  t.width(scale + 1);
+  t << magnitude;
+
+  const std::string digits = t.str();
+  int len = digits.size();
+  return (neg ? "-" : "") + digits.substr(0, len - scale) + "." + digits.substr(len - scale);
+}
+
+/*
+ * FormatterAttrs(const char *attr, ...)
+ *
+ * Requires a list of attrs followed by NULL. The attrs should be char *
+ * pairs, first one is the name, second one is the value. E.g.,
+ *
+ * FormatterAttrs("name1", "value1", "name2", "value2", NULL);
+ *
+ * Parsing stops at the first NULL, whether it appears in a name or in a
+ * value position; a name without a value is dropped.
+ */
+FormatterAttrs::FormatterAttrs(const char *attr, ...)
+{
+  va_list ap;
+  va_start(ap, attr);
+  // Test the name before fetching its value, so that FormatterAttrs(NULL)
+  // never reads a variadic argument that was not passed.
+  for (const char *s = attr; s != nullptr; s = va_arg(ap, char *)) {
+    const char *val = va_arg(ap, char *);
+    if (!val)
+      break;
+
+    attrs.emplace_back(s, val);
+  }
+  va_end(ap);
+}
+
+// The base class has nothing to set up or tear down; both special members
+// are defaulted here, out of line.
+Formatter::Formatter() = default;
+
+Formatter::~Formatter() = default;
+
+// Construct the formatter named by `type`.  An empty `type` selects
+// `default_type`.  When the resulting name is not recognised, `fallback` is
+// tried (once, with no further fallback); nullptr is returned if that is
+// empty or unrecognised too.  The caller owns the returned object.
+Formatter *Formatter::create(std::string_view type,
+                             std::string_view default_type,
+                             std::string_view fallback)
+{
+  const std::string_view wanted = type.empty() ? default_type : type;
+
+  if (wanted == "json")
+    return new JSONFormatter(false);
+  if (wanted == "json-pretty")
+    return new JSONFormatter(true);
+  if (wanted == "xml")
+    return new XMLFormatter(false);
+  if (wanted == "xml-pretty")
+    return new XMLFormatter(true);
+  if (wanted == "table")
+    return new TableFormatter();
+  if (wanted == "table-kv")
+    return new TableFormatter(true);
+  if (wanted == "html")
+    return new HTMLFormatter(false);
+  if (wanted == "html-pretty")
+    return new HTMLFormatter(true);
+
+  if (!fallback.empty())
+    return create(fallback, "", "");
+
+  return nullptr;
+}
+
+
+// Flush into a bufferlist: render through flush(std::ostream&) into a
+// temporary stream and append the resulting text to bl.
+void Formatter::flush(bufferlist &bl)
+{
+  std::stringstream rendered;
+  flush(rendered);
+  bl.append(rendered.str());
+}
+
+// printf-style value without a namespace; forwarded with quoted == true.
+void Formatter::dump_format(const char *name, const char *fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  dump_format_va(name, nullptr, true, fmt, args);
+  va_end(args);
+}
+
+// printf-style value in namespace ns; forwarded with quoted == true.
+void Formatter::dump_format_ns(const char *name, const char *ns, const char *fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  dump_format_va(name, ns, true, fmt, args);
+  va_end(args);
+}
+
+// printf-style value without a namespace; forwarded with quoted == false.
+void Formatter::dump_format_unquoted(const char *name, const char *fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  dump_format_va(name, nullptr, false, fmt, args);
+  va_end(args);
+}
+
+// -----------------------
+
+// Build a JSON formatter; p selects pretty (indented, multi-line) output.
+// Starts with no pending dump_stream() value and cleared buffers.
+JSONFormatter::JSONFormatter(bool p)
+: m_pretty(p), m_is_pending_string(false)
+{
+  reset();
+}
+
+// Write everything buffered so far to os and empty the buffer.
+void JSONFormatter::flush(std::ostream& os)
+{
+  // A value started with dump_stream() has to be emitted first.
+  finish_pending_string();
+
+  os << m_ss.str();
+  if (m_line_break_enabled) {
+    os << "\n";
+  }
+
+  // Drop the buffered text and clear the stream's state flags.
+  m_ss.str("");
+  m_ss.clear();
+}
+
+// Discard all buffered output and open sections.
+void JSONFormatter::reset()
+{
+  m_stack.clear();
+  m_ss.clear();
+  m_ss.str("");
+  m_pending_string.clear();
+  m_pending_string.str("");
+  // dump_stream() records a pending value through these two members.  They
+  // must be cleared as well, otherwise a value that was pending at reset
+  // time is emitted (with its old name and an empty body) by the next
+  // call to finish_pending_string().
+  m_pending_name.clear();
+  m_is_pending_string = false;
+}
+
+// Emit the separator that precedes the next element of `entry`: a comma if
+// the section already has elements and, in pretty mode, a newline followed
+// by indentation.
+void JSONFormatter::print_comma(json_formatter_stack_entry_d& entry)
+{
+  if (entry.size) {
+    m_ss << ",";
+  }
+  if (!m_pretty) {
+    return;
+  }
+
+  m_ss << "\n";
+  // four spaces for every open section below the outermost one
+  for (size_t depth = 1; depth < m_stack.size(); ++depth) {
+    m_ss << "    ";
+  }
+  // array elements carry no name, so their extra indent is added here
+  if (entry.is_array) {
+    m_ss << "    ";
+  }
+}
+
+// Append s to the output wrapped in double quotes, passing it through
+// json_stream_escaper.
+void JSONFormatter::print_quoted_string(std::string_view s)
+{
+  m_ss << '\"' << json_stream_escaper(s) << '\"';
+}
+
+// Start a new element in the innermost open section: emit the separator
+// and, unless that section is an array, the quoted name followed by a
+// colon.  Does nothing beyond finishing a pending string when no section
+// is open.
+void JSONFormatter::print_name(const char *name)
+{
+  finish_pending_string();
+  if (m_stack.empty())
+    return;
+
+  auto& top = m_stack.back();
+  print_comma(top);
+  if (!top.is_array) {
+    // name is written as-is (no escaping)
+    m_ss << (m_pretty ? "    \"" : "\"") << name << (m_pretty ? "\": " : "\":");
+  }
+  ++top.size;
+}
+
+// Open an object or array section.  With a namespace, the element name
+// becomes "<name> <ns>".  A subclass may take over via
+// handle_open_section().
+void JSONFormatter::open_section(const char *name, const char *ns, bool is_array)
+{
+  if (handle_open_section(name, ns, is_array)) {
+    return;
+  }
+
+  if (!ns) {
+    print_name(name);
+  } else {
+    std::ostringstream label;
+    label << name << " " << ns;
+    print_name(label.str().c_str());
+  }
+
+  m_ss << (is_array ? '[' : '{');
+
+  json_formatter_stack_entry_d opened;
+  opened.is_array = is_array;
+  m_stack.push_back(opened);
+}
+
+// Open an array section with no namespace.
+void JSONFormatter::open_array_section(const char *name)
+{
+  open_section(name, nullptr, true);
+}
+
+// Open an array section in namespace ns.
+void JSONFormatter::open_array_section_in_ns(const char *name, const char *ns)
+{
+  open_section(name, ns, true);
+}
+
+// Open an object section with no namespace.
+void JSONFormatter::open_object_section(const char *name)
+{
+  open_section(name, nullptr, false);
+}
+
+// Open an object section in namespace ns.
+void JSONFormatter::open_object_section_in_ns(const char *name, const char *ns)
+{
+  open_section(name, ns, false);
+}
+
+// Close the innermost open section.  Asserts that one is open unless a
+// subclass handles the call via handle_close_section().
+void JSONFormatter::close_section()
+{
+  if (handle_close_section()) {
+    return;
+  }
+  ceph_assert(!m_stack.empty());
+  finish_pending_string();
+
+  const auto& top = m_stack.back();
+  if (m_pretty && top.size) {
+    // non-empty sections get their closing bracket on its own line
+    m_ss << "\n";
+    for (size_t depth = 1; depth < m_stack.size(); ++depth) {
+      m_ss << "    ";
+    }
+  }
+  const char closer = top.is_array ? ']' : '}';
+  m_ss << closer;
+  m_stack.pop_back();
+
+  // closing the outermost section ends the pretty-printed document line
+  if (m_pretty && m_stack.empty()) {
+    m_ss << "\n";
+  }
+}
+
+// Emit the value collected through dump_stream(), if any, as a quoted
+// string.
+void JSONFormatter::finish_pending_string()
+{
+  if (!m_is_pending_string) {
+    return;
+  }
+  // Clear the flag before add_value(): add_value() reaches print_name(),
+  // which calls back into this function.
+  m_is_pending_string = false;
+  add_value(m_pending_name.c_str(), m_pending_string.str(), true);
+  m_pending_string.str("");
+}
+
+// Render val with operator<< at max_digits10 precision for T and add the
+// text as an unquoted value.
+template <class T>
+void JSONFormatter::add_value(const char *name, T val)
+{
+  std::stringstream rendered;
+  rendered.precision(std::numeric_limits<T>::max_digits10);
+  rendered << val;
+  add_value(name, rendered.str(), false);
+}
+
+// Add one named value.  Quoted values are escaped and wrapped in double
+// quotes; unquoted values are written verbatim.  A subclass may take over
+// via handle_value().
+void JSONFormatter::add_value(const char *name, std::string_view val, bool quoted)
+{
+  if (handle_value(name, val, quoted)) {
+    return;
+  }
+  print_name(name);
+  if (quoted) {
+    print_quoted_string(val);
+  } else {
+    m_ss << val;
+  }
+}
+
+// Emit an unsigned integer as an unquoted value.
+void JSONFormatter::dump_unsigned(const char *name, uint64_t u)
+{
+  add_value(name, u);
+}
+
+// Emit a signed integer as an unquoted value.
+void JSONFormatter::dump_int(const char *name, int64_t s)
+{
+  add_value(name, s);
+}
+
+// Emit a double as an unquoted value.
+void JSONFormatter::dump_float(const char *name, double d)
+{
+  add_value(name, d);
+}
+
+// Emit a string as a quoted value.
+void JSONFormatter::dump_string(const char *name, std::string_view s)
+{
+  add_value(name, s, true);
+}
+
+// Begin a value whose text the caller writes into the returned stream.  The
+// value is emitted later, by finish_pending_string(), as a quoted string.
+std::ostream& JSONFormatter::dump_stream(const char *name)
+{
+  // emit any previously started stream value before reusing the buffer
+  finish_pending_string();
+  m_pending_name = name;
+  m_is_pending_string = true;
+  return m_pending_string;
+}
+
+// Format a printf-style value and add it under `name`.  `ns` is not used by
+// the JSON formatter.  Output that does not fit the LARGE_SIZE stack buffer
+// is formatted again into a heap buffer instead of being truncated.
+void JSONFormatter::dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  char buf[LARGE_SIZE];
+  std::string big;
+  const char *text = buf;
+
+  // Format through a copy so that `ap` is still intact for a second pass.
+  va_list ap_copy;
+  va_copy(ap_copy, ap);
+  const int needed = vsnprintf(buf, sizeof(buf), fmt, ap_copy);
+  va_end(ap_copy);
+
+  if (needed < 0) {
+    // formatting failed; do not emit whatever happens to be in buf
+    buf[0] = '\0';
+  } else if (static_cast<size_t>(needed) >= sizeof(buf)) {
+    // vsnprintf returns the length the full output needs; retry with room
+    // for it plus the terminating NUL.
+    big.resize(static_cast<size_t>(needed) + 1);
+    vsnprintf(&big[0], big.size(), fmt, ap);
+    text = big.c_str();
+  }
+
+  add_value(name, text, quoted);
+}
+
+// Number of characters currently buffered (a pending dump_stream() value is
+// not included until it is finished).
+int JSONFormatter::get_len() const
+{
+  return m_ss.str().size();
+}
+
+// Append data to the output buffer verbatim, with no quoting or escaping.
+void JSONFormatter::write_raw_data(const char *data)
+{
+  m_ss << data;
+}
+
+// XML declaration line written by output_header().
+const char *XMLFormatter::XML_1_DTD =
+  "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
+
+// pretty:      newline after each element, one space of indent per open section
+// lowercased:  element names are lower-cased (see to_lower_underscore)
+// underscored: spaces in element names become '_' (see to_lower_underscore)
+XMLFormatter::XMLFormatter(bool pretty, bool lowercased, bool underscored)
+: m_pretty(pretty),
+  m_lowercased(lowercased),
+  m_underscored(underscored)
+{
+  reset();
+}
+
+// Write everything buffered so far to os and empty the buffer.
+void XMLFormatter::flush(std::ostream& os)
+{
+  finish_pending_string();
+
+  const std::string rendered = m_ss.str();
+  os << rendered;
+
+  /* In pretty mode a trailing newline is only wanted when the formatter
+   * produced output at all; an empty document (e.g. on HTTP redirects) must
+   * stay empty.  An explicitly enabled line break is always written. */
+  const bool pretty_with_output = m_pretty && !rendered.empty();
+  if (pretty_with_output || m_line_break_enabled) {
+    os << "\n";
+  }
+
+  m_ss.str("");
+  m_ss.clear();
+}
+
+// Return the formatter to its initial state: no buffered output, no open
+// sections, no pending dump_stream() value, header not yet written.
+void XMLFormatter::reset()
+{
+  // output buffer
+  m_ss.str("");
+  m_ss.clear();
+
+  // pending dump_stream() value
+  m_pending_string.str("");
+  m_pending_string.clear();
+  m_pending_string_name.clear();
+
+  // document structure
+  m_sections.clear();
+  m_header_done = false;
+}
+
+// Write the XML declaration once; later calls are no-ops until reset().
+void XMLFormatter::output_header()
+{
+  if (m_header_done) {
+    return;
+  }
+  m_header_done = true;
+  write_raw_data(XMLFormatter::XML_1_DTD);
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+// Close every section that is still open, innermost first.
+void XMLFormatter::output_footer()
+{
+  while(!m_sections.empty()) {
+    close_section();
+  }
+}
+
+// XML makes no distinction between object and array sections: every
+// variant below opens an element through open_section_in_ns() and differs
+// only in whether a namespace or attributes are supplied.
+
+void XMLFormatter::open_object_section(const char *name)
+{
+  open_section_in_ns(name, nullptr, nullptr);
+}
+
+void XMLFormatter::open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs)
+{
+  open_section_in_ns(name, nullptr, &attrs);
+}
+
+void XMLFormatter::open_object_section_in_ns(const char *name, const char *ns)
+{
+  open_section_in_ns(name, ns, nullptr);
+}
+
+void XMLFormatter::open_array_section(const char *name)
+{
+  open_section_in_ns(name, nullptr, nullptr);
+}
+
+void XMLFormatter::open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs)
+{
+  open_section_in_ns(name, nullptr, &attrs);
+}
+
+void XMLFormatter::open_array_section_in_ns(const char *name, const char *ns)
+{
+  open_section_in_ns(name, ns, nullptr);
+}
+
+// Emit the closing tag of the innermost open section.  Asserts that a
+// section is open.
+void XMLFormatter::close_section()
+{
+  ceph_assert(!m_sections.empty());
+  finish_pending_string();
+
+  // m_sections stores the name as given; normalise it the same way the
+  // opening tag was normalised.
+  std::string tag = m_sections.back();
+  for (char& c : tag) {
+    c = to_lower_underscore(c);
+  }
+  m_sections.pop_back();
+
+  // indentation is computed after the pop, i.e. at the parent's depth
+  print_spaces();
+  m_ss << "</" << tag << ">";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+// Emit <name>val</name>, rendering val with operator<< at max_digits10
+// precision for T.  The value is not escaped.
+template <class T>
+void XMLFormatter::add_value(const char *name, T val)
+{
+  std::string tag(name);
+  for (char& c : tag) {
+    c = to_lower_underscore(c);
+  }
+
+  print_spaces();
+  m_ss.precision(std::numeric_limits<T>::max_digits10);
+  m_ss << "<" << tag << ">" << val << "</" << tag << ">";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+// Emit an unsigned integer element.
+void XMLFormatter::dump_unsigned(const char *name, uint64_t u)
+{
+  add_value(name, u);
+}
+
+// Emit a signed integer element.
+void XMLFormatter::dump_int(const char *name, int64_t s)
+{
+  add_value(name, s);
+}
+
+// Emit a floating-point element.
+void XMLFormatter::dump_float(const char *name, double d)
+{
+  add_value(name, d);
+}
+
+// Emit <name>s</name> with s passed through xml_stream_escaper.
+void XMLFormatter::dump_string(const char *name, std::string_view s)
+{
+  std::string tag(name);
+  for (char& c : tag) {
+    c = to_lower_underscore(c);
+  }
+
+  print_spaces();
+  m_ss << "<" << tag << ">" << xml_stream_escaper(s) << "</" << tag << ">";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+// Emit <name attr="value" ...>s</name>.  s is passed through
+// xml_stream_escaper; the attribute text comes from get_attrs_str().
+void XMLFormatter::dump_string_with_attrs(const char *name, std::string_view s, const FormatterAttrs& attrs)
+{
+  std::string tag(name);
+  for (char& c : tag) {
+    c = to_lower_underscore(c);
+  }
+
+  std::string rendered_attrs;
+  get_attrs_str(&attrs, rendered_attrs);
+
+  print_spaces();
+  m_ss << "<" << tag << rendered_attrs << ">" << xml_stream_escaper(s) << "</" << tag << ">";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+// Begin an element whose text the caller writes into the returned stream.
+// The opening tag is emitted now; the escaped text and the closing tag are
+// emitted by finish_pending_string().
+std::ostream& XMLFormatter::dump_stream(const char *name)
+{
+  print_spaces();
+
+  // Normalise the element name exactly like every other emitter in this
+  // class does, so a name containing spaces or upper-case letters yields
+  // the same tag here as it does through dump_string().
+  std::string tag(name);
+  for (char& c : tag) {
+    c = to_lower_underscore(c);
+  }
+
+  m_pending_string_name = tag;
+  m_ss << "<" << m_pending_string_name << ">";
+  return m_pending_string;
+}
+
+// Format a printf-style value and emit it as an element, optionally with an
+// xmlns attribute.  `quoted` is not used by the XML formatter.
+void XMLFormatter::dump_format_va(const char* name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  char buf[LARGE_SIZE];
+  std::string big;
+  const char *text = buf;
+  size_t len = 0;
+
+  // Format through a copy so that `ap` is still intact for a second pass.
+  va_list ap_copy;
+  va_copy(ap_copy, ap);
+  const int needed = vsnprintf(buf, sizeof(buf), fmt, ap_copy);
+  va_end(ap_copy);
+
+  if (needed < 0) {
+    // formatting failed; emit an empty value rather than stale bytes
+    buf[0] = '\0';
+  } else if (static_cast<size_t>(needed) < sizeof(buf)) {
+    len = needed;
+  } else {
+    // vsnprintf returns the length the full output needs, which is larger
+    // than what was stored in buf.  Using it as the length of buf would
+    // read past the array, so format again into a buffer of that size.
+    big.resize(static_cast<size_t>(needed) + 1);
+    vsnprintf(&big[0], big.size(), fmt, ap);
+    text = big.c_str();
+    len = needed;
+  }
+
+  std::string e(name);
+  std::transform(e.begin(), e.end(), e.begin(),
+      [this](char c) { return this->to_lower_underscore(c); });
+
+  print_spaces();
+  if (ns) {
+    // NOTE(review): unlike the branch below, this value is written without
+    // XML escaping -- confirm whether that is intentional.
+    m_ss << "<" << e << " xmlns=\"" << ns << "\">" << text << "</" << e << ">";
+  } else {
+    m_ss << "<" << e << ">" << xml_stream_escaper(std::string_view(text, len)) << "</" << e << ">";
+  }
+
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+// Number of characters currently buffered.
+int XMLFormatter::get_len() const
+{
+  return m_ss.str().size();
+}
+
+// Append data to the output buffer verbatim, with no escaping.
+void XMLFormatter::write_raw_data(const char *data)
+{
+  m_ss << data;
+}
+
+// Render attrs as ` name="value"` pairs (each with a leading space) into
+// attrs_str, replacing its previous contents.  Names and values are written
+// as given.
+void XMLFormatter::get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str)
+{
+  std::stringstream rendered;
+
+  for (const auto& attr : attrs->attrs) {
+    rendered << " " << attr.first << "=" << "\"" << attr.second << "\"";
+  }
+
+  attrs_str = rendered.str();
+}
+
+// Emit an opening tag for `name`, with optional attributes and an optional
+// xmlns attribute, and record the section as open.
+void XMLFormatter::open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs)
+{
+  print_spaces();
+
+  std::string rendered_attrs;
+  if (attrs) {
+    get_attrs_str(attrs, rendered_attrs);
+  }
+
+  std::string tag(name);
+  for (char& c : tag) {
+    c = to_lower_underscore(c);
+  }
+
+  m_ss << "<" << tag << rendered_attrs;
+  if (ns) {
+    m_ss << " xmlns=\"" << ns << "\"";
+  }
+  m_ss << ">";
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+
+  // the untransformed name is stored; close_section() normalises it again
+  m_sections.push_back(name);
+}
+
+// Complete an element started by dump_stream(): write its escaped text and
+// closing tag.  Does nothing when no stream element is pending.
+void XMLFormatter::finish_pending_string()
+{
+  if (m_pending_string_name.empty()) {
+    return;
+  }
+
+  m_ss << xml_stream_escaper(m_pending_string.str())
+    << "</" << m_pending_string_name << ">";
+  m_pending_string_name.clear();
+  m_pending_string.str(std::string());
+  if (m_pretty) {
+    m_ss << "\n";
+  }
+}
+
+// Finish any pending dump_stream() element and, in pretty mode, indent by
+// one space per open section.
+void XMLFormatter::print_spaces()
+{
+  finish_pending_string();
+  if (!m_pretty) {
+    return;
+  }
+  m_ss << std::string(m_sections.size(), ' ');
+}
+
+// Normalise one character of an element name: a space becomes '_' when
+// underscoring is enabled; otherwise the character is lower-cased when
+// lower-casing is enabled; otherwise it is returned unchanged.
+char XMLFormatter::to_lower_underscore(char c) const
+{
+  if (m_underscored && c == ' ') {
+    return '_';
+  }
+  if (m_lowercased) {
+    // std::tolower requires a value representable as unsigned char (or
+    // EOF); passing a negative plain char is undefined behaviour.
+    return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+  }
+  return c;
+}
+
+// keyval selects the key::name="value" output form used by flush() instead
+// of the boxed table.
+TableFormatter::TableFormatter(bool keyval) : m_keyval(keyval)
+{
+  reset();
+}
+
+// Write all buffered rows to os and discard them.
+//
+// In table mode each row is printed as "| value | value |"; the first row is
+// preceded by a boxed header with the column names when the column widths
+// differ from those of the previous flush, and the last row is followed by a
+// rule.  In keyval mode each cell is printed as key::<name>="<value>".
+void TableFormatter::flush(std::ostream& os)
+{
+  finish_pending_string();
+  std::vector<size_t> column_size = m_column_size;
+  std::vector<std::string> column_name = m_column_name;
+
+  std::set<int> need_header_set;
+
+  // Size the per-column bookkeeping once, to the widest buffered row.
+  // Resizing to each row's own length inside the loop can shrink the
+  // vectors and drop widths that the printing code below still indexes.
+  size_t num_columns = 0;
+  for (const auto& row : m_vec) {
+    if (row.size() > num_columns)
+      num_columns = row.size();
+  }
+  if (num_columns > 0) {
+    column_size.resize(num_columns);
+    column_name.resize(num_columns);
+  }
+
+  // auto-sizing columns
+  for (size_t i = 0; i < m_vec.size(); i++) {
+    for (size_t j = 0; j < m_vec[i].size(); j++) {
+      const auto& cell = m_vec[i][j];
+      if (i > 0) {
+        // A shorter previous row has no cell in this column to compare to.
+        if (j >= m_vec[i - 1].size() || m_vec[i - 1][j] != cell) {
+          // changing row labels require to show the header
+          need_header_set.insert(i);
+          // column_name holds one entry per column, so it is indexed by
+          // the column j; indexing it by the row i writes out of bounds
+          // as soon as there are more rows than columns.
+          column_name[j] = cell.first;
+        }
+      } else {
+        column_name[j] = cell.first;
+      }
+
+      if (cell.second.length() > column_size[j])
+        column_size[j] = cell.second.length();
+      if (cell.first.length() > column_size[j])
+        column_size[j] = cell.first.length();
+    }
+  }
+
+  bool need_header = false;
+  if ((column_size.size() == m_column_size.size())) {
+    for (size_t i = 0; i < column_size.size(); i++) {
+      if (column_size[i] != m_column_size[i]) {
+        need_header = true;
+        break;
+      }
+    }
+  } else {
+    need_header = true;
+  }
+
+  if (need_header) {
+    // first row always needs a header if there wasn't one before
+    need_header_set.insert(0);
+  }
+
+  m_column_size = column_size;
+
+  // "+-----+-----+" rule sized to the columns of `row`
+  auto print_rule = [&](const std::vector<std::pair<std::string, std::string> >& row) {
+    os << "+";
+    for (size_t j = 0; j < row.size(); j++) {
+      for (size_t v = 0; v < m_column_size[j] + 3; v++)
+        os << "-";
+      os << "+";
+    }
+    os << "\n";
+  };
+
+  // one " text   |" cell, left-justified to the width of column j
+  auto print_cell = [&](const std::string& text, size_t j) {
+    os << " ";
+    std::stringstream fs;
+    fs << boost::format("%%-%is") % (m_column_size[j] + 2);
+    os << boost::format(fs.str()) % text;
+    os << "|";
+  };
+
+  for (size_t i = 0; i < m_vec.size(); i++) {
+    const auto& row = m_vec[i];
+
+    if (i == 0 && need_header_set.count(i) && !m_keyval) {
+      // print the header
+      print_rule(row);
+      os << "|";
+      for (size_t j = 0; j < row.size(); j++)
+        print_cell(row[j].first, j);
+      os << "\n";
+      print_rule(row);
+    }
+
+    // print body
+    if (!m_keyval)
+      os << "|";
+    for (size_t j = 0; j < row.size(); j++) {
+      if (m_keyval) {
+        os << "key::";
+        os << row[j].first;
+        os << "=";
+        os << "\"";
+        os << row[j].second;
+        os << "\" ";
+      } else {
+        print_cell(row[j].second, j);
+      }
+    }
+    os << "\n";
+
+    if (!m_keyval && i == (m_vec.size() - 1)) {
+      // print trailer
+      print_rule(row);
+    }
+  }
+  m_vec.clear();
+}
+
+// Return the formatter to its initial state: no buffered rows, no open
+// sections, no pending dump_stream() value and no remembered column layout.
+void TableFormatter::reset()
+{
+  m_ss.clear();
+  m_ss.str("");
+  // Drop buffered rows and any pending stream value so they cannot leak
+  // into the output produced after the reset.
+  m_vec.clear();
+  m_pending_name.clear();
+  // m_section and m_section_open are maintained together by
+  // open_section_in_ns(); reset both, not just the counter.
+  m_section.clear();
+  m_section_cnt.clear();
+  m_column_size.clear();
+  m_column_name.clear();
+  m_section_open = 0;
+}
+
+// The table formatter only tracks section nesting; namespaces and
+// attributes are ignored, so every variant below forwards just the name.
+
+void TableFormatter::open_object_section(const char *name)
+{
+  open_section_in_ns(name, nullptr, nullptr);
+}
+
+void TableFormatter::open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs)
+{
+  open_section_in_ns(name, nullptr, nullptr);
+}
+
+void TableFormatter::open_object_section_in_ns(const char *name, const char *ns)
+{
+  open_section_in_ns(name, nullptr, nullptr);
+}
+
+void TableFormatter::open_array_section(const char *name)
+{
+  open_section_in_ns(name, nullptr, nullptr);
+}
+
+void TableFormatter::open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs)
+{
+  open_section_in_ns(name, nullptr, nullptr);
+}
+
+void TableFormatter::open_array_section_in_ns(const char *name, const char *ns)
+{
+  open_section_in_ns(name, nullptr, nullptr);
+}
+
+// Record that section `name` is open.  ns and attrs are not used.
+void TableFormatter::open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs)
+{
+  m_section.push_back(name);
+  m_section_open++;
+}
+
+// Close the innermost open section.  A call with no section open is
+// ignored.
+void TableFormatter::close_section()
+{
+  // Never let the depth counter go negative on an unbalanced close:
+  // get_section_name() treats any non-zero value as "inside a section".
+  if (m_section_open > 0) {
+    m_section_open--;
+  }
+  if (!m_section.empty()) {
+    m_section_cnt[m_section.back()] = 0;
+    m_section.pop_back();
+  }
+}
+
+// Return the index of the row that the value called `name` belongs to.  A
+// new row is started when `name` equals the key of the first cell of the
+// current (last) row.
+size_t TableFormatter::m_vec_index(const char *name)
+{
+  const std::string key(name);
+
+  // make sure there is a row to push key/val pairs into
+  if (m_vec.empty())
+    m_vec.resize(1);
+
+  size_t row = m_vec.size() - 1;
+  if (!m_vec[row].empty() && m_vec[row][0].first == key) {
+    // the first key repeats: start a new row
+    m_vec.resize(m_vec.size() + 1);
+    ++row;
+  }
+
+  return row;
+}
+
+// Build the column label for `name`: each open section name is prepended in
+// turn, followed by ':', and while a section is open a per-label counter is
+// appended as "[n]" and then incremented.
+std::string TableFormatter::get_section_name(const char* name)
+{
+  std::string label = name;
+  for (const std::string& section : m_section) {
+    label = section + ":" + label;
+  }
+
+  if (!m_section_open) {
+    return label;
+  }
+
+  std::stringstream numbered;
+  numbered << label;
+  numbered << "[";
+  numbered << m_section_cnt[label]++;
+  numbered << "]";
+  return numbered.str();
+}
+
+// Render val through m_ss (at double's max_digits10 precision) and store it
+// as a cell in the row chosen by m_vec_index().
+template <class T>
+void TableFormatter::add_value(const char *name, T val) {
+  finish_pending_string();
+  const size_t row = m_vec_index(name);
+
+  m_ss.precision(std::numeric_limits<double>::max_digits10);
+  m_ss << val;
+  m_vec[row].emplace_back(get_section_name(name), m_ss.str());
+
+  // leave the scratch stream empty for the next value
+  m_ss.str("");
+  m_ss.clear();
+}
+
+// Store an unsigned integer cell.
+void TableFormatter::dump_unsigned(const char *name, uint64_t u)
+{
+  add_value(name, u);
+}
+
+// Store a signed integer cell.
+void TableFormatter::dump_int(const char *name, int64_t s)
+{
+  add_value(name, s);
+}
+
+// Store a floating-point cell.
+void TableFormatter::dump_float(const char *name, double d)
+{
+  add_value(name, d);
+}
+
+// Store s as a cell in the row chosen by m_vec_index().
+void TableFormatter::dump_string(const char *name, std::string_view s)
+{
+  finish_pending_string();
+  const size_t row = m_vec_index(name);
+
+  m_ss << s;
+  m_vec[row].emplace_back(get_section_name(name), m_ss.str());
+
+  // leave the scratch stream empty for the next value
+  m_ss.str("");
+  m_ss.clear();
+}
+
+// Store a cell whose text is the rendered attributes followed by s.
+void TableFormatter::dump_string_with_attrs(const char *name, std::string_view s, const FormatterAttrs& attrs)
+{
+  finish_pending_string();
+  const size_t row = m_vec_index(name);
+
+  std::string rendered_attrs;
+  get_attrs_str(&attrs, rendered_attrs);
+
+  m_ss << rendered_attrs << s;
+  m_vec[row].emplace_back(get_section_name(name), m_ss.str());
+
+  // leave the scratch stream empty for the next value
+  m_ss.str("");
+  m_ss.clear();
+}
+
+// Format a printf-style value and store it as a cell; with a namespace the
+// cell text is "<ns>.<value>".  `quoted` is not used by the table
+// formatter.  Output that does not fit the LARGE_SIZE stack buffer is
+// formatted again into a heap buffer instead of being truncated.
+void TableFormatter::dump_format_va(const char* name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  finish_pending_string();
+  char buf[LARGE_SIZE];
+  std::string big;
+  const char *text = buf;
+
+  // Format through a copy so that `ap` is still intact for a second pass.
+  va_list ap_copy;
+  va_copy(ap_copy, ap);
+  const int needed = vsnprintf(buf, sizeof(buf), fmt, ap_copy);
+  va_end(ap_copy);
+
+  if (needed < 0) {
+    // formatting failed; do not store whatever happens to be in buf
+    buf[0] = '\0';
+  } else if (static_cast<size_t>(needed) >= sizeof(buf)) {
+    // vsnprintf returns the length the full output needs; retry with room
+    // for it plus the terminating NUL.
+    big.resize(static_cast<size_t>(needed) + 1);
+    vsnprintf(&big[0], big.size(), fmt, ap);
+    text = big.c_str();
+  }
+
+  size_t i = m_vec_index(name);
+  if (ns) {
+    m_ss << ns << "." << text;
+  } else
+    m_ss << text;
+
+  m_vec[i].push_back(std::make_pair(get_section_name(name), m_ss.str()));
+  m_ss.clear();
+  m_ss.str("");
+}
+
+// Remember `name` as the pending value name and return the scratch stream.
+// finish_pending_string() later stores whatever the stream holds as a
+// string cell under that name.
+std::ostream& TableFormatter::dump_stream(const char *name)
+{
+  finish_pending_string();
+  // we don't support this
+  m_pending_name = name;
+  return m_ss;
+}
+
+// Always returns 0: the rendered size is only known once flush() lays out
+// the table.
+int TableFormatter::get_len() const
+{
+  // we don't know the size until flush is called
+  return 0;
+}
+
+// Raw data is ignored by the table formatter.
+void TableFormatter::write_raw_data(const char *data) {
+  // not supported
+}
+
+// Render attrs as ` name="value"` pairs (each with a leading space) into
+// attrs_str, replacing its previous contents.
+void TableFormatter::get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str)
+{
+  std::stringstream rendered;
+
+  for (const auto& attr : attrs->attrs) {
+    rendered << " " << attr.first << "=" << "\"" << attr.second << "\"";
+  }
+
+  attrs_str = rendered.str();
+}
+
+void TableFormatter::finish_pending_string()
+{
+  if (m_pending_name.length()) {
+    std::string ss = m_ss.str();
+    m_ss.clear();
+    m_ss.str("");
+    std::string pending_name = m_pending_name;
+    m_pending_name = "";
+    dump_string(pending_name.c_str(), ss);
+  }
+}
+}
+
diff --git a/src/common/Formatter.h b/src/common/Formatter.h
new file mode 100644
index 00000000..c4cdd552
--- /dev/null
+++ b/src/common/Formatter.h
@@ -0,0 +1,305 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_FORMATTER_H
+#define CEPH_FORMATTER_H
+
+#include "include/int_types.h"
+#include "include/buffer_fwd.h"
+
+#include <deque>
+#include <list>
+#include <vector>
+#include <stdarg.h>
+#include <sstream>
+#include <map>
+
+namespace ceph {
+
+  // Ordered list of (name, value) attribute pairs passed to the *_with_attrs
+  // formatter calls.
+  struct FormatterAttrs {
+    std::list< std::pair<std::string, std::string> > attrs;
+
+    // Variadic constructor taking alternating name/value C strings.
+    FormatterAttrs(const char *attr, ...);
+  };
+
+  // Abstract interface for emitting structured data (sections, named scalar
+  // values, strings) in a concrete output format.
+  class Formatter {
+  public:
+    // Scope guard: opens an object section on construction and closes the
+    // current section on destruction.
+    class ObjectSection {
+      Formatter& formatter;
+
+    public:
+      ObjectSection(Formatter& f, const char *name) : formatter(f) {
+        formatter.open_object_section(name);
+      }
+      ObjectSection(Formatter& f, const char *name, const char *ns) : formatter(f) {
+        formatter.open_object_section_in_ns(name, ns);
+      }
+      ~ObjectSection() {
+        formatter.close_section();
+      }
+    };
+    // Scope guard: opens an array section on construction and closes the
+    // current section on destruction.
+    class ArraySection {
+      Formatter& formatter;
+
+    public:
+      ArraySection(Formatter& f, const char *name) : formatter(f) {
+        formatter.open_array_section(name);
+      }
+      ArraySection(Formatter& f, const char *name, const char *ns) : formatter(f) {
+        formatter.open_array_section_in_ns(name, ns);
+      }
+      ~ArraySection() {
+        formatter.close_section();
+      }
+    };
+
+    // Factory: returns a newly allocated formatter for the given type name.
+    // The shorter overloads pass an empty fallback; the single-argument one
+    // uses "json-pretty" as the default type.
+    static Formatter *create(std::string_view type,
+			     std::string_view default_type,
+			     std::string_view fallback);
+    static Formatter *create(std::string_view type,
+			     std::string_view default_type) {
+      return create(type, default_type, "");
+    }
+    static Formatter *create(std::string_view type) {
+      return create(type, "json-pretty", "");
+    }
+
+    Formatter();
+    virtual ~Formatter();
+
+    virtual void enable_line_break() = 0;
+    // Write the formatted output to os.
+    virtual void flush(std::ostream& os) = 0;
+    // Convenience overload that appends the formatted output to bl.
+    void flush(bufferlist &bl);
+    virtual void reset() = 0;
+
+    virtual void set_status(int status, const char* status_name) = 0;
+    virtual void output_header() = 0;
+    virtual void output_footer() = 0;
+
+    // Section handling: every open_* call is matched by close_section().
+    virtual void open_array_section(const char *name) = 0;
+    virtual void open_array_section_in_ns(const char *name, const char *ns) = 0;
+    virtual void open_object_section(const char *name) = 0;
+    virtual void open_object_section_in_ns(const char *name, const char *ns) = 0;
+    virtual void close_section() = 0;
+    // Scalar values.
+    virtual void dump_unsigned(const char *name, uint64_t u) = 0;
+    virtual void dump_int(const char *name, int64_t s) = 0;
+    virtual void dump_float(const char *name, double d) = 0;
+    virtual void dump_string(const char *name, std::string_view s) = 0;
+    // Default implementation emits the unquoted text "true" or "false".
+    virtual void dump_bool(const char *name, bool b)
+    {
+      dump_format_unquoted(name, "%s", (b ? "true" : "false"));
+    }
+    // Emit foo as an object section by calling foo.dump(this) inside it.
+    template<typename T>
+    void dump_object(const char *name, const T& foo) {
+      open_object_section(name);
+      foo.dump(this);
+      close_section();
+    }
+    // Returns a stream the caller writes the value's text into.
+    virtual std::ostream& dump_stream(const char *name) = 0;
+    // printf-style values; the variadic helpers forward to dump_format_va().
+    virtual void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) = 0;
+    virtual void dump_format(const char *name, const char *fmt, ...);
+    virtual void dump_format_ns(const char *name, const char *ns, const char *fmt, ...);
+    virtual void dump_format_unquoted(const char *name, const char *fmt, ...);
+    virtual int get_len() const = 0;
+    virtual void write_raw_data(const char *data) = 0;
+    /* with attrs */
+    // The default implementations ignore attrs and fall back to the plain
+    // variants.
+    virtual void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs)
+    {
+      open_array_section(name);
+    }
+    virtual void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs)
+    {
+      open_object_section(name);
+    }
+    virtual void dump_string_with_attrs(const char *name, std::string_view s, const FormatterAttrs& attrs)
+    {
+      dump_string(name, s);
+    }
+  };
+
+  // std::stringstream that can be copied: a copy receives the source's
+  // buffered text (stream state and formatting flags are not copied).
+  class copyable_sstream : public std::stringstream {
+  public:
+    copyable_sstream() {}
+    copyable_sstream(const copyable_sstream& rhs) {
+      assign_text(rhs.str());
+    }
+    copyable_sstream& operator=(const copyable_sstream& rhs) {
+      assign_text(rhs.str());
+      return *this;
+    }
+
+  private:
+    // Replace the buffered text and leave the put position at its end.
+    // str(s) on its own places the put position at the start of the
+    // buffer, so the next write would overwrite the copied text instead of
+    // appending to it.
+    void assign_text(const std::string& text) {
+      str(text);
+      rdbuf()->pubseekoff(0, std::ios_base::end, std::ios_base::out);
+    }
+  };
+
+  // Formatter that produces JSON, optionally pretty-printed.
+  class JSONFormatter : public Formatter {
+  public:
+    // p selects pretty-printed output.
+    explicit JSONFormatter(bool p = false);
+
+    // Status, header and footer are no-ops for JSON.
+    void set_status(int status, const char* status_name) override {};
+    void output_header() override {};
+    void output_footer() override {};
+    // Makes flush() append a newline after the output.
+    void enable_line_break() override { m_line_break_enabled = true; }
+    void flush(std::ostream& os) override;
+    using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
+    void reset() override;
+    void open_array_section(const char *name) override;
+    void open_array_section_in_ns(const char *name, const char *ns) override;
+    void open_object_section(const char *name) override;
+    void open_object_section_in_ns(const char *name, const char *ns) override;
+    void close_section() override;
+    void dump_unsigned(const char *name, uint64_t u) override;
+    void dump_int(const char *name, int64_t s) override;
+    void dump_float(const char *name, double d) override;
+    void dump_string(const char *name, std::string_view s) override;
+    std::ostream& dump_stream(const char *name) override;
+    void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+    int get_len() const override;
+    void write_raw_data(const char *data) override;
+
+  protected:
+    // Extension hooks: a subclass returns true to signal that it handled
+    // the call itself, in which case the default JSON output is skipped.
+    virtual bool handle_value(const char *name, std::string_view s, bool quoted) {
+      return false; /* is handling done? */
+    }
+
+    virtual bool handle_open_section(const char *name, const char *ns, bool is_array) {
+      return false; /* is handling done? */
+    }
+
+    virtual bool handle_close_section() {
+      return false; /* is handling done? */
+    }
+
+  private:
+
+    // One entry per open section.
+    struct json_formatter_stack_entry_d {
+      int size;       // number of elements emitted into the section so far
+      bool is_array;  // true for '[' sections, false for '{' sections
+      json_formatter_stack_entry_d() : size(0), is_array(false) { }
+    };
+
+    bool m_pretty;
+    void open_section(const char *name, const char *ns, bool is_array);
+    void print_quoted_string(std::string_view s);
+    void print_name(const char *name);
+    void print_comma(json_formatter_stack_entry_d& entry);
+    void finish_pending_string();
+
+    template <class T>
+    void add_value(const char *name, T val);
+    void add_value(const char *name, std::string_view val, bool quoted);
+
+    copyable_sstream m_ss;              // buffered output
+    copyable_sstream m_pending_string;  // text of a dump_stream() value
+    std::string m_pending_name;         // name of that value
+    std::list<json_formatter_stack_entry_d> m_stack;  // open sections
+    bool m_is_pending_string;           // a dump_stream() value is pending
+    bool m_line_break_enabled = false;
+  };
+
+  // NOTE(review): free function template declared at namespace scope; no
+  // definition is visible in this header -- confirm whether it is still
+  // needed.
+  template <class T>
+  void add_value(const char *name, T val);
+
+  // Formatter that produces XML, optionally pretty-printed.
+  class XMLFormatter : public Formatter {
+  public:
+    // XML declaration written by output_header().
+    static const char *XML_1_DTD;
+    // pretty: newline/indent per element; lowercased / underscored control
+    // how element names are normalised (see to_lower_underscore()).
+    XMLFormatter(bool pretty = false, bool lowercased = false, bool underscored = true);
+
+    void set_status(int status, const char* status_name) override {}
+    void output_header() override;
+    void output_footer() override;
+
+    // Makes flush() append a newline after the output.
+    void enable_line_break() override { m_line_break_enabled = true; }
+    void flush(std::ostream& os) override;
+    using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
+    void reset() override;
+    void open_array_section(const char *name) override;
+    void open_array_section_in_ns(const char *name, const char *ns) override;
+    void open_object_section(const char *name) override;
+    void open_object_section_in_ns(const char *name, const char *ns) override;
+    void close_section() override;
+    void dump_unsigned(const char *name, uint64_t u) override;
+    void dump_int(const char *name, int64_t s) override;
+    void dump_float(const char *name, double d) override;
+    void dump_string(const char *name, std::string_view s) override;
+    std::ostream& dump_stream(const char *name) override;
+    void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+    int get_len() const override;
+    void write_raw_data(const char *data) override;
+
+    /* with attrs */
+    void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs) override;
+    void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs) override;
+    void dump_string_with_attrs(const char *name, std::string_view s, const FormatterAttrs& attrs) override;
+
+  protected:
+    void open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs);
+    void finish_pending_string();
+    void print_spaces();
+    void get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str);
+    char to_lower_underscore(char c) const;
+
+    // m_ss: buffered output; m_pending_string: text of a dump_stream() value
+    std::stringstream m_ss, m_pending_string;
+    std::deque<std::string> m_sections;  // names of the open sections
+    const bool m_pretty;
+    const bool m_lowercased;
+    const bool m_underscored;
+    std::string m_pending_string_name;   // non-empty while a stream value is pending
+    bool m_header_done;                  // output_header() already wrote the declaration
+    bool m_line_break_enabled = false;
+  private:
+    template <class T>
+    void add_value(const char *name, T val);
+  };
+
+  // Formatter that collects values into rows and prints them on flush(),
+  // either as a boxed text table or as key::name="value" pairs.
+  class TableFormatter : public Formatter {
+  public:
+    // keyval selects the key::name="value" output form.
+    explicit TableFormatter(bool keyval = false);
+
+    // Status, header, footer and line breaks are no-ops for tables.
+    void set_status(int status, const char* status_name) override {};
+    void output_header() override {};
+    void output_footer() override {};
+    void enable_line_break() override {};
+    void flush(std::ostream& os) override;
+    using Formatter::flush; // don't hide Formatter::flush(bufferlist &bl)
+    void reset() override;
+    void open_array_section(const char *name) override;
+    void open_array_section_in_ns(const char *name, const char *ns) override;
+    void open_object_section(const char *name) override;
+    void open_object_section_in_ns(const char *name, const char *ns) override;
+
+    void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs) override;
+    void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs) override;
+
+    void close_section() override;
+    void dump_unsigned(const char *name, uint64_t u) override;
+    void dump_int(const char *name, int64_t s) override;
+    void dump_float(const char *name, double d) override;
+    void dump_string(const char *name, std::string_view s) override;
+    void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+    void dump_string_with_attrs(const char *name, std::string_view s, const FormatterAttrs& attrs) override;
+    std::ostream& dump_stream(const char *name) override;
+
+    int get_len() const override;
+    void write_raw_data(const char *data) override;
+    void get_attrs_str(const FormatterAttrs *attrs, std::string& attrs_str);
+
+  private:
+    template <class T>
+    void add_value(const char *name, T val);
+    void open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs);
+    // buffered rows; each cell is a (column label, value text) pair
+    std::vector< std::vector<std::pair<std::string, std::string> > > m_vec;
+    std::stringstream m_ss;  // scratch stream used to render one value
+    size_t m_vec_index(const char* name);
+    std::string get_section_name(const char* name);
+    void finish_pending_string();
+    std::string m_pending_name;  // name of a pending dump_stream() value
+    bool m_keyval;
+
+    int m_section_open;                        // number of open sections
+    std::vector< std::string > m_section;      // names of the open sections
+    std::map<std::string, int> m_section_cnt;  // per-label counters used by get_section_name()
+    std::vector<size_t> m_column_size;         // column widths from the last flush()
+    std::vector< std::string > m_column_name;
+  };
+
+  // Render an integer as a decimal string with a '.' inserted `scale` digits
+  // from the right (signed and unsigned variants).
+  std::string fixed_to_string(int64_t num, int scale);
+  std::string fixed_u_to_string(uint64_t num, int scale);
+}
+#endif
diff --git a/src/common/Graylog.cc b/src/common/Graylog.cc
new file mode 100644
index 00000000..79a537ee
--- /dev/null
+++ b/src/common/Graylog.cc
@@ -0,0 +1,170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "Graylog.h"
+#include "common/Formatter.h"
+#include "common/LogEntry.h"
+#include "log/Entry.h"
+#include "log/SubsystemMap.h"
+
+namespace ceph {
+namespace logging {
+
+// Build a Graylog sink that can translate subsystem ids via `s`; the sink stays
+// disabled (m_log_dst_valid == false) until set_destination() succeeds.
+Graylog::Graylog(const SubsystemMap * const s, const std::string &logger)
+    : m_subs(s),
+      m_log_dst_valid(false),
+      m_logger(logger),  // `logger` is a const reference: std::move() could only ever copy
+      m_ostream_compressed(std::stringstream::in |
+                           std::stringstream::out |
+                           std::stringstream::binary)
+{
+  m_formatter.reset(Formatter::create("json"));
+  m_formatter_section.reset(Formatter::create("json"));
+}
+
+// Build a Graylog sink without a SubsystemMap (m_subs stays null, so log_entry()
+// omits "_subsys_name"); the sink stays disabled until set_destination() succeeds.
+Graylog::Graylog(const std::string &logger)
+    : m_subs(nullptr),
+      m_log_dst_valid(false),
+      m_logger(logger),  // plain copy: a const reference cannot be moved from
+      m_ostream_compressed(std::stringstream::in |
+                           std::stringstream::out |
+                           std::stringstream::binary)
+{
+  m_formatter.reset(Formatter::create("json"));
+  m_formatter_section.reset(Formatter::create("json"));
+}
+
+Graylog::~Graylog()
+{  // intentionally empty: no manual cleanup is performed here
+}
+
+// Resolve host:port to a UDP endpoint; the sink is enabled only if that succeeds.
+void Graylog::set_destination(const std::string& host, int port)
+{
+  try {
+    boost::asio::ip::udp::resolver dns(m_io_service);
+    m_endpoint = *dns.resolve(boost::asio::ip::udp::resolver::query(host, std::to_string(port)));
+    m_log_dst_valid = true;
+  } catch (const boost::system::system_error& err) {
+    m_log_dst_valid = false;
+    cerr << "Error resolving graylog destination: " << err.what() << std::endl;
+  }
+}
+
+void Graylog::set_hostname(const std::string& host)
+{
+  m_hostname.assign(host);  // later emitted as the GELF "host" field
+}
+
+void Graylog::set_fsid(const uuid_d& fsid)
+{
+  char text[40] = {0};  // zero-filled scratch space for the printed uuid
+  fsid.print(text);
+  m_fsid.assign(text);  // keeps everything up to the first NUL; emitted as "_fsid"
+}
+
+// Serialize one log Entry as a GELF JSON object, zlib-compress it and send it
+// in one UDP send_to() call; does nothing until a destination has been set.
+void Graylog::log_entry(const Entry& e)
+{
+  if (!m_log_dst_valid)
+    return;
+  const auto stamp = ceph::logging::log_clock::to_timeval(e.m_stamp);
+  auto message = e.strv();
+
+  m_formatter->open_object_section("");
+  m_formatter->dump_string("version", "1.1");
+  m_formatter->dump_string("host", m_hostname);
+  m_formatter->dump_string("short_message", message);
+  m_formatter->dump_string("_app", "ceph");
+  m_formatter->dump_float("timestamp", stamp.tv_sec + (stamp.tv_usec / 1000000.0));
+  m_formatter->dump_unsigned("_thread", (uint64_t)e.m_thread);
+  m_formatter->dump_int("_level", e.m_prio);
+  if (m_subs) {
+    m_formatter->dump_string("_subsys_name", m_subs->get_name(e.m_subsys));
+  }
+  m_formatter->dump_int("_subsys_id", e.m_subsys);
+  m_formatter->dump_string("_fsid", m_fsid);
+  m_formatter->dump_string("_logger", m_logger);
+  m_formatter->close_section();
+
+  // Empty the output buffer and rebuild the compressor -> buffer filter chain.
+  m_ostream_compressed.clear();
+  m_ostream_compressed.str("");
+  m_ostream.reset();
+  m_ostream.push(m_compressor);
+  m_ostream.push(m_ostream_compressed);
+
+  m_formatter->flush(m_ostream);
+  m_ostream << std::endl;
+  m_ostream.reset();  // chain is reset before the buffer contents are read below
+  try {
+    boost::asio::ip::udp::socket sock(m_io_service);
+    sock.open(m_endpoint.protocol());
+    sock.send_to(boost::asio::buffer(m_ostream_compressed.str()), m_endpoint);
+  } catch (const boost::system::system_error& err) {
+    cerr << "Error sending graylog message: " << err.what() << std::endl;
+  }
+}
+
+// Serialize one cluster LogEntry as a GELF JSON object, zlib-compress it and
+// send it in one UDP send_to() call; does nothing until a destination is set.
+void Graylog::log_log_entry(LogEntry const * const e)
+{
+  if (!m_log_dst_valid)
+    return;
+
+  m_formatter->open_object_section("");
+  m_formatter->dump_string("version", "1.1");
+  m_formatter->dump_string("host", m_hostname);
+  m_formatter->dump_string("short_message", e->msg);
+  m_formatter->dump_float("timestamp", e->stamp.sec() + (e->stamp.usec() / 1000000.0));
+  m_formatter->dump_string("_app", "ceph");
+  m_formatter->dump_string("name", e->name.to_str());
+
+  // "_who" is the text produced by dumping e->rank and e->addrs through the helper formatter.
+  Formatter *who = m_formatter_section.get();
+  who->open_object_section("rank");
+  e->rank.dump(who);
+  who->close_section();
+  who->open_object_section("addrs");
+  e->addrs.dump(who);
+  who->close_section();
+
+  m_ostream_section.clear();
+  m_ostream_section.str("");
+  who->flush(m_ostream_section);
+  m_formatter->dump_string("_who", m_ostream_section.str());
+
+  m_formatter->dump_int("_seq", e->seq);
+  m_formatter->dump_string("_prio", clog_type_to_string(e->prio));
+  m_formatter->dump_string("_channel", e->channel);
+  m_formatter->dump_string("_fsid", m_fsid);
+  m_formatter->dump_string("_logger", m_logger);
+  m_formatter->close_section();
+
+  m_ostream_compressed.clear();
+  m_ostream_compressed.str("");
+  m_ostream.reset();
+  m_ostream.push(m_compressor);
+  m_ostream.push(m_ostream_compressed);
+
+  m_formatter->flush(m_ostream);
+  m_ostream << std::endl;
+  m_ostream.reset();
+
+  try {
+    boost::asio::ip::udp::socket sock(m_io_service);
+    sock.open(m_endpoint.protocol());
+    sock.send_to(boost::asio::buffer(m_ostream_compressed.str()), m_endpoint);
+  } catch (const boost::system::system_error& err) {
+    cerr << "Error sending graylog message: " << err.what() << std::endl;
+  }
+}
+
+} // ceph::logging::
+} // ceph::
diff --git a/src/common/Graylog.h b/src/common/Graylog.h
new file mode 100644
index 00000000..a4214fa8
--- /dev/null
+++ b/src/common/Graylog.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_LOG_GRAYLOG_H
+#define __CEPH_LOG_GRAYLOG_H
+
+#include <boost/asio.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
+
+#include "include/ceph_assert.h"  // boost clobbers this
+
+struct uuid_d;
+class LogEntry;
+
+namespace ceph {
+
+class Formatter;
+
+namespace logging {
+
+class Entry;
+class SubsystemMap;
+
+// Graylog logging backend: converts log data structures (LogEntry, Entry) to
+// GELF (http://www.graylog2.org/resources/gelf/specification) and sends them
+// to a GELF UDP receiver.
+
+class Graylog
+{
+ public:
+
+  /**
+   * Create Graylog with SubsystemMap. log_entry will resolve the subsystem
+   * id to its name. Logging is disabled until set_destination succeeds.
+   * @param s SubsystemMap
+   * @param logger Value for key "_logger" in GELF
+   */
+  Graylog(const SubsystemMap * const s, const std::string &logger);
+
+  /**
+   * Create Graylog without SubsystemMap. Logging is disabled
+   * until set_destination succeeds.
+   * @param logger Value for key "_logger" in GELF
+   */
+  explicit Graylog(const std::string &logger);
+  virtual ~Graylog();
+
+  void set_hostname(const std::string& host);  // value for GELF key "host"
+  void set_fsid(const uuid_d& fsid);  // value for GELF key "_fsid" (printed form of the uuid)
+
+  void set_destination(const std::string& host, int port);  // resolve the UDP receiver; enables logging on success
+
+  void log_entry(const Entry& e);  // send one log Entry as a compressed GELF message
+  void log_log_entry(LogEntry const * const e);  // send one LogEntry as a compressed GELF message
+
+  typedef std::shared_ptr<Graylog> Ref;
+
+ private:
+  SubsystemMap const * const m_subs;  // may be null; used to look up "_subsys_name"
+
+  bool m_log_dst_valid;  // true once set_destination() resolved an endpoint
+
+  std::string m_hostname;  // GELF "host"
+  std::string m_fsid;  // GELF "_fsid"
+  std::string m_logger;  // GELF "_logger"
+
+  boost::asio::ip::udp::endpoint m_endpoint;  // resolved destination used by send_to()
+  boost::asio::io_service m_io_service;  // used for the resolver and the per-message sockets
+
+  std::unique_ptr<Formatter> m_formatter;  // JSON formatter for the GELF object
+  std::unique_ptr<Formatter> m_formatter_section;  // JSON formatter for the "_who" text
+  std::stringstream m_ostream_section;  // receives m_formatter_section output
+  std::stringstream m_ostream_compressed;  // receives the compressed payload that is sent
+  boost::iostreams::filtering_ostream m_ostream;  // chain: m_compressor, then m_ostream_compressed
+  boost::iostreams::zlib_compressor m_compressor;
+
+};
+
+}
+}
+
+#endif
diff --git a/src/common/HTMLFormatter.cc b/src/common/HTMLFormatter.cc
new file mode 100644
index 00000000..725bc39f
--- /dev/null
+++ b/src/common/HTMLFormatter.cc
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#define LARGE_SIZE 1024
+
+#include "HTMLFormatter.h"
+#include "Formatter.h"
+
+#include <sstream>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <string.h>     // for strdup
+
+#include "common/escape.h"
+
+// -----------------------
+namespace ceph {
+
+HTMLFormatter::HTMLFormatter(bool pretty)
+  : XMLFormatter(pretty),
+    m_status(0), m_status_name(nullptr)  // status 0, no status text yet
+{}
+
+// Release the status text copy, if one is stored.
+HTMLFormatter::~HTMLFormatter()
+{
+  // free(nullptr) is a no-op, so no null check is needed.
+  free(const_cast<char*>(m_status_name));
+  m_status_name = nullptr;
+}
+
+// Reset the base formatter, mark the header as not yet emitted, and return
+// the status to 0 with no status text.
+void HTMLFormatter::reset()
+{
+  XMLFormatter::reset();
+  m_header_done = false;
+  m_status = 0;
+  free(const_cast<char*>(m_status_name));  // free(nullptr) is harmless
+  m_status_name = nullptr;
+}
+
+// Store the status code and, if status_name is non-null, a private copy of it.
+void HTMLFormatter::set_status(int status, const char* status_name)
+{
+  m_status = status;
+  if (status_name) {
+    char *copy = strdup(status_name);  // copy first: status_name may alias the stored text
+    free(const_cast<char*>(m_status_name));  // free(nullptr) is a no-op
+    m_status_name = copy;
+  }
+}
+
+// Emit the page preamble once: open <html>, write <head><title>, open <body>,
+// write <h1>, open <ul>. Title and heading show "<status>[ <status name>]".
+void HTMLFormatter::output_header() {
+  if (m_header_done)
+    return;
+  m_header_done = true;  // set before any section is opened
+
+  std::string status_line = std::to_string(m_status);
+  if (m_status_name)
+    status_line.append(" ").append(m_status_name);
+
+  open_object_section("html");
+  print_spaces();
+  m_ss << "<head><title>" << status_line << "</title></head>";
+  if (m_pretty)
+    m_ss << "\n";
+  open_object_section("body");
+  print_spaces();
+  m_ss << "<h1>" << status_line << "</h1>";
+  if (m_pretty)
+    m_ss << "\n";
+  open_object_section("ul");
+}
+
+template <typename T>
+void HTMLFormatter::dump_template(const char *name, T arg)
+{ // writes one list item, "<li>name: value</li>", after print_spaces()
+  print_spaces();
+  m_ss << "<li>" << name << ": ";
+  m_ss << arg << "</li>";
+  if (m_pretty) m_ss << "\n";
+}
+
+void HTMLFormatter::dump_unsigned(const char *name, uint64_t u)
+{
+  this->dump_template<uint64_t>(name, u);  // emits "<li>name: u</li>"
+}
+
+void HTMLFormatter::dump_int(const char *name, int64_t u)
+{
+  this->dump_template<int64_t>(name, u);  // emits "<li>name: u</li>"
+}
+
+void HTMLFormatter::dump_float(const char *name, double d)
+{
+  this->dump_template<double>(name, d);  // emits "<li>name: d</li>"
+}
+
+void HTMLFormatter::dump_string(const char *name, std::string_view s)
+{
+  this->dump_template(name, xml_stream_escaper(s));  // value is streamed through xml_stream_escaper
+}
+
+// Writes "<li>name: value" + the get_attrs_str() text + "</li>"; value goes through xml_stream_escaper.
+void HTMLFormatter::dump_string_with_attrs(const char *name, std::string_view s, const FormatterAttrs& attrs)
+{
+  const std::string label(name);
+  std::string rendered_attrs;
+  get_attrs_str(&attrs, rendered_attrs);
+  print_spaces();
+  m_ss << "<li>" << label << ": " << xml_stream_escaper(s) << rendered_attrs << "</li>";
+  if (m_pretty) m_ss << "\n";
+}
+
+std::ostream& HTMLFormatter::dump_stream(const char *name)
+{ // writes the opening "<li>name: " and hands back m_pending_string for the value
+  print_spaces();
+  m_ss << "<li>" << name << ": ";
+  m_pending_string_name = "li";
+  return m_pending_string;
+}
+
+void HTMLFormatter::dump_format_va(const char* name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  // Emits "<li>name: text</li>" (plus xmlns when ns is set); text is fmt/ap formatted.
+  // vsnprintf returns the wanted length (<0 on error), possibly > sizeof(buf): clamp it.
+  char buf[LARGE_SIZE];
+  const int written = vsnprintf(buf, sizeof(buf), fmt, ap);
+  const size_t wanted = written < 0 ? 0 : static_cast<size_t>(written);
+  const std::string_view text(buf, wanted < sizeof(buf) ? wanted : sizeof(buf) - 1);
+
+  std::string e(name);
+  print_spaces();
+  m_ss << "<li";
+  if (ns)
+    m_ss << " xmlns=\"" << ns << "\"";
+  m_ss << ">" << e << ": " << xml_stream_escaper(text) << "</li>";
+  if (m_pretty)
+    m_ss << "\n";
+}
+
+} // namespace ceph
diff --git a/src/common/HTMLFormatter.h b/src/common/HTMLFormatter.h
new file mode 100644
index 00000000..ab725062
--- /dev/null
+++ b/src/common/HTMLFormatter.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_HTML_FORMATTER_H
+#define CEPH_HTML_FORMATTER_H
+
+#include "Formatter.h"
+
+namespace ceph {
+  class HTMLFormatter : public XMLFormatter {  // XMLFormatter variant that writes values as <li> items
+  public:
+    explicit HTMLFormatter(bool pretty = false);
+    ~HTMLFormatter() override;  // frees the stored status text
+    void reset() override;  // base reset, plus header flag and status cleared
+
+    void set_status(int status, const char* status_name) override;  // keeps a strdup() copy of status_name
+    void output_header() override;  // emits html/head/body/h1/ul preamble once
+
+    void dump_unsigned(const char *name, uint64_t u) override;
+    void dump_int(const char *name, int64_t u) override;
+    void dump_float(const char *name, double d) override;
+    void dump_string(const char *name, std::string_view s) override;
+    std::ostream& dump_stream(const char *name) override;
+    void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+
+    /* with attrs */
+    void dump_string_with_attrs(const char *name, std::string_view s, const FormatterAttrs& attrs) override;
+  private:
+    template <typename T> void dump_template(const char *name, T arg);  // shared "<li>name: value</li>" writer
+
+    int m_status;  // shown in the title and heading written by output_header()
+    const char* m_status_name;  // malloc-owned copy (strdup) or null
+  };
+
+}
+
+#endif
diff --git a/src/common/HeartbeatMap.cc b/src/common/HeartbeatMap.cc
new file mode 100644
index 00000000..a211eb14
--- /dev/null
+++ b/src/common/HeartbeatMap.cc
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <signal.h>
+
+#include "HeartbeatMap.h"
+#include "ceph_context.h"
+#include "common/errno.h"
+#include "debug.h"
+
+#define dout_subsys ceph_subsys_heartbeatmap
+#undef dout_prefix
+#define dout_prefix *_dout << "heartbeat_map "
+
+namespace ceph {
+
+// Start with both published worker counters at zero.
+HeartbeatMap::HeartbeatMap(CephContext *cct)
+  : m_cct(cct),
+    m_rwlock("HeartbeatMap::m_rwlock"),
+    m_unhealthy_workers(0), m_total_workers(0)
+{
+}
+
+HeartbeatMap::~HeartbeatMap()
+{
+  ceph_assert(m_workers.empty());  // every worker must have been removed before destruction
+}
+
+// Register a worker under the write lock: allocate its handle, link it at the
+// front of m_workers and record its list position and thread id.
+heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id)
+{
+  m_rwlock.get_write();
+  ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl;
+  auto *handle = new heartbeat_handle_d(name);
+  ANNOTATE_BENIGN_RACE_SIZED(&handle->timeout, sizeof(handle->timeout), "heartbeat_handle_d timeout");
+  ANNOTATE_BENIGN_RACE_SIZED(&handle->suicide_timeout, sizeof(handle->suicide_timeout), "heartbeat_handle_d suicide_timeout");
+  handle->thread_id = thread_id;
+  m_workers.push_front(handle);
+  handle->list_item = m_workers.begin();
+  m_rwlock.put_write();
+  return handle;
+}
+
+void HeartbeatMap::remove_worker(const heartbeat_handle_d *h)
+{
+  m_rwlock.get_write();
+  ldout(m_cct, 10) << "remove_worker '" << h->name << "'" << dendl;
+  m_workers.erase(h->list_item);  // unlink via the iterator saved by add_worker()
+  m_rwlock.put_write();
+  delete h;  // the handle is destroyed after the lock is released
+}
+
+// Compare one handle's deadlines against `now`. Returns false when the
+// timeout has passed; when the suicide timeout has passed, sends SIGABRT to
+// the worker thread, sleeps one second and aborts. A deadline of 0 is ignored.
+bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who,
+			  ceph::coarse_mono_clock::rep now)
+{
+  const auto deadline = h->timeout.load();
+  const bool healthy = !(deadline && deadline < now);
+  if (!healthy) {
+    ldout(m_cct, 1) << who << " '" << h->name << "'" << " had timed out after " << h->grace << dendl;
+  }
+  const auto suicide_deadline = h->suicide_timeout.load();
+  if (suicide_deadline && suicide_deadline < now) {
+    ldout(m_cct, 1) << who << " '" << h->name << "'" << " had suicide timed out after " << h->suicide_grace << dendl;
+    pthread_kill(h->thread_id, SIGABRT);
+    sleep(1);
+    ceph_abort_msg("hit suicide timeout");
+  }
+  return healthy;
+}
+
+// Check the handle against its current deadlines, then re-arm it: the timeout
+// becomes now + grace; the suicide timeout becomes now + suicide_grace, or 0
+// (which _check() ignores) when suicide_grace is 0.
+void HeartbeatMap::reset_timeout(heartbeat_handle_d *h,
+				 ceph::coarse_mono_clock::rep grace,
+				 ceph::coarse_mono_clock::rep suicide_grace)
+{
+  ldout(m_cct, 20) << "reset_timeout '" << h->name << "' grace " << grace
+                   << " suicide " << suicide_grace << dendl;
+  const auto since_epoch = ceph::coarse_mono_clock::now().time_since_epoch();
+  const auto now = chrono::duration_cast<chrono::seconds>(since_epoch).count();
+  _check(h, "reset_timeout", now);
+
+  h->timeout = now + grace;
+  h->grace = grace;
+
+  h->suicide_timeout = suicide_grace ? now + suicide_grace : 0;
+  h->suicide_grace = suicide_grace;
+}
+
+// Check the handle once more, then zero both deadlines so _check() ignores it.
+void HeartbeatMap::clear_timeout(heartbeat_handle_d *h)
+{
+  ldout(m_cct, 20) << "clear_timeout '" << h->name << "'" << dendl;
+  const auto since_epoch = ceph::coarse_mono_clock::now().time_since_epoch();
+  _check(h, "clear_timeout", chrono::duration_cast<std::chrono::seconds>(since_epoch).count());
+  h->timeout = 0;
+  h->suicide_timeout = 0;
+}
+
+// Check every registered worker under the read lock and publish the counts.
+// Returns false if any worker missed its timeout, or while a failure injected
+// through the heartbeat_inject_failure option is still in effect.
+bool HeartbeatMap::is_healthy()
+{
+  int unhealthy = 0;
+  int total = 0;
+  m_rwlock.get_read();
+  const auto now = ceph::coarse_mono_clock::now();
+  if (m_cct->_conf->heartbeat_inject_failure) {
+    ldout(m_cct, 0) << "is_healthy injecting failure for next " << m_cct->_conf->heartbeat_inject_failure << " seconds" << dendl;
+    m_inject_unhealthy_until = now + std::chrono::seconds(m_cct->_conf->heartbeat_inject_failure);
+    m_cct->_conf.set_val("heartbeat_inject_failure", "0");
+  }
+  const bool injected = now < m_inject_unhealthy_until;
+  if (injected) {
+    auto sec = std::chrono::duration_cast<std::chrono::seconds>(m_inject_unhealthy_until - now).count();
+    ldout(m_cct, 0) << "is_healthy = false, injected failure for next "
+                    << sec << " seconds" << dendl;
+  }
+  bool healthy = !injected;
+
+  // `now` was sampled once above, so every worker is compared against the same second.
+  const auto epoch = chrono::duration_cast<chrono::seconds>(now.time_since_epoch()).count();
+  for (heartbeat_handle_d *h : m_workers) {
+    ++total;
+    if (!_check(h, "is_healthy", epoch)) {
+      ++unhealthy;
+      healthy = false;
+    }
+  }
+  m_rwlock.put_read();
+
+  m_unhealthy_workers = unhealthy;
+  m_total_workers = total;
+
+  ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY")
+    << ", total workers: " << total << ", number of unhealthy: " << unhealthy << dendl;
+  return healthy;
+}
+
+int HeartbeatMap::get_unhealthy_workers() const
+{
+  return static_cast<int>(m_unhealthy_workers.load());  // value stored by the last is_healthy()
+}
+
+int HeartbeatMap::get_total_workers() const
+{
+  return static_cast<int>(m_total_workers.load());  // value stored by the last is_healthy()
+}
+
+// If heartbeat_file is configured and is_healthy(), create/open it and update its timestamps.
+void HeartbeatMap::check_touch_file()
+{
+  const string path = m_cct->_conf->heartbeat_file;
+  if (path.empty() || !is_healthy())
+    return;
+  const int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_CLOEXEC, 0644);
+  if (fd < 0) {
+    ldout(m_cct, 0) << "unable to touch " << path << ": " << cpp_strerror(errno) << dendl;
+    return;
+  }
+  ::utimes(path.c_str(), NULL);  // NULL times: access/modification time become "now"
+  ::close(fd);
+}
+
+}
diff --git a/src/common/HeartbeatMap.h b/src/common/HeartbeatMap.h
new file mode 100644
index 00000000..f7ffd9eb
--- /dev/null
+++ b/src/common/HeartbeatMap.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_HEARTBEATMAP_H
+#define CEPH_HEARTBEATMAP_H
+
+#include <list>
+#include <atomic>
+#include <string>
+#include <pthread.h>
+
+#include "common/ceph_time.h"
+#include "RWLock.h"
+
+class CephContext;
+
+namespace ceph {
+
+/*
+ * HeartbeatMap -
+ *
+ * Maintain a set of handles for internal subsystems to periodically
+ * check in with a health check and timeout.  Each user can register
+ * and get a handle they can use to set or reset a timeout.  
+ *
+ * A simple is_healthy() method checks for any users who are not within
+ * their grace period for a heartbeat.
+ */
+
+struct heartbeat_handle_d {  // per-worker record handed out by HeartbeatMap::add_worker()
+  const std::string name;  // worker name used in log messages
+  pthread_t thread_id;  // thread that receives SIGABRT on suicide timeout
+  // TODO: use atomic<time_point>, once we can ditch GCC 4.8
+  std::atomic<unsigned> timeout = { 0 }, suicide_timeout = { 0 };  // deadlines in seconds; 0 means not armed
+  time_t grace, suicide_grace;  // grace values last passed to reset_timeout(), used in log messages
+  std::list<heartbeat_handle_d*>::iterator list_item;  // this handle's position in HeartbeatMap::m_workers
+
+  explicit heartbeat_handle_d(const std::string& n)
+    : name(n), thread_id(0), grace(0), suicide_grace(0)
+  { }
+};
+
+class HeartbeatMap {
+ public:
+  // register/unregister a worker; remove_worker() deletes the handle add_worker() returned
+  heartbeat_handle_d *add_worker(const std::string& name, pthread_t thread_id);
+  void remove_worker(const heartbeat_handle_d *h);
+
+  // re-arm the timeout so that another touch is expected within `grace` (suicide_grace == 0 disarms the suicide timeout)
+  void reset_timeout(heartbeat_handle_d *h,
+		     ceph::coarse_mono_clock::rep grace,
+		     ceph::coarse_mono_clock::rep suicide_grace);
+  // clear both timeouts so that the handle is no longer checked
+  void clear_timeout(heartbeat_handle_d *h);
+
+  // return false if any timeout is currently expired or a failure is being injected
+  bool is_healthy();
+
+  // touch cct->_conf->heartbeat_file if it is set and is_healthy()
+  void check_touch_file();
+
+  // get the number of unhealthy workers seen by the last is_healthy() call
+  int get_unhealthy_workers() const;
+
+  // get the total number of workers seen by the last is_healthy() call
+  int get_total_workers() const;
+
+  explicit HeartbeatMap(CephContext *cct);
+  ~HeartbeatMap();  // asserts that no workers remain registered
+
+ private:
+  CephContext *m_cct;  // used for configuration and logging
+  RWLock m_rwlock;  // write-held while m_workers is modified, read-held while it is walked
+  ceph::coarse_mono_clock::time_point m_inject_unhealthy_until;  // is_healthy() reports false before this time
+  std::list<heartbeat_handle_d*> m_workers;  // all currently registered handles
+  std::atomic<unsigned> m_unhealthy_workers = { 0 };  // updated by is_healthy()
+  std::atomic<unsigned> m_total_workers = { 0 };  // updated by is_healthy()
+
+  bool _check(const heartbeat_handle_d *h, const char *who,
+	      ceph::coarse_mono_clock::rep now);  // false if h's timeout expired; aborts on suicide timeout
+};
+
+}
+#endif
diff --git a/src/common/Initialize.h b/src/common/Initialize.h
new file mode 100644
index 00000000..78ad5ec6
--- /dev/null
+++ b/src/common/Initialize.h
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+/* Copyright (c) 2011 Stanford University
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef CEPH_INITIALIZE_H
+#define CEPH_INITIALIZE_H
+
+/**
+ * This class is used to manage once-only initialization that should occur
+ * before main() is invoked, such as the creation of static variables.  It
+ * also provides a mechanism for handling dependencies (where one class
+ * needs to perform its once-only initialization before another).
+ * 
+ * The simplest way to use an Initialize object is to define a static
+ * initialization method for a class, say Foo::init().  Then, declare
+ * a static Initialize object in the class:
+ * "static Initialize initializer(Foo::init);".
+ * The result is that Foo::init will be invoked when the object is
+ * constructed (before main() is invoked).  Foo::init can create static
+ * objects and perform any other once-only initialization needed by the
+ * class.  Furthermore, if some other class needs to ensure that Foo has
+ * been initialized (e.g. as part of its own initialization) it can invoke
+ * Foo::init directly (Foo::init should contain an internal guard so that
+ * it only performs its functions once, even if invoked several times).
+ *
+ * There is also a second form of constructor for Initialize that causes a
+ * new object to be dynamically allocated and assigned to a pointer, instead
+ * of invoking a function. This form allows for the creation of static objects
+ * that are never destructed (thereby avoiding issues with the order of
+ * destruction).
+ */
+class Initialize {
+ public:
+  /**
+   * Run an initialization function as a side effect of construction.
+   *
+   * Declared as a static object, this makes \p func execute during static
+   * initialization, i.e. before main() starts, which is the intended place
+   * for once-only setup work.
+   *
+   * \param func
+   *      Called exactly once, with no arguments, from this constructor.
+   *      It typically creates static objects and/or calls other init
+   *      functions, and should guard itself so that repeated invocations
+   *      (e.g. from dependent initializers) do the work only once.
+   */
+  explicit Initialize(void (*func)()) {
+    func();
+  }
+
+  /**
+   * Make sure \p p points to an object, allocating one if necessary.
+   *
+   * Declared as a static object, this creates the pointee before main()
+   * runs. The object is allocated with a no-argument `new T` and is never
+   * deleted by this class.
+   *
+   * \param p
+   *      Pointer to an object of any type. A null pointer is replaced with
+   *      a pointer to a newly allocated T; a non-null pointer is left
+   *      untouched.
+   */
+  template<typename T>
+  explicit Initialize(T*& p) {
+    if (p == nullptr) {
+      p = new T;
+    }
+  }
+};
+
+#endif  // CEPH_INITIALIZE_H
diff --git a/src/common/LogClient.cc b/src/common/LogClient.cc
new file mode 100644
index 00000000..982bbe35
--- /dev/null
+++ b/src/common/LogClient.cc
@@ -0,0 +1,383 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "common/LogClient.h"
+#include "include/str_map.h"
+#include "messages/MLog.h"
+#include "messages/MLogAck.h"
+#include "msg/Messenger.h"
+#include "mon/MonMap.h"
+#include "common/Graylog.h"
+
+#define dout_subsys ceph_subsys_monc
+
+// Parse the clog_* config options into key/value maps and read fsid and
+// host.  Returns 0 on success; on the first helper call that returns a
+// negative value, logs an error and returns that value.
+int parse_log_client_options(CephContext *cct,
+    map<string,string> &log_to_monitors, map<string,string> &log_to_syslog,
+    map<string,string> &log_channels, map<string,string> &log_prios,
+    map<string,string> &log_to_graylog,
+    map<string,string> &log_to_graylog_host,
+    map<string,string> &log_to_graylog_port,
+    uuid_d &fsid, string &host)
+{
+  ostringstream oss;
+
+  if (int r = get_conf_str_map_helper(
+        cct->_conf.get_val<string>("clog_to_monitors"), oss,
+        &log_to_monitors, CLOG_CONFIG_DEFAULT_KEY);
+      r < 0) {
+    lderr(cct) << __func__ << " error parsing 'clog_to_monitors'" << dendl;
+    return r;
+  }
+
+  if (int r = get_conf_str_map_helper(
+        cct->_conf.get_val<string>("clog_to_syslog"), oss,
+        &log_to_syslog, CLOG_CONFIG_DEFAULT_KEY);
+      r < 0) {
+    lderr(cct) << __func__ << " error parsing 'clog_to_syslog'" << dendl;
+    return r;
+  }
+
+  if (int r = get_conf_str_map_helper(
+        cct->_conf.get_val<string>("clog_to_syslog_facility"), oss,
+        &log_channels, CLOG_CONFIG_DEFAULT_KEY);
+      r < 0) {
+    lderr(cct) << __func__ << " error parsing 'clog_to_syslog_facility'" << dendl;
+    return r;
+  }
+
+  if (int r = get_conf_str_map_helper(
+        cct->_conf.get_val<string>("clog_to_syslog_level"), oss,
+        &log_prios, CLOG_CONFIG_DEFAULT_KEY);
+      r < 0) {
+    lderr(cct) << __func__ << " error parsing 'clog_to_syslog_level'" << dendl;
+    return r;
+  }
+
+  if (int r = get_conf_str_map_helper(
+        cct->_conf.get_val<string>("clog_to_graylog"), oss,
+        &log_to_graylog, CLOG_CONFIG_DEFAULT_KEY);
+      r < 0) {
+    lderr(cct) << __func__ << " error parsing 'clog_to_graylog'" << dendl;
+    return r;
+  }
+
+  if (int r = get_conf_str_map_helper(
+        cct->_conf.get_val<string>("clog_to_graylog_host"), oss,
+        &log_to_graylog_host, CLOG_CONFIG_DEFAULT_KEY);
+      r < 0) {
+    lderr(cct) << __func__ << " error parsing 'clog_to_graylog_host'" << dendl;
+    return r;
+  }
+
+  if (int r = get_conf_str_map_helper(
+        cct->_conf.get_val<string>("clog_to_graylog_port"), oss,
+        &log_to_graylog_port, CLOG_CONFIG_DEFAULT_KEY);
+      r < 0) {
+    lderr(cct) << __func__ << " error parsing 'clog_to_graylog_port'" << dendl;
+    return r;
+  }
+
+  fsid = cct->_conf.get_val<uuid_d>("fsid");
+  host = cct->_conf->host;
+  return 0;
+}
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, LogClient *) {
+  return (*_dout) << "log_client ";  // constant tag; the pointer only selects this overload
+}
+
+static ostream& _prefix(std::ostream *_dout, LogChannel *lc) {
+  return (*_dout) << "log_channel(" << lc->get_log_channel() << ") ";  // tag lines with the channel name
+}
+
+LogChannel::LogChannel(CephContext *cct, LogClient *lc, const string &channel)
+  : cct(cct),
+    parent(lc),
+    log_channel(channel), log_to_syslog(false), log_to_monitors(false)
+{}  // log_prio and syslog_facility start out empty
+
+// Channel with a preset syslog facility and priority; both sinks start off.
+LogChannel::LogChannel(CephContext *cct, LogClient *lc, const string &channel,
+                       const string &facility, const string &prio)
+  : cct(cct), parent(lc),
+    log_channel(channel),
+    log_prio(prio), syslog_facility(facility),
+    log_to_syslog(false), log_to_monitors(false)
+{}
+
+LogClient::LogClient(CephContext *cct, Messenger *m, MonMap *mm,
+                     enum logclient_flag_t flags)
+  : cct(cct), messenger(m), monmap(mm),
+    is_mon((flags & FLAG_MON) != 0),  // the FLAG_MON bit becomes is_mon
+    last_log_sent(0), last_log(0)
+{}
+
+// Start an empty message of severity type_ bound to channel parent_.
+LogClientTemp::LogClientTemp(clog_type type_, LogChannel &parent_)
+  : type(type_), parent(parent_)
+{}
+
+LogClientTemp::LogClientTemp(const LogClientTemp &rhs)
+  : type(rhs.type),
+    parent(rhs.parent)
+{
+}  // ss is deliberately left empty: we don't want to -- nor can we -- copy the stream
+
+LogClientTemp::~LogClientTemp()
+{
+  const bool has_text = (ss.peek() != EOF);  // anything streamed into us?
+  if (has_text) parent.do_log(type, ss);     // then log it on destruction
+}
+
+// Re-derive this channel's settings from the parsed option maps.  Each
+// value is looked up under our channel name (CLOG_CONFIG_DEFAULT_KEY is
+// passed as the fallback key) and then applied to the syslog, monitor
+// and graylog sinks.
+void LogChannel::update_config(map<string,string> &log_to_monitors,
+    map<string,string> &log_to_syslog, map<string,string> &log_channels,
+    map<string,string> &log_prios, map<string,string> &log_to_graylog,
+    map<string,string> &log_to_graylog_host,
+    map<string,string> &log_to_graylog_port,
+    uuid_d &fsid, string &host)
+{
+  ldout(cct, 20) << __func__ << " log_to_monitors " << log_to_monitors
+                 << " log_to_syslog " << log_to_syslog
+                 << " log_channels " << log_channels
+                 << " log_prios " << log_prios
+                 << dendl;
+
+  // value configured for this channel in one option map
+  auto pick = [this](const map<string,string> &opts) {
+    return get_str_map_key(opts, log_channel, &CLOG_CONFIG_DEFAULT_KEY);
+  };
+  const bool to_monitors = pick(log_to_monitors) == "true";
+  const bool to_syslog = pick(log_to_syslog) == "true";
+  const string facility = pick(log_channels);
+  const string prio = pick(log_prios);
+  const bool to_graylog = pick(log_to_graylog) == "true";
+  const string graylog_host = pick(log_to_graylog_host);
+  const int graylog_port = atoi(pick(log_to_graylog_port).c_str());
+
+  set_log_to_monitors(to_monitors);
+  set_log_to_syslog(to_syslog);
+  set_syslog_facility(facility);
+  set_log_prio(prio);
+
+  // create or drop the Graylog object so that it matches to_graylog
+  if (!to_graylog) {
+    graylog.reset();
+  } else if (!graylog) {
+    graylog = std::make_shared<ceph::logging::Graylog>("clog");
+  }
+
+  if (to_graylog && graylog) {
+    graylog->set_fsid(fsid);
+    graylog->set_hostname(host);
+  }
+
+  // a destination is only (re)set with a non-empty host and non-zero port
+  if (graylog && !graylog_host.empty() && graylog_port != 0) {
+    graylog->set_destination(graylog_host, graylog_port);
+  }
+
+  ldout(cct, 10) << __func__
+                 << " to_monitors: " << (to_monitors ? "true" : "false")
+                 << " to_syslog: " << (to_syslog ? "true" : "false")
+                 << " syslog_facility: " << facility
+                 << " prio: " << prio
+                 << " to_graylog: " << (to_graylog ? "true" : "false")
+                 << " graylog_host: " << graylog_host
+                 << " graylog_port: " << graylog_port
+                 << ")" << dendl;
+}
+
+// Log the contents of ss one line at a time, skipping empty lines.
+void LogChannel::do_log(clog_type prio, std::stringstream& ss)
+{
+  string s;
+  // Loop on getline()'s own result rather than on eof(): a stream that has
+  // failed without reaching EOF would otherwise never leave the loop.
+  while (getline(ss, s))
+    if (!s.empty()) do_log(prio, s);
+}
+
+// Emit one message on this channel: write it to the local debug log,
+// build a LogEntry for it, and hand that to each enabled sink.
+void LogChannel::do_log(clog_type prio, const std::string& s)
+{
+  std::lock_guard l(channel_lock);
+  // errors go to the local log at level -1, everything else at level 0
+  if (prio != CLOG_ERROR) {
+    ldout(cct,0) << "log " << prio << " : " << s << dendl;
+  } else {
+    ldout(cct,-1) << "log " << prio << " : " << s << dendl;
+  }
+
+  LogEntry e;
+  e.stamp = ceph_clock_now();
+  e.addrs = parent->get_myaddrs();
+  e.name = parent->get_myname();
+  e.rank = parent->get_myrank();
+  e.prio = prio;
+  e.msg = s;
+  e.channel = get_log_channel();
+
+  // Every entry gets a sequence number: queue() assigns one while
+  // queueing the entry for the monitors, get_next_seq() just takes one.
+  e.seq = log_to_monitors ? parent->queue(e) : parent->get_next_seq();
+
+  // log to syslog?
+  if (do_log_to_syslog()) {
+    ldout(cct,0) << __func__ << " log to syslog"  << dendl;
+    e.log_to_syslog(get_log_prio(), get_syslog_facility());
+  }
+
+  // log to graylog?
+  if (do_log_to_graylog()) {
+    ldout(cct,0) << __func__ << " log to graylog"  << dendl;
+    graylog->log_log_entry(&e);
+  }
+}
+
+// Next MLog for the monitor, or null when there is nothing to send.  With
+// flush set, last_log_sent is first reset to the oldest queued entry's seq.
+Message *LogClient::get_mon_log_message(bool flush)
+{
+  std::lock_guard l(log_lock);
+  if (flush && log_queue.empty())
+    return nullptr;
+  if (flush)
+    last_log_sent = log_queue.front().seq;  // reset session
+  return _get_mon_log_message();
+}
+
+bool LogClient::are_pending()
+{
+  std::lock_guard l(log_lock);
+  return last_log_sent < last_log;  // sequence numbers handed out but not yet sent
+}
+
+// Assemble an MLog with the next batch of not-yet-sent entries, or return
+// NULL if there are none.  Caller must hold log_lock.
+Message *LogClient::_get_mon_log_message()
+{
+  ceph_assert(ceph_mutex_is_locked(log_lock));
+
+  // Only entries that have not been sent during this mon session are
+  // eligible; monclient needs to call reset_session() on mon session
+  // reset for this to work right.
+  if (log_queue.empty() || last_log_sent == last_log)
+    return NULL;
+
+  // limit entries per message (a non-positive limit means "no limit")
+  const unsigned num_unsent = last_log - last_log_sent;
+  const auto max_per_msg = cct->_conf->mon_client_max_log_entries_per_message;
+  unsigned num_send = num_unsent;
+  if (max_per_msg > 0)
+    num_send = std::min(num_unsent, (unsigned)max_per_msg);
+
+  ldout(cct,10) << " log_queue is " << log_queue.size() << " last_log " << last_log << " sent " << last_log_sent
+                << " num " << log_queue.size()
+                << " unsent " << num_unsent
+                << " sending " << num_send << dendl;
+  ceph_assert(num_unsent <= log_queue.size());
+
+  // skip over what was already sent ...
+  auto p = log_queue.begin();
+  while (p->seq <= last_log_sent) {
+    ++p;
+    ceph_assert(p != log_queue.end());
+  }
+  // ... then copy out the next num_send entries, advancing last_log_sent
+  std::deque<LogEntry> o;
+  for (; num_send > 0; --num_send, ++p) {
+    ceph_assert(p != log_queue.end());
+    o.push_back(*p);
+    last_log_sent = p->seq;
+    ldout(cct,10) << " will send " << *p << dendl;
+  }
+
+  MLog *log = new MLog(monmap->get_fsid());
+  log->entries.swap(o);
+
+  return log;
+}
+
+void LogClient::_send_to_mon()
+{
+  ceph_assert(ceph_mutex_is_locked(log_lock));  // caller holds log_lock
+  ceph_assert(is_mon);
+  ceph_assert(messenger->get_myname().is_mon());
+  ldout(cct,10) << __func__ << " log to self" << dendl;
+  auto *pending = _get_mon_log_message();
+  messenger->get_loopback_connection()->send_message(pending);  // deliver to ourselves
+}
+
+// Stamp entry with the next sequence number and append a copy of it to
+// log_queue.  On a monitor the queue is sent to ourselves right away.
+// Returns the sequence number that was assigned.
+version_t LogClient::queue(LogEntry &entry)
+{
+  std::lock_guard l(log_lock);
+  const version_t assigned = ++last_log;
+  entry.seq = assigned;
+  log_queue.push_back(entry);
+  if (is_mon) _send_to_mon();
+  return assigned;
+}
+
+uint64_t LogClient::get_next_seq() {
+  std::lock_guard l(log_lock);
+  last_log += 1;  // consume a sequence number without queueing an entry
+  return last_log;
+}
+
+// Addresses reported by the messenger this client was constructed with.
+entity_addrvec_t LogClient::get_myaddrs() {
+  return messenger->get_myaddrs();
+}
+
+// The "rank" is the messenger's own entity_name_t.
+entity_name_t LogClient::get_myrank() {
+  return messenger->get_myname();
+}
+
+// The "name" is the EntityName held by the configuration (conf->name).
+const EntityName& LogClient::get_myname() {
+  return cct->_conf->name;
+}
+
+// Drop every queued entry the monitor has acknowledged, i.e. all entries
+// at the head of log_queue whose seq is <= m->last.  Always returns true.
+bool LogClient::handle_log_ack(MLogAck *m)
+{
+  std::lock_guard l(log_lock);
+  ldout(cct,10) << "handle_log_ack " << *m << dendl;
+
+  const version_t acked_through = m->last;
+  while (!log_queue.empty()) {
+    const LogEntry &oldest = log_queue.front();
+    if (oldest.seq > acked_through)
+      break;
+    ldout(cct,10) << " logged " << oldest << dendl;
+    log_queue.pop_front();
+  }
+  return true;
+}
+
diff --git a/src/common/LogClient.h b/src/common/LogClient.h
new file mode 100644
index 00000000..e138beac
--- /dev/null
+++ b/src/common/LogClient.h
@@ -0,0 +1,273 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LOGCLIENT_H
+#define CEPH_LOGCLIENT_H
+
+#include <atomic>
+#include "common/LogEntry.h"
+#include "common/ceph_mutex.h"
+#include "include/health.h"
+
+class LogClient;
+class MLog;
+class MLogAck;
+class Messenger;
+class MonMap;
+class Message;
+struct uuid_d;
+struct Connection;
+
+class LogChannel;
+
+namespace ceph {
+namespace logging {
+  class Graylog;
+}
+}
+
+int parse_log_client_options(CephContext *cct,
+			     map<string,string> &log_to_monitors,
+			     map<string,string> &log_to_syslog,
+			     map<string,string> &log_channels,
+			     map<string,string> &log_prios,
+			     map<string,string> &log_to_graylog,
+			     map<string,string> &log_to_graylog_host,
+			     map<string,string> &log_to_graylog_port,
+			     uuid_d &fsid,
+			     string &host);
+
+// Accumulates one cluster-log message via operator<< and passes it to
+// the owning LogChannel from the destructor (see LogClient.cc).
+class LogClientTemp
+{
+public:
+  LogClientTemp(clog_type type_, LogChannel &parent_);
+  LogClientTemp(const LogClientTemp &rhs);  // copies type and parent, not the text
+  ~LogClientTemp();
+
+  // append rhs to the pending message
+  template<typename T>
+  std::ostream& operator<<(const T& rhs) { return ss << rhs; }
+
+private:
+  clog_type type;      // severity the message will be logged with
+  LogChannel &parent;  // channel that receives the message
+  stringstream ss;     // message text collected so far
+};
+
+/** Manage where we output to and at which priority
+ *
+ * Not to be confused with the LogClient, which is the almighty coordinator
+ * of channels.  We just deal with the boring part of the logging: send to
+ * syslog, send to file, generate LogEntry and queue it for the LogClient.
+ *
+ * Past queueing the LogEntry, the LogChannel is done with the whole thing.
+ * LogClient will deal with sending and handling of LogEntries.
+ */
+class LogChannel
+{
+public:
+
+  // New channel named `channel` that reports to `lc`; syslog and monitor
+  // output both start disabled.  The second form also presets the
+  // syslog facility and priority strings.
+  LogChannel(CephContext *cct, LogClient *lc, const std::string &channel);
+  LogChannel(CephContext *cct, LogClient *lc,
+             const std::string &channel,
+             const std::string &facility,
+             const std::string &prio);
+
+  // One pair of entry points per severity: the no-argument form returns a
+  // LogClientTemp to stream the message into; the stringstream form logs
+  // the stream's contents immediately via do_log().
+  LogClientTemp debug() { return LogClientTemp(CLOG_DEBUG, *this); }
+  void debug(std::stringstream &s) { do_log(CLOG_DEBUG, s); }
+
+  LogClientTemp info() { return LogClientTemp(CLOG_INFO, *this); }
+  void info(std::stringstream &s) { do_log(CLOG_INFO, s); }
+
+  LogClientTemp warn() { return LogClientTemp(CLOG_WARN, *this); }
+  void warn(std::stringstream &s) { do_log(CLOG_WARN, s); }
+
+  LogClientTemp error() { return LogClientTemp(CLOG_ERROR, *this); }
+  void error(std::stringstream &s) { do_log(CLOG_ERROR, s); }
+
+  LogClientTemp sec() { return LogClientTemp(CLOG_SEC, *this); }
+  void sec(std::stringstream &s) { do_log(CLOG_SEC, s); }
+
+  /**
+   * Convenience function mapping health status to
+   * the appropriate cluster log severity:
+   * HEALTH_OK -> info, HEALTH_WARN -> warn, HEALTH_ERR -> error.
+   * Any other value aborts.
+   */
+  LogClientTemp health(health_status_t health) {
+    switch(health) {
+      case HEALTH_OK:
+        return info();
+      case HEALTH_WARN:
+        return warn();
+      case HEALTH_ERR:
+        return error();
+      default:
+        // Invalid health_status_t value
+        ceph_abort();
+    }
+  }
+
+  // --- configuration (update_config() goes through these setters) ---
+  void set_log_to_monitors(bool v) { log_to_monitors = v; }
+  void set_log_to_syslog(bool v) { log_to_syslog = v; }
+  void set_log_channel(const std::string& v) { log_channel = v; }
+  void set_log_prio(const std::string& v) { log_prio = v; }
+  void set_syslog_facility(const std::string& v) { syslog_facility = v; }
+
+  // the getters return copies of the stored strings
+  std::string get_log_prio() { return log_prio; }
+  std::string get_log_channel() { return log_channel; }
+  std::string get_syslog_facility() { return syslog_facility; }
+
+  bool must_log_to_syslog() { return log_to_syslog; }
+  bool must_log_to_monitors() { return log_to_monitors; }
+
+  /**
+   * Do we want to log to syslog?
+   *
+   * @return true if log_to_syslog is true and both channel and prio
+   *         are not empty; false otherwise.
+   */
+  bool do_log_to_syslog() {
+    if (!must_log_to_syslog())
+      return false;
+    return !log_prio.empty() && !log_channel.empty();
+  }
+
+  /**
+   * Do we want to log to graylog?
+   *
+   * @return true while the graylog pointer is non-null, i.e. after
+   *         update_config() has created the Graylog object.
+   */
+  bool do_log_to_graylog() {
+    return (graylog != nullptr);
+  }
+
+  // shared-ownership handle to a channel
+  typedef shared_ptr<LogChannel> Ref;
+
+  /**
+   * update config values from parsed k/v map for each config option
+   *
+   * Pick out the relevant value based on our channel.
+   */
+  void update_config(map<string,string> &log_to_monitors,
+                     map<string,string> &log_to_syslog,
+                     map<string,string> &log_channels,
+                     map<string,string> &log_prios,
+                     map<string,string> &log_to_graylog,
+                     map<string,string> &log_to_graylog_host,
+                     map<string,string> &log_to_graylog_port,
+                     uuid_d &fsid,
+                     string &host);
+
+  // Log a message at severity prio.  The stringstream form logs each
+  // non-empty line of ss as a separate message.
+  void do_log(clog_type prio, std::stringstream& ss);
+  void do_log(clog_type prio, const std::string& s);
+
+private:
+  CephContext *cct;
+  LogClient *parent;            // client that sequences and queues our entries
+  // held for the whole of do_log(clog_type, const std::string&)
+  ceph::mutex channel_lock = ceph::make_mutex("LogChannel::channel_lock");
+  std::string log_channel;      // channel name
+  std::string log_prio;         // level string handed to LogEntry::log_to_syslog()
+  std::string syslog_facility;  // facility string handed to LogEntry::log_to_syslog()
+  // sink switches
+  bool log_to_syslog;
+  bool log_to_monitors;
+  shared_ptr<ceph::logging::Graylog> graylog;  // null while graylog output is off
+
+
+  friend class LogClientTemp;
+};
+
+typedef LogChannel::Ref LogChannelRef;
+
+class LogClient
+{
+public:
+  enum logclient_flag_t {
+    NO_FLAGS = 0,
+    FLAG_MON = 0x1,  // sets is_mon: queued entries are also sent to ourselves
+  };
+
+  LogClient(CephContext *cct, Messenger *m, MonMap *mm,
+            enum logclient_flag_t flags);
+  virtual ~LogClient() {
+    channels.clear();
+  }
+
+  bool handle_log_ack(MLogAck *m);
+  Message *get_mon_log_message(bool flush);
+  bool are_pending();
+
+  // Channel with the default name (CLOG_CHANNEL_DEFAULT).
+  LogChannelRef create_channel() {
+    return create_channel(CLOG_CHANNEL_DEFAULT);
+  }
+
+  // Look up the channel called name, creating and registering it if absent.
+  LogChannelRef create_channel(const std::string& name) {
+    auto found = channels.find(name);
+    if (found != channels.end())
+      return found->second;
+    LogChannelRef c = std::make_shared<LogChannel>(cct, this, name);
+    channels[name] = c;
+    return c;
+  }
+
+  // Forget the channel registered under name, if any.
+  void destroy_channel(const std::string& name) {
+    channels.erase(name);
+  }
+
+  void shutdown() {
+    channels.clear();
+  }
+
+  uint64_t get_next_seq();
+  entity_addrvec_t get_myaddrs();
+  const EntityName& get_myname();
+  entity_name_t get_myrank();
+  version_t queue(LogEntry &entry);
+
+private:
+  Message *_get_mon_log_message();  // caller must hold log_lock
+  void _send_to_mon();              // caller must hold log_lock
+
+  CephContext *cct;
+  Messenger *messenger;
+  MonMap *monmap;
+  bool is_mon;
+  ceph::mutex log_lock = ceph::make_mutex("LogClient::log_lock");
+  version_t last_log_sent;  // seq of the newest entry already put into a message
+  version_t last_log;       // newest seq handed out
+  std::deque<LogEntry> log_queue;
+
+  std::map<std::string, LogChannelRef> channels;
+
+};
+#endif
diff --git a/src/common/LogEntry.cc b/src/common/LogEntry.cc
new file mode 100644
index 00000000..993bf444
--- /dev/null
+++ b/src/common/LogEntry.cc
@@ -0,0 +1,375 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+//
+#include <syslog.h>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "LogEntry.h"
+#include "Formatter.h"
+#include "include/stringify.h"
+
+// ----
+// LogEntryKey
+
+// Emit the three key fields to f as "rank", "stamp" and "seq".
+void LogEntryKey::dump(Formatter *f) const {
+  f->dump_stream("rank") << rank;
+  f->dump_stream("stamp") << stamp;
+  f->dump_unsigned("seq", seq);
+}
+
+// Appends two heap-allocated sample keys (one default, one populated) to o.
+void LogEntryKey::generate_test_instances(list<LogEntryKey*>& o) {
+  o.push_back(new LogEntryKey);
+  o.push_back(new LogEntryKey(entity_name_t::CLIENT(1234), utime_t(1,2), 34));
+}
+
+// Map a severity name to its clog_type, ignoring case.
+// Accepts "debug", "info", "sec", "warn"/"warning" and "error"/"err";
+// anything else yields CLOG_UNKNOWN.
+clog_type LogEntry::str_to_level(std::string const &str)
+{
+  std::string level_str = str;
+  // std::tolower() is only defined for values representable as unsigned
+  // char (or EOF), so the lambda takes its argument as unsigned char.
+  std::transform(level_str.begin(), level_str.end(), level_str.begin(),
+      [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+
+  if (level_str == "debug") return CLOG_DEBUG;
+  if (level_str == "info") return CLOG_INFO;
+  if (level_str == "sec") return CLOG_SEC;
+  if (level_str == "warn" || level_str == "warning") return CLOG_WARN;
+  if (level_str == "error" || level_str == "err") return CLOG_ERROR;
+
+  // no match
+  return CLOG_UNKNOWN;
+}
+
+// ----
+
+/*
+ * Translate a clog severity into the corresponding syslog priority.
+ * CLOG_SEC is reported as LOG_CRIT.
+ *
+ * A value that is not one of the five known severities aborts.
+ */
+int clog_type_to_syslog_level(clog_type t)
+{
+  if (t == CLOG_DEBUG) return LOG_DEBUG;
+  if (t == CLOG_INFO) return LOG_INFO;
+  if (t == CLOG_WARN) return LOG_WARNING;
+  if (t == CLOG_ERROR) return LOG_ERR;
+  if (t == CLOG_SEC) return LOG_CRIT;
+
+  // not a known severity
+  ceph_abort();
+  return 0;
+}
+
+// Case-insensitive parse of a severity name.  Besides the long names
+// this accepts the short forms "dbg", "inf", "warn"/"wrn", "err" and
+// "sec".
+// Unrecognized names give CLOG_UNKNOWN.
+clog_type string_to_clog_type(const string& s)
+{
+  auto is = [&s](const char *name) { return boost::iequals(s, name); };
+
+  if (is("debug") || is("dbg"))
+    return CLOG_DEBUG;
+  if (is("info") || is("inf"))
+    return CLOG_INFO;
+  if (is("warning") || is("warn") || is("wrn"))
+    return CLOG_WARN;
+  if (is("error") || is("err"))
+    return CLOG_ERROR;
+  if (is("security") || is("sec"))
+    return CLOG_SEC;
+
+  return CLOG_UNKNOWN;
+}
+
+// Case-insensitive parse of a syslog level name.  "notice" is folded
+// into LOG_INFO and "critical"/"emerg" into LOG_CRIT; a name that is
+// not recognized gives LOG_DEBUG.
+int string_to_syslog_level(string s)
+{
+  auto is = [&s](const char *name) { return boost::iequals(s, name); };
+
+  if (is("debug"))
+    return LOG_DEBUG;
+  if (is("info") || is("notice"))
+    return LOG_INFO;
+  if (is("warning") || is("warn"))
+    return LOG_WARNING;
+  if (is("error") || is("err"))
+    return LOG_ERR;
+  if (is("crit") || is("critical") || is("emerg"))
+    return LOG_CRIT;
+
+  // err on the side of noise!
+  return LOG_DEBUG;
+}
+
+// Case-insensitive parse of a syslog facility name ("auth", "daemon",
+// "local0" .. "local7", ...) into the matching LOG_* facility constant.
+//
+// Names that are not in the table below fall back to LOG_USER, which is
+// also what "user" itself maps to.
+int string_to_syslog_facility(string s)
+{
+  struct facility_name {
+    const char *name;
+    int facility;
+  };
+
+  // the table is searched front to back
+  static const facility_name known[] = {
+    {"auth", LOG_AUTH},
+    {"authpriv", LOG_AUTHPRIV},
+    {"cron", LOG_CRON},
+    {"daemon", LOG_DAEMON},
+    {"ftp", LOG_FTP},
+    {"kern", LOG_KERN},
+    // the eight "local" facilities
+    {"local0", LOG_LOCAL0},
+    {"local1", LOG_LOCAL1},
+    {"local2", LOG_LOCAL2},
+    {"local3", LOG_LOCAL3},
+    {"local4", LOG_LOCAL4},
+    {"local5", LOG_LOCAL5},
+    {"local6", LOG_LOCAL6},
+    {"local7", LOG_LOCAL7},
+    {"lpr", LOG_LPR},
+    {"mail", LOG_MAIL},
+    {"news", LOG_NEWS},
+    {"syslog", LOG_SYSLOG},
+    {"user", LOG_USER},
+    {"uucp", LOG_UUCP},
+  };
+
+  // the first case-insensitive match wins
+  for (const auto &entry : known) {
+    if (boost::iequals(s, entry.name))
+      return entry.facility;
+  }
+
+  // default to USER
+  return LOG_USER;
+}
+
+// Short textual name of a clog severity: "debug", "info", "warn", "err",
+// or "crit" (for CLOG_SEC).  A value that is not one of those five
+// severities aborts.
+string clog_type_to_string(clog_type t)
+{
+  switch (t) {
+    case CLOG_DEBUG: return "debug";
+    case CLOG_INFO:  return "info";
+    case CLOG_WARN:  return "warn";
+    case CLOG_ERROR: return "err";
+    case CLOG_SEC:   return "crit";
+    default:
+      ceph_abort();
+      // Not `return 0`: that would construct a std::string from a null
+      // pointer.  An empty string is the safe value for this fallback.
+      return string();
+  }
+}
+
+// Send this entry to syslog(3) under `facility`, unless its syslog level
+// is numerically greater than the one parsed from `level`.
+void LogEntry::log_to_syslog(string level, string facility)
+{
+  const int threshold = string_to_syslog_level(level);
+  const int severity = clog_type_to_syslog_level(prio);
+  if (severity > threshold)
+    return;
+
+  const int fac = string_to_syslog_facility(facility);
+  syslog(severity | fac, "%s %s %llu : %s", name.to_cstr(),
+         stringify(rank).c_str(), (long long unsigned)seq, msg.c_str());
+}
+
+void LogEntry::encode(bufferlist& bl, uint64_t features) const
+{
+  if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {  // v4 layout when SERVER_NAUTILUS is not in features
+    ENCODE_START(4, 2, bl);
+    __u16 t = prio;  // prio is written as a 16-bit value
+    entity_inst_t who;  // v4 carries rank plus a single legacy address
+    who.name = rank;
+    who.addr = addrs.as_legacy_addr();
+    encode(who, bl, features);
+    encode(stamp, bl);
+    encode(seq, bl);
+    encode(t, bl);
+    encode(msg, bl);
+    encode(channel, bl);
+    encode(name, bl);
+    ENCODE_FINISH(bl);
+    return;
+  }
+  ENCODE_START(5, 5, bl);  // v5 layout: name, rank and the whole addrs vector come first
+  __u16 t = prio;
+  encode(name, bl);
+  encode(rank, bl);
+  encode(addrs, bl, features);
+  encode(stamp, bl);
+  encode(seq, bl);
+  encode(t, bl);
+  encode(msg, bl);
+  encode(channel, bl);
+  ENCODE_FINISH(bl);
+}
+
+void LogEntry::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
+  if (struct_v < 5) {  // pre-v5 layout: an entity_inst_t comes first
+    __u16 t;
+    entity_inst_t who;
+    decode(who, bl);
+    rank = who.name;
+    addrs.v.clear();  // addrs ends up holding exactly the one decoded address
+    addrs.v.push_back(who.addr);
+    decode(stamp, bl);
+    decode(seq, bl);
+    decode(t, bl);
+    prio = (clog_type)t;
+    decode(msg, bl);
+    if (struct_v >= 3) {
+      decode(channel, bl);
+    } else {
+      // prior to having logging channels we only had a cluster log.
+      // Ensure we keep that appearance when the other party has no
+      // clue of what a 'channel' is.
+      channel = CLOG_CHANNEL_CLUSTER;
+    }
+    if (struct_v >= 4) {  // name is only present from v4 on; otherwise it is left as it was
+      decode(name, bl);
+    }
+  } else {  // v5 layout, same field order as the v5 branch of encode()
+    __u16 t;
+    decode(name, bl);
+    decode(rank, bl);
+    decode(addrs, bl);
+    decode(stamp, bl);
+    decode(seq, bl);
+    decode(t, bl);
+    prio = (clog_type)t;
+    decode(msg, bl);
+    decode(channel, bl);
+  }
+  DECODE_FINISH(bl);
+}
+
+// Dump every field to f; prio appears as "priority" and msg as "message".
+void LogEntry::dump(Formatter *f) const {
+  f->dump_stream("name") << name;
+  f->dump_stream("rank") << rank;
+  f->dump_object("addrs", addrs);
+  f->dump_stream("stamp") << stamp;
+  f->dump_unsigned("seq", seq);
+  f->dump_string("channel", channel);
+  f->dump_stream("priority") << prio;
+  f->dump_string("message", msg);
+}
+
+// Appends a single default-constructed, heap-allocated entry to o.
+void LogEntry::generate_test_instances(list<LogEntry*>& o) {
+  o.push_back(new LogEntry);
+}
+
+
+// -----
+
+// Flatten tail_by_channel into *tail, ordered by ascending sequence
+// number, by repeatedly taking the lowest-numbered entry not yet taken.
+void LogSummary::build_ordered_tail(list<LogEntry> *tail) const
+{
+  using entry_iter = list<pair<uint64_t,LogEntry>>::const_iterator;
+  tail->clear();
+  // channel -> (next entry not yet taken, end of that channel's list)
+  map<string,pair<entry_iter,entry_iter>> cursors;
+  for (const auto& i : tail_by_channel)
+    cursors.emplace(i.first, make_pair(i.second.begin(), i.second.end()));
+  for (;;) {
+    uint64_t min_seq = 0;  // 0 doubles as "nothing found yet"
+    entry_iter *minp = nullptr;
+    for (auto& c : cursors) {
+      entry_iter& next = c.second.first;
+      if (next == c.second.second)
+        continue;  // this channel is used up
+      if (min_seq == 0 || next->first < min_seq) {
+        min_seq = next->first;
+        minp = &next;
+      }
+    }
+    if (min_seq == 0)
+      break; // done
+    tail->push_back((*minp)->second);
+    ++(*minp);
+  }
+}
+
+void LogSummary::encode(bufferlist& bl, uint64_t features) const
+{
+  if (!HAVE_FEATURE(features, SERVER_MIMIC)) {  // v2 layout when SERVER_MIMIC is not in features
+    ENCODE_START(2, 2, bl);
+    encode(version, bl);
+    list<LogEntry> tail;
+    build_ordered_tail(&tail);  // v2 holds one flat tail, so merge the per-channel lists
+    encode(tail, bl, features);
+    ENCODE_FINISH(bl);
+    return;
+  }
+  ENCODE_START(3, 3, bl);  // v3 layout: version, seq, then the per-channel tails
+  encode(version, bl);
+  encode(seq, bl);
+  encode(tail_by_channel, bl, features);
+  ENCODE_FINISH(bl);
+}
+
+void LogSummary::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+  decode(version, bl);
+  if (struct_v < 3) {  // pre-v3 layout: one flat list of entries
+    list<LogEntry> tail;
+    decode(tail, bl);
+    for (auto& i : tail) {
+      add(i);  // files the entry under its channel with the next seq
+    }
+  } else {
+    decode(seq, bl);
+    decode(tail_by_channel, bl);
+  }
+  DECODE_FINISH(bl);
+  keys.clear();  // rebuild the key set from what tail_by_channel now holds
+  for (auto& i : tail_by_channel) {
+    for (auto& e : i.second) {
+      keys.insert(e.second.key());
+    }
+  }
+}
+
+// Dump "version" and then, under "tail_by_channel", one section per
+// channel holding that channel's entries keyed by their seq number.
+void LogSummary::dump(Formatter *f) const
+{
+  f->dump_unsigned("version", version);
+  f->open_object_section("tail_by_channel");
+  for (const auto& [channel, entries] : tail_by_channel) {
+    f->open_object_section(channel.c_str());
+    for (const auto& [entry_seq, entry] : entries)
+      f->dump_object(stringify(entry_seq).c_str(), entry);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Appends one default-constructed, heap-allocated summary to o.
+void LogSummary::generate_test_instances(list<LogSummary*>& o) {
+  o.push_back(new LogSummary);
+  // more!
+}
+
diff --git a/src/common/LogEntry.h b/src/common/LogEntry.h
new file mode 100644
index 00000000..2501faf0
--- /dev/null
+++ b/src/common/LogEntry.h
@@ -0,0 +1,179 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_LOGENTRY_H
+#define CEPH_LOGENTRY_H
+
+#include "include/utime.h"
+#include "msg/msg_types.h"
+#include "common/entity_name.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+typedef enum {
+  CLOG_DEBUG = 0,  // streamed as "[DBG]"
+  CLOG_INFO = 1,  // streamed as "[INF]"
+  CLOG_SEC = 2,  // streamed as "[SEC]"
+  CLOG_WARN = 3,  // streamed as "[WRN]"
+  CLOG_ERROR = 4,  // streamed as "[ERR]"
+  CLOG_UNKNOWN = -1,  // result of the string parsers for unrecognized names; streamed as "[???]"
+} clog_type;
+
+static const std::string CLOG_CHANNEL_NONE    = "none";
+static const std::string CLOG_CHANNEL_DEFAULT = "cluster";
+static const std::string CLOG_CHANNEL_CLUSTER = "cluster";
+static const std::string CLOG_CHANNEL_AUDIT   = "audit";
+
+// this is the key name used in the config options for the default, e.g.
+//   default=true foo=false bar=false
+static const std::string CLOG_CONFIG_DEFAULT_KEY = "default";
+
+/*
+ * Given a clog log_type, return the equivalent syslog priority
+ */
+int clog_type_to_syslog_level(clog_type t);
+
+clog_type string_to_clog_type(const string& s);
+int string_to_syslog_level(string s);
+int string_to_syslog_facility(string s);
+
+string clog_type_to_string(clog_type t);
+
+
+// Identity of a log entry -- (rank, stamp, seq) -- with a hash that is
+// computed once, in the three-argument constructor.
+struct LogEntryKey {
+private:
+  uint64_t _hash = 0;  // stays 0 for a default-constructed key
+
+  // the hash covers seq and rank only; stamp still takes part in operator==
+  void _calc_hash() {
+    _hash = seq + hash<entity_name_t>()(rank);
+  }
+
+  entity_name_t rank;
+  utime_t stamp;
+  uint64_t seq = 0;
+
+public:
+  LogEntryKey() {}
+  LogEntryKey(const entity_name_t& w, utime_t t, uint64_t s)
+    : rank(w), stamp(t), seq(s) {
+    _calc_hash();
+  }
+
+  uint64_t get_hash() const { return _hash; }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<LogEntryKey*>& o);
+
+  friend bool operator==(const LogEntryKey& l, const LogEntryKey& r) {
+    return l.seq == r.seq && l.stamp == r.stamp && l.rank == r.rank;
+  }
+};
+
+namespace std {
+  // hashing a LogEntryKey just returns the value it computed for itself
+  template<>
+  struct hash<LogEntryKey> {
+    size_t operator()(const LogEntryKey& r) const { return r.get_hash(); }
+  };
+} // namespace std
+
+struct LogEntry {
+  EntityName name;  // EntityName of the logging entity
+  entity_name_t rank;  // entity_name_t of the logging entity
+  entity_addrvec_t addrs;  // its addresses
+  utime_t stamp;  // time the entry was created
+  uint64_t seq;  // sequence number
+  clog_type prio;  // severity
+  string msg;  // message text
+  string channel;  // name of the channel the entry was logged on
+
+  LogEntry() : seq(0), prio(CLOG_DEBUG) {}
+
+  LogEntryKey key() const { return LogEntryKey(rank, stamp, seq); }  // identity: (rank, stamp, seq)
+
+  void log_to_syslog(string level, string facility);
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<LogEntry*>& o);
+  static clog_type str_to_level(std::string const &str);  // case-insensitive name -> clog_type
+};
+WRITE_CLASS_ENCODER_FEATURES(LogEntry)
+
+struct LogSummary {
+  version_t version;
+  // channel -> [(seq#, entry), ...]
+  map<string,list<pair<uint64_t,LogEntry>>> tail_by_channel;
+  uint64_t seq = 0;  // last seq# handed out by add()
+  ceph::unordered_set<LogEntryKey> keys;  // keys of the entries put into tail_by_channel
+
+  LogSummary() : version(0) {}
+
+  void build_ordered_tail(list<LogEntry> *tail) const;
+
+  // append e to its channel's tail under the next seq#
+  void add(const LogEntry& e) {
+    keys.insert(e.key());
+    tail_by_channel[e.channel].emplace_back(++seq, e);
+  }
+  // trim every channel's tail to at most max entries, oldest first
+  void prune(size_t max) {
+    for (auto& [channel, entries] : tail_by_channel) {
+      for (; entries.size() > max; entries.pop_front())
+        keys.erase(entries.front().second.key());
+    }
+  }
+  bool contains(const LogEntryKey& k) const {
+    return keys.count(k) != 0;
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<LogSummary*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(LogSummary)
+
+// Print a severity as its bracketed three-letter tag, e.g. "[WRN]".
+// Any value other than the five severities (CLOG_UNKNOWN included)
+// prints as "[???]".
+inline ostream& operator<<(ostream& out, const clog_type t)
+{
+  const char *tag = "[???]";
+
+  switch (t) {
+  case CLOG_DEBUG: tag = "[DBG]"; break;
+  case CLOG_INFO:  tag = "[INF]"; break;
+  case CLOG_SEC:   tag = "[SEC]"; break;
+  case CLOG_WARN:  tag = "[WRN]"; break;
+  case CLOG_ERROR: tag = "[ERR]"; break;
+  default: break;
+  }
+  return out << tag;
+}
+
+// One-line form: "<stamp> <name> (<rank>) <seq> : <channel> <prio> <msg>".
+inline ostream& operator<<(ostream& out, const LogEntry& e)
+{
+  out << e.stamp << " " << e.name << " (" << e.rank << ") " << e.seq << " : ";
+  return out << e.channel << " " << e.prio << " " << e.msg;
+}
+
+#endif
diff --git a/src/common/MemoryModel.cc b/src/common/MemoryModel.cc
new file mode 100644
index 00000000..14d31cc9
--- /dev/null
+++ b/src/common/MemoryModel.cc
@@ -0,0 +1,94 @@
+#include "MemoryModel.h"
+#include "include/compat.h"
+#include "debug.h"
+#if defined(__linux__)
+#include <malloc.h>
+#endif
+
+#include <fstream>
+
+#define dout_subsys ceph_subsys_
+
+// Keep the context pointer; _sample() uses it for its log output.
+MemoryModel::MemoryModel(CephContext *cct_) : cct{cct_}
+{
+}
+
+// Refresh *psnap: Vm* counters from /proc/self/status, plus a heap estimate
+// (in KiB) summed over the 'rw' anonymous ranges of /proc/self/maps.  Counters
+// whose line is missing keep their old value; a file that cannot be opened is
+// logged and ends the sample early.
+void MemoryModel::_sample(snap *psnap)
+{
+  ifstream f;
+  string line;
+
+  f.open(PROCPREFIX "/proc/self/status");
+  if (!f.is_open()) {
+    ldout(cct, 0) << "check_memory_usage unable to open " PROCPREFIX "/proc/self/status" << dendl;
+    return;
+  }
+  // If line starts with key, parse the number after it into *field.  The
+  // offset is the key's own length, so a bare "VmRSS:" line is never overrun.
+  auto grab = [&line](const char *key, long *field) {
+    const size_t len = strlen(key);
+    if (strncmp(line.c_str(), key, len) != 0)
+      return false;
+    *field = atol(line.c_str() + len);
+    return true;
+  };
+  // Drive the loop from getline() so that a failed read is never parsed.
+  while (getline(f, line)) {
+    if (grab("VmSize:", &psnap->size)) continue;
+    if (grab("VmRSS:", &psnap->rss)) continue;
+    if (grab("VmHWM:", &psnap->hwm)) continue;
+    if (grab("VmLib:", &psnap->lib)) continue;
+    if (grab("VmPeak:", &psnap->peak)) continue;
+    grab("VmData:", &psnap->data);
+  }
+  f.close();
+
+  f.open(PROCPREFIX "/proc/self/maps");
+  if (!f.is_open()) {
+    ldout(cct, 0) << "check_memory_usage unable to open " PROCPREFIX "/proc/self/maps" << dendl;
+    return;
+  }
+
+  long heap = 0;
+  while (getline(f, line)) {
+    // expected shape: "<start>-<end> <perms> <offset> <dev> <inode> [<path>]"
+    const char *start = line.c_str();
+    const char *dash = strchr(start, '-');
+    if (!dash)
+      continue;
+    const char *end = strchr(dash + 1, ' ');
+    if (!end)
+      continue;
+    // the range bounds are unsigned, so parse them with strtoull
+    unsigned long long as = strtoull(start, 0, 16);
+    unsigned long long ae = strtoull(dash+1, 0, 16);
+    end++;
+    const char *mode = end;
+
+    // step over perms, offset, dev and inode without walking past the NUL
+    int skip = 4;
+    while (skip && *end) {
+      end++;
+      while (*end && *end != ' ') end++;
+      skip--;
+    }
+    if (skip)
+      continue;  // truncated line: fewer fields than expected
+    if (*end)
+      end++;
+
+    long size = ae - as;
+    /*
+     * anything 'rw' and anon is assumed to be heap.
+     */
+    if (mode[0] == 'r' && mode[1] == 'w' && !*end)
+      heap += size;
+  }
+
+  psnap->heap = heap >> 10;
+}
diff --git a/src/common/MemoryModel.h b/src/common/MemoryModel.h
new file mode 100644
index 00000000..dc529b38
--- /dev/null
+++ b/src/common/MemoryModel.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MEMORYMODEL_H
+#define CEPH_MEMORYMODEL_H
+
+class CephContext;
+
+class MemoryModel {
+public:
+  // One sample of the process' memory counters, as filled in by _sample().
+  struct snap {
+    long peak = 0;
+    long size = 0;
+    long hwm = 0;
+    long rss = 0;
+    long data = 0;
+    long lib = 0;
+    long heap = 0;
+
+    snap() {}
+
+    long get_total() { return size; }
+    long get_rss() { return rss; }
+    long get_heap() { return heap; }
+  } last;  // most recent sample taken through sample()
+
+private:
+  CephContext *cct;
+  void _sample(snap *p);
+
+public:
+  explicit MemoryModel(CephContext *cct);
+  // Re-sample into `last`; when p is non-null, also copy the result to *p.
+  void sample(snap *p = 0) {
+    _sample(&last);
+    if (p != nullptr) {
+      *p = last;
+    }
+  }
+};
+
+#endif
diff --git a/src/common/Mutex.cc b/src/common/Mutex.cc
new file mode 100644
index 00000000..e029adc7
--- /dev/null
+++ b/src/common/Mutex.cc
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/Mutex.h"
+#include "common/config.h"
+#include "common/Clock.h"
+#include "common/valgrind.h"
+
+Mutex::Mutex(const std::string &n, bool r, bool ld,
+	     bool bt) :
+  name(n), id(-1), recursive(r), lockdep(ld), backtrace(bt), nlock(0),
+  locked_by(0)
+{
+  ANNOTATE_BENIGN_RACE_SIZED(&id, sizeof(id), "Mutex lockdep id");
+  ANNOTATE_BENIGN_RACE_SIZED(&nlock, sizeof(nlock), "Mutex nlock");
+  ANNOTATE_BENIGN_RACE_SIZED(&locked_by, sizeof(locked_by), "Mutex locked_by");
+
+  if (!recursive && !lockdep) {
+    // If the mutex type is PTHREAD_MUTEX_DEFAULT, attempting to recursively
+    // lock the mutex results in undefined behavior. Attempting to unlock the
+    // mutex if it was not locked by the calling thread results in undefined
+    // behavior. Attempting to unlock the mutex if it is not locked results in
+    // undefined behavior.
+    pthread_mutex_init(&_m, NULL);
+    return;
+  }
+
+  // Every other combination needs an explicit mutex type:
+  //
+  //  - recursive: PTHREAD_MUTEX_RECURSIVE.  Mutexes of this type do all the
+  //    same checks as mutexes of type PTHREAD_MUTEX_ERRORCHECK.
+  //
+  //  - lockdep only: PTHREAD_MUTEX_ERRORCHECK.  If a thread attempts to
+  //    relock a mutex that it has already locked, an error shall be
+  //    returned. If a thread attempts to unlock a mutex that it has not
+  //    locked or a mutex which is unlocked, an error shall be returned.
+  const int type =
+    recursive ? PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_ERRORCHECK;
+  pthread_mutexattr_t attr;
+  pthread_mutexattr_init(&attr);
+  pthread_mutexattr_settype(&attr, type);
+  pthread_mutex_init(&_m, &attr);
+  pthread_mutexattr_destroy(&attr);
+
+  // Both typed variants register with lockdep, but only when this mutex
+  // asked for lockdep tracking (ld) and g_lockdep is set at construction
+  // time.
+  if (lockdep && g_lockdep)
+    _register();
+}
+
+// The mutex must not be held (nlock == 0) when it is destroyed.
+Mutex::~Mutex() {
+  ceph_assert(nlock == 0);
+  // helgrind gets confused by condition wakeups leading to mutex destruction
+  ANNOTATE_BENIGN_RACE_SIZED(&_m, sizeof(_m), "Mutex primitive");
+  pthread_mutex_destroy(&_m);
+
+  if (!lockdep || !g_lockdep)
+    return;
+  lockdep_unregister(id);
+}
+
+void Mutex::lock(bool no_lockdep)
+{
+  if (!recursive && !no_lockdep && lockdep && g_lockdep) _will_lock();  // pre-lock lockdep hook
+  const int rc = pthread_mutex_lock(&_m);
+  ceph_assert(rc == 0);
+  if (lockdep && g_lockdep) { _locked(); }
+  _post_lock();  // update nlock / locked_by now that the mutex is held
+}
+
+void Mutex::unlock()
+{
+  _pre_unlock();  // ownership bookkeeping happens while the mutex is still held
+  if (lockdep && g_lockdep) { _will_unlock(); }
+  const int rc = pthread_mutex_unlock(&_m);
+  ceph_assert(rc == 0);
+}
diff --git a/src/common/Mutex.h b/src/common/Mutex.h
new file mode 100644
index 00000000..792ba323
--- /dev/null
+++ b/src/common/Mutex.h
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MUTEX_H
+#define CEPH_MUTEX_H
+
+#include "include/ceph_assert.h"
+#include "lockdep.h"
+
+#include <string>
+#include <pthread.h>
+#include <mutex>
+
+using namespace ceph;
+
+class Mutex {
+private:
+  std::string name;  // name passed to the lockdep hooks
+  int id;            // lockdep id; every lockdep hook below updates it
+  bool recursive;
+  bool lockdep;
+  bool backtrace;  // gather backtrace on lock acquisition
+
+  pthread_mutex_t _m;
+  int nlock;            // hold count maintained by _post_lock()/_pre_unlock()
+  pthread_t locked_by;  // owner thread; only tracked when !recursive
+
+  // don't allow copying: declared private, never defined in this header.
+  void operator=(const Mutex &M);
+  Mutex(const Mutex &M);
+
+  // Thin wrappers around the lockdep hooks; each stores the returned id.
+  void _register() { id = lockdep_register(name.c_str()); }
+  // about to lock
+  void _will_lock() {
+    id = lockdep_will_lock(name.c_str(), id, backtrace, recursive);
+  }
+  // just locked
+  void _locked() { id = lockdep_locked(name.c_str(), id, backtrace); }
+  // about to unlock
+  void _will_unlock() { id = lockdep_will_unlock(name.c_str(), id); }
+
+public:
+  Mutex(const std::string &n, bool r = false, bool ld=true, bool bt=false);
+  ~Mutex();
+
+  bool is_locked() const { return nlock > 0; }
+  // True only while held and recorded as held by the calling thread.
+  bool is_locked_by_me() const {
+    if (nlock <= 0)
+      return false;
+    return locked_by == pthread_self();
+  }
+
+  bool TryLock() { return try_lock(); }
+  // Non-blocking acquire; returns true when the mutex was obtained.
+  bool try_lock() {
+    if (pthread_mutex_trylock(&_m) != 0)
+      return false;
+    if (lockdep && g_lockdep)
+      _locked();
+    _post_lock();
+    return true;
+  }
+
+  void Lock(bool no_lockdep=false) { lock(no_lockdep); }
+  void lock(bool no_lockdep=false);
+
+  // Bookkeeping done right after the pthread mutex has been acquired.
+  void _post_lock() {
+    if (!recursive) {
+      // a non-recursive mutex cannot already be held at this point
+      ceph_assert(nlock == 0);
+      locked_by = pthread_self();
+    }
+    ++nlock;
+  }
+
+  // Bookkeeping done just before the pthread mutex is released.
+  void _pre_unlock() {
+    ceph_assert(nlock > 0);
+    --nlock;
+    if (recursive)
+      return;
+    ceph_assert(locked_by == pthread_self());
+    locked_by = 0;
+    ceph_assert(nlock == 0);
+  }
+
+  void Unlock() { unlock(); }
+  void unlock();
+
+  friend class Cond;
+
+
+public:
+  // Scoped guard: std::lock_guard calls lock() on entry and unlock() on exit.
+  using Locker = std::lock_guard<Mutex>;
+};
+
+
+#endif
diff --git a/src/common/OpQueue.h b/src/common/OpQueue.h
new file mode 100644
index 00000000..db98cbd3
--- /dev/null
+++ b/src/common/OpQueue.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef OP_QUEUE_H
+#define OP_QUEUE_H
+
+#include "include/msgr.h"
+
+#include <list>
+#include <functional>
+
+namespace ceph {
+  class Formatter;
+}
+
+/**
+ * Abstract class for all Op Queues
+ *
+ * In order to provide optimized code, be sure to declare all
+ * virtual functions as final in the derived class.
+ */
+
+template <typename T, typename K>
+class OpQueue {
+  // T is the queued item type; K is the class key that ops are grouped by.
+  public:
+    // Ops of this class should be deleted immediately. If out isn't
+    // nullptr then items should be added to the front in
+    // front-to-back order. The typical strategy is to visit items in
+    // the queue in *reverse* order and to use *push_front* to insert
+    // them into out.
+    virtual void remove_by_class(K k, std::list<T> *out) = 0;
+    // Enqueue op at the back of the strict queue
+    virtual void enqueue_strict(K cl, unsigned priority, T &&item) = 0;
+    // Enqueue op at the front of the strict queue
+    virtual void enqueue_strict_front(K cl, unsigned priority, T &&item) = 0;
+    // Enqueue op at the back of the regular queue
+    virtual void enqueue(K cl, unsigned priority, unsigned cost, T &&item) = 0;
+    // Enqueue op at the front of the regular queue
+    virtual void enqueue_front(
+      K cl, unsigned priority, unsigned cost, T &&item) = 0;
+    // Returns whether the queue is empty
+    virtual bool empty() const = 0;
+    // Return the next op to be dispatched
+    virtual T dequeue() = 0;
+    // Formatted output of the queue
+    virtual void dump(ceph::Formatter *f) const = 0;
+    // Virtual so that derived queues are destroyed correctly through OpQueue*
+    virtual ~OpQueue() {}; 
+};
+
+#endif
diff --git a/src/common/OutputDataSocket.cc b/src/common/OutputDataSocket.cc
new file mode 100644
index 00000000..8d1c00f4
--- /dev/null
+++ b/src/common/OutputDataSocket.cc
@@ -0,0 +1,405 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <poll.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include "common/OutputDataSocket.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/safe_io.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_asok
+#undef dout_prefix
+#define dout_prefix *_dout << "asok(" << (void*)m_cct << ") "
+
+using std::ostringstream;
+
+/*
+ * UNIX domain sockets created by an application persist even after that
+ * application closes, unless they're explicitly unlinked. This is because the
+ * directory containing the socket keeps a reference to the socket.
+ *
+ * This code makes things a little nicer by unlinking those dead sockets when
+ * the application exits normally.
+ */
+static pthread_mutex_t cleanup_lock = PTHREAD_MUTEX_INITIALIZER;  // guards the two variables below
+static std::vector <const char*> cleanup_files;  // strdup()ed socket paths to unlink at exit
+static bool cleanup_atexit = false;  // true once the atexit() hook has been installed
+
+// Unlink `file` and drop its first matching entry from the cleanup list.
+static void remove_cleanup_file(const char *file)
+{
+  pthread_mutex_lock(&cleanup_lock);
+  VOID_TEMP_FAILURE_RETRY(unlink(file));
+  auto it = cleanup_files.begin();
+  while (it != cleanup_files.end() && strcmp(file, *it) != 0)
+    ++it;
+  if (it != cleanup_files.end()) {
+    free((void*)*it);  // entries are the strdup() copies made on insertion
+    cleanup_files.erase(it);
+  }
+  pthread_mutex_unlock(&cleanup_lock);
+}
+
+// atexit() hook: unlink and free every path still on the cleanup list.
+static void remove_all_cleanup_files()
+{
+  pthread_mutex_lock(&cleanup_lock);
+  for (const char *path : cleanup_files) {
+    VOID_TEMP_FAILURE_RETRY(unlink(path));
+    free((void*)path);
+  }
+  cleanup_files.clear();
+  pthread_mutex_unlock(&cleanup_lock);
+}
+
+// Queue a private copy of `file` for unlinking at exit (no-op if strdup fails).
+static void add_cleanup_file(const char *file)
+{
+  char *copy = strdup(file);
+  if (copy == nullptr) return;
+  pthread_mutex_lock(&cleanup_lock);
+  cleanup_files.push_back(copy);
+  if (!cleanup_atexit) {
+    atexit(remove_all_cleanup_files);
+    cleanup_atexit = true;
+  }
+  pthread_mutex_unlock(&cleanup_lock);
+}
+
+
+// Start with no socket and no shutdown pipe (all descriptors -1), an empty
+// backlog and a zeroed skip counter; _backlog is the backlog size limit.
+OutputDataSocket::OutputDataSocket(CephContext *cct, uint64_t _backlog)
+  : m_cct{cct},
+    data_max_backlog{_backlog},
+    m_sock_fd{-1}, m_shutdown_rd_fd{-1}, m_shutdown_wr_fd{-1},
+    going_down{false},
+    data_size{0},
+    skipped{0}
+{
+}
+
+// Destruction simply runs shutdown().
+OutputDataSocket::~OutputDataSocket() {
+  shutdown();
+}
+
+/*
+ * This thread listens on the UNIX domain socket for incoming connections.
+ * It only handles one connection at a time at the moment. All I/O is nonblocking,
+ * so that we can implement sensible timeouts. [TODO: make all I/O nonblocking]
+ *
+ * This thread also listens to m_shutdown_rd_fd. If there is any data sent to this
+ * pipe, the thread terminates itself gracefully, allowing the
+ * OutputDataSocketConfigObs class to join() it.
+ */
+
+#define PFL_SUCCESS ((void*)(intptr_t)0)
+#define PFL_FAIL ((void*)(intptr_t)1)
+
+// Returns "" with both pipe ends stored, or an error description on failure.
+std::string OutputDataSocket::create_shutdown_pipe(int *pipe_rd, int *pipe_wr)
+{
+  int ends[2];
+  if (pipe_cloexec(ends) >= 0) {
+    *pipe_rd = ends[0];
+    *pipe_wr = ends[1];
+    return "";
+  }
+  const int saved_errno = errno;
+  ostringstream msg;
+  msg << "OutputDataSocket::create_shutdown_pipe error: " << cpp_strerror(saved_errno);
+  return msg.str();
+}
+
+std::string OutputDataSocket::bind_and_listen(const std::string &sock_path, int *fd)
+{
+  ldout(m_cct, 5) << "bind_and_listen " << sock_path << dendl;
+  // Returns "" and stores the listening socket in *fd on success, otherwise an error message.
+  struct sockaddr_un address;
+  if (sock_path.size() > sizeof(address.sun_path) - 1) {  // one byte is kept for the NUL
+    ostringstream oss;
+    oss << "OutputDataSocket::bind_and_listen: "
+	<< "The UNIX domain socket path " << sock_path << " is too long! The "
+	<< "maximum length on this system is "
+	<< (sizeof(address.sun_path) - 1);
+    return oss.str();
+  }
+  int sock_fd = socket_cloexec(PF_UNIX, SOCK_STREAM, 0);
+  if (sock_fd < 0) {
+    int err = errno;  // capture errno before any other call can overwrite it
+    ostringstream oss;
+    oss << "OutputDataSocket::bind_and_listen: "
+	<< "failed to create socket: " << cpp_strerror(err);
+    return oss.str();
+  }
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&address, 0, sizeof(struct sockaddr_un));
+  address.sun_family = AF_UNIX;
+  snprintf(address.sun_path, sizeof(address.sun_path),
+	   "%s", sock_path.c_str());
+  if (::bind(sock_fd, (struct sockaddr*)&address,
+	   sizeof(struct sockaddr_un)) != 0) {
+    int err = errno;
+    if (err == EADDRINUSE) {
+      // The old UNIX domain socket must still be there.
+      // Let's unlink it and try again.
+      VOID_TEMP_FAILURE_RETRY(unlink(sock_path.c_str()));
+      if (::bind(sock_fd, (struct sockaddr*)&address,
+	       sizeof(struct sockaddr_un)) == 0) {
+	err = 0;  // the retry succeeded
+      }
+      else {
+	err = errno;  // report the error of the second attempt
+      }
+    }
+    if (err != 0) {
+      ostringstream oss;
+      oss << "OutputDataSocket::bind_and_listen: "
+	  << "failed to bind the UNIX domain socket to '" << sock_path
+	  << "': " << cpp_strerror(err);
+      close(sock_fd);
+      return oss.str();
+    }
+  }
+  if (listen(sock_fd, 5) != 0) {  // accept queue length of 5
+    int err = errno;
+    ostringstream oss;
+    oss << "OutputDataSocket::bind_and_listen: "
+	  << "failed to listen to socket: " << cpp_strerror(err);
+    close(sock_fd);
+    VOID_TEMP_FAILURE_RETRY(unlink(sock_path.c_str()));  // remove the path bound above
+    return oss.str();
+  }
+  *fd = sock_fd;  // ownership of the descriptor passes to the caller
+  return "";
+}
+
+// Thread body: wait on the listening socket and the shutdown pipe, serving
+// one connection at a time until the shutdown pipe becomes readable.
+// Returns PFL_SUCCESS on a requested shutdown and PFL_FAIL if poll() fails.
+void* OutputDataSocket::entry()
+{
+  ldout(m_cct, 5) << "entry start" << dendl;
+  while (true) {
+    struct pollfd fds[2];
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(fds, 0, sizeof(fds));
+    fds[0].fd = m_sock_fd;
+    fds[0].events = POLLIN | POLLRDBAND;
+    fds[1].fd = m_shutdown_rd_fd;
+    fds[1].events = POLLIN | POLLRDBAND;
+
+    int ret = poll(fds, 2, -1);
+    if (ret < 0) {
+      int err = errno;
+      if (err == EINTR) continue;  // interrupted by a signal: poll again
+      lderr(m_cct) << "OutputDataSocket: poll(2) error: '"
+		   << cpp_strerror(err) << "'" << dendl;
+      return PFL_FAIL;
+    }
+
+    if (fds[0].revents & POLLIN) {
+      // Send out some data
+      do_accept();
+    }
+    if (fds[1].revents & POLLIN) {
+      // Parent wants us to shut down; leave the loop so the exit gets logged
+      break;
+    }
+  }
+  ldout(m_cct, 5) << "entry exit" << dendl;
+  return PFL_SUCCESS;
+}
+
+
+// Accept and fully serve one connection; false only when accept itself fails.
+bool OutputDataSocket::do_accept()
+{
+  struct sockaddr_un address;
+  socklen_t address_length = sizeof(address);
+  ldout(m_cct, 30) << "OutputDataSocket: calling accept" << dendl;
+  int connection_fd = accept_cloexec(m_sock_fd, (struct sockaddr*) &address,
+			     &address_length);
+  if (connection_fd < 0) {
+    int err = errno;
+    lderr(m_cct) << "OutputDataSocket: do_accept error: '"
+			   << cpp_strerror(err) << "'" << dendl;
+    return false;
+  }
+  ldout(m_cct, 30) << "OutputDataSocket: finished accept" << dendl;
+
+  handle_connection(connection_fd);
+  close_connection(connection_fd);
+  return true;  // the connection was accepted and served
+}
+
+// Serve one client: send the connection-init buffer (if any), flush the
+// queued data, then keep flushing on every wakeup until a write fails or
+// shutdown is requested.
+void OutputDataSocket::handle_connection(int fd)
+{
+  bufferlist init_bl;
+  {
+    std::lock_guard guard(m_lock);
+    init_connection(init_bl);
+  }
+
+  /* need to special case the connection init buffer output, as it needs
+   * to be dumped before any data, including older data that was sent
+   * before the connection was established, or before we identified
+   * older connection was broken
+   */
+  if (init_bl.length() &&
+      safe_write(fd, init_bl.c_str(), init_bl.length()) < 0) {
+    return;
+  }
+
+  if (dump_data(fd) < 0)
+    return;
+
+  for (;;) {
+    {
+      std::unique_lock l(m_lock);
+      // sleep until append_output() or shutdown() signals the condition
+      if (!going_down)
+	cond.wait(l);
+      if (going_down)
+	return;
+    }
+    if (dump_data(fd) < 0)
+      return;
+  }
+}
+
+// Write every queued buffer, each followed by delim, to fd.  Returns 0 on
+// success.  On a write error the buffers not written successfully are put
+// back at the *front* of the queue, so they stay ahead of anything that
+// append_output() queued in the meantime, and the error is returned.
+int OutputDataSocket::dump_data(int fd)
+{
+  m_lock.lock();
+  vector<buffer::list> l = std::move(data);
+  data.clear();
+  data_size = 0;
+  m_lock.unlock();
+
+  for (auto iter = l.begin(); iter != l.end(); ++iter) {
+    int ret = safe_write(fd, iter->c_str(), iter->length());
+    if (ret >= 0)
+      ret = safe_write(fd, delim.c_str(), delim.length());
+    if (ret < 0) {
+      std::scoped_lock lock(m_lock);
+      for (auto rest = iter; rest != l.end(); ++rest)
+	data_size += rest->length();
+      data.insert(data.begin(), iter, l.end());
+      return ret;
+    }
+  }
+
+  return 0;
+}
+
+// Close a connection descriptor once do_accept() is done with it.
+void OutputDataSocket::close_connection(int fd) {
+  VOID_TEMP_FAILURE_RETRY(close(fd));
+}
+
+// Set up the shutdown pipe and the listening socket, then start the thread.
+bool OutputDataSocket::init(const std::string &path)
+{
+  ldout(m_cct, 5) << "init " << path << dendl;
+
+  /* Set up things for the new thread */
+  int shutdown_rd = -1, shutdown_wr = -1;
+  const std::string pipe_err = create_shutdown_pipe(&shutdown_rd, &shutdown_wr);
+  if (!pipe_err.empty()) {
+    lderr(m_cct) << "OutputDataSocketConfigObs::init: error: " << pipe_err << dendl;
+    return false;
+  }
+  int listen_fd;
+  const std::string bind_err = bind_and_listen(path, &listen_fd);
+  if (!bind_err.empty()) {
+    lderr(m_cct) << "OutputDataSocketConfigObs::init: failed: " << bind_err << dendl;
+    close(shutdown_rd);
+    close(shutdown_wr);
+    return false;
+  }
+
+  /* Create new thread */
+  m_sock_fd = listen_fd;
+  m_shutdown_rd_fd = shutdown_rd;
+  m_shutdown_wr_fd = shutdown_wr;
+  m_path = path;
+  create("out_data_socket");
+  add_cleanup_file(m_path.c_str());
+  return true;
+}
+
+// Wake and join the thread, close our descriptors and unlink the socket file.
+void OutputDataSocket::shutdown()
+{
+  m_lock.lock();
+  going_down = true;
+  cond.notify_all();
+  m_lock.unlock();
+  if (m_shutdown_wr_fd < 0)
+    return;  // init() never completed, or shutdown() already ran
+
+  ldout(m_cct, 5) << "shutdown" << dendl;
+  // Send a byte to the shutdown pipe that the thread is listening to
+  char buf[1] = { 0x0 };
+  int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf));
+  VOID_TEMP_FAILURE_RETRY(close(m_shutdown_wr_fd));
+  m_shutdown_wr_fd = -1;
+  if (ret == 0) {
+    join();
+    // the thread has exited, so nothing polls these descriptors any more
+    VOID_TEMP_FAILURE_RETRY(close(m_shutdown_rd_fd));
+    VOID_TEMP_FAILURE_RETRY(close(m_sock_fd));
+    m_shutdown_rd_fd = m_sock_fd = -1;
+  } else {
+    lderr(m_cct) << "OutputDataSocket::shutdown: failed to write "
+      "to thread shutdown pipe: error " << ret << dendl;
+  }
+  remove_cleanup_file(m_path.c_str());
+  m_path.clear();
+}
+
+// Queue bl for the reader, or drop it when the backlog limit would be exceeded.
+void OutputDataSocket::append_output(bufferlist& bl)
+{
+  std::lock_guard l(m_lock);
+  if (data_size + bl.length() <= data_max_backlog) {
+    data.push_back(bl);
+    data_size += bl.length();
+    cond.notify_all();
+    return;
+  }
+  // Over the limit: drop bl; log only when the skip counter is a multiple of 100.
+  if (skipped % 100 != 0) {
+    ++skipped;
+    return;
+  }
+  ldout(m_cct, 0) << "dropping data output, max backlog reached (skipped=="
+		  << skipped << ")" << dendl;
+  skipped = 1;
+}
diff --git a/src/common/OutputDataSocket.h b/src/common/OutputDataSocket.h
new file mode 100644
index 00000000..682dfcc6
--- /dev/null
+++ b/src/common/OutputDataSocket.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_COMMON_OUTPUTDATASOCKET_H
+#define CEPH_COMMON_OUTPUTDATASOCKET_H
+
+#include "common/ceph_mutex.h"
+#include "common/Thread.h"
+#include "include/buffer.h"
+
+class CephContext;
+
+class OutputDataSocket : public Thread
+{
+public:
+  OutputDataSocket(CephContext *cct, uint64_t _backlog);  // _backlog: limit for data_size
+  ~OutputDataSocket() override;
+
+  bool init(const std::string &path);  // path of the UNIX domain socket to listen on
+  
+  void append_output(bufferlist& bl);  // queue bl for the connected reader
+
+protected:
+  virtual void init_connection(bufferlist& bl) {}  // hook: fill bl with data sent first on a new connection
+  void shutdown();
+
+  std::string create_shutdown_pipe(int *pipe_rd, int *pipe_wr);  // "" on success, else an error message
+  std::string bind_and_listen(const std::string &sock_path, int *fd);  // "" on success, else an error message
+
+  void *entry() override;  // thread body
+  bool do_accept();
+
+  void handle_connection(int fd);
+  void close_connection(int fd);
+
+  int dump_data(int fd);  // 0 on success, negative write error otherwise
+
+  CephContext *m_cct;
+  uint64_t data_max_backlog;  // append_output() drops data that would push data_size past this
+  std::string m_path;  // socket path given to init()
+  int m_sock_fd;  // listening socket, -1 until init()
+  int m_shutdown_rd_fd;  // read end of the shutdown pipe, polled by entry()
+  int m_shutdown_wr_fd;  // write end of the shutdown pipe, written by shutdown()
+  bool going_down;  // set by shutdown() under m_lock
+
+  uint64_t data_size;  // summed length of the buffers in data
+  uint32_t skipped;  // drop counter used to limit the backlog warning
+
+  std::vector<buffer::list> data;  // buffers waiting to be written by dump_data()
+
+  ceph::mutex m_lock = ceph::make_mutex("OutputDataSocket::m_lock");
+  ceph::condition_variable cond;  // notified by append_output() and shutdown()
+  buffer::list delim;  // written after every buffer by dump_data()
+};
+
+#endif
diff --git a/src/common/PluginRegistry.cc b/src/common/PluginRegistry.cc
new file mode 100644
index 00000000..2cb7fcee
--- /dev/null
+++ b/src/common/PluginRegistry.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include "PluginRegistry.h"
+#include "ceph_ver.h"
+#include "common/ceph_context.h"
+#include "common/errno.h"
+#include "common/debug.h"
+
+#include <dlfcn.h>
+
+#define PLUGIN_PREFIX "libceph_"
+#ifdef __APPLE__
+#define PLUGIN_SUFFIX ".dylib"
+#else
+#define PLUGIN_SUFFIX ".so"
+#endif
+#define PLUGIN_INIT_FUNCTION "__ceph_plugin_init"
+#define PLUGIN_VERSION_FUNCTION "__ceph_plugin_version"
+
+#define dout_subsys ceph_subsys_context
+
+// Start with both flags cleared, so the destructor will dlclose() plugin
+// libraries unless disable_dlclose is set later.
+PluginRegistry::PluginRegistry(CephContext *cct)
+  : cct{cct}, loading{false}, disable_dlclose{false}
+{
+}
+
+// Delete every registered plugin and dlclose() its library, unless
+// disable_dlclose is set, in which case nothing is released.
+PluginRegistry::~PluginRegistry()
+{
+  if (disable_dlclose)
+    return;
+
+  for (auto& by_type : plugins) {
+    for (auto& by_name : by_type.second) {
+      Plugin *plugin = by_name.second;
+      // grab the handle first: it lives inside the object being deleted
+      void *library = plugin->library;
+      delete plugin;
+      dlclose(library);
+    }
+  }
+}
+
+// Drop (type, name): delete the plugin, dlclose() it; -ENOENT if not found.
+int PluginRegistry::remove(const std::string& type, const std::string& name)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+
+  auto by_type = plugins.find(type);
+  if (by_type == plugins.end())
+    return -ENOENT;
+  auto& named = by_type->second;
+  auto entry = named.find(name);
+  if (entry == named.end())
+    return -ENOENT;
+
+  ldout(cct, 1) << __func__ << " " << type << " " << name << dendl;
+  void *library = entry->second->library;
+  delete entry->second;
+  dlclose(library);
+  named.erase(entry);
+  if (named.empty())
+    plugins.erase(by_type);
+  return 0;
+}
+
+// Register plugin under (type, name); the caller must hold `lock`.
+// Returns 0, or -EEXIST when that slot is already taken.
+int PluginRegistry::add(const std::string& type, const std::string& name,
+			Plugin* plugin)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+  auto by_type = plugins.find(type);
+  if (by_type != plugins.end() && by_type->second.count(name) != 0)
+    return -EEXIST;
+  ldout(cct, 1) << __func__ << " " << type << " " << name
+		<< " " << plugin << dendl;
+  plugins[type][name] = plugin;
+  return 0;
+}
+
+// Look the plugin up while holding `lock`, loading it on demand.
+// Returns a null pointer when it is neither registered nor loadable.
+Plugin *PluginRegistry::get_with_load(const std::string& type,
+          const std::string& name)
+{
+  std::lock_guard l(lock);
+  if (Plugin *found = get(type, name))
+    return found;
+  if (load(type, name) != 0)
+    return nullptr;
+  return get(type, name);
+}
+
+// Return the plugin registered under (type, name), or a null pointer.
+// The caller must hold `lock`; the outcome is logged either way.
+Plugin *PluginRegistry::get(const std::string& type,
+			    const std::string& name)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+  Plugin *ret = 0;
+
+  // first level: plugin type
+  auto by_type = plugins.find(type);
+  if (by_type != plugins.end()) {
+    // second level: plugin name
+    auto by_name = by_type->second.find(name);
+    if (by_name != by_type->second.end())
+      ret = by_name->second;
+  }
+
+  ldout(cct, 1) << __func__ << " " << type << " " << name
+		<< " = " << ret << dendl;
+  return ret;
+}
+
+// Load plugin (type, name) from plugin_dir; 0 on success, else an error code.
+int PluginRegistry::load(const std::string &type,
+			 const std::string &name)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+  ldout(cct, 1) << __func__ << " " << type << " " << name << dendl;
+
+  const std::string dir = cct->_conf.get_val<std::string>("plugin_dir");
+  std::string fname = dir + "/" + type + "/" + PLUGIN_PREFIX
+      + name + PLUGIN_SUFFIX;
+  void *library = dlopen(fname.c_str(), RTLD_NOW);
+  if (!library) {
+    string err1(dlerror());
+    // fall back to plugin_dir
+    std::string fname2 = dir + "/" + PLUGIN_PREFIX + name + PLUGIN_SUFFIX;
+    library = dlopen(fname2.c_str(), RTLD_NOW);
+    if (!library) {
+      lderr(cct) << __func__
+		 << " failed dlopen(): \"" << err1.c_str()
+		 << "\" or \"" << dlerror() << "\""
+		 << dendl;
+      return -EIO;
+    }
+  }
+
+  const char * (*code_version)() =
+    (const char *(*)())dlsym(library, PLUGIN_VERSION_FUNCTION);
+  if (code_version == NULL) {
+    lderr(cct) << __func__ << " code_version == NULL: " << dlerror() << dendl;
+    dlclose(library);  // release the handle on this error path as well
+    return -EXDEV;
+  }
+  if (code_version() != string(CEPH_GIT_NICE_VER)) {
+    lderr(cct) << __func__ << " plugin " << fname << " version "
+	       << code_version() << " != expected "
+	       << CEPH_GIT_NICE_VER << dendl;
+    dlclose(library);
+    return -EXDEV;
+  }
+
+  int (*code_init)(CephContext *,
+		   const std::string& type,
+		   const std::string& name) =
+    (int (*)(CephContext *,
+	     const std::string& type,
+	     const std::string& name))dlsym(library, PLUGIN_INIT_FUNCTION);
+  if (code_init) {
+    int r = code_init(cct, type, name);
+    if (r != 0) {
+      lderr(cct) << __func__ << " " << fname << " "
+		 << PLUGIN_INIT_FUNCTION << "(" << cct
+		 << "," << type << "," << name << "): " << cpp_strerror(r)
+		 << dendl;
+      dlclose(library);
+      return r;
+    }
+  } else {
+    lderr(cct) << __func__ << " " << fname << " dlsym(" << PLUGIN_INIT_FUNCTION
+	       << "): " << dlerror() << dendl;
+    dlclose(library);
+    return -ENOENT;
+  }
+
+  Plugin *plugin = get(type, name);
+  if (plugin == 0) {
+    lderr(cct) << __func__ << " " << fname << " "
+	       << PLUGIN_INIT_FUNCTION << "()"
+	       << "did not register plugin type " << type << " name " << name
+	       << dendl;
+    dlclose(library);
+    return -EBADF;
+  }
+
+  plugin->library = library;
+
+  ldout(cct, 1) << __func__ << ": " << type << " " << name
+		<< " loaded and registered" << dendl;
+  return 0;
+}
+
+/*
+int ErasureCodePluginRegistry::preload(const std::string &plugins,
+				       const std::string &directory,
+				       ostream &ss)
+{
+  std::lock_guard l(lock);
+  list<string> plugins_list;
+  get_str_list(plugins, plugins_list);
+  for (list<string>::iterator i = plugins_list.begin();
+       i != plugins_list.end();
+       ++i) {
+    ErasureCodePlugin *plugin;
+    int r = load(*i, directory, &plugin, ss);
+    if (r)
+      return r;
+  }
+  return 0;
+}
+*/
diff --git a/src/common/PluginRegistry.h b/src/common/PluginRegistry.h
new file mode 100644
index 00000000..62020a96
--- /dev/null
+++ b/src/common/PluginRegistry.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#ifndef CEPH_COMMON_PLUGINREGISTRY_H
+#define CEPH_COMMON_PLUGINREGISTRY_H
+
+#include <map>
+#include <string>
+#include "common/ceph_mutex.h"
+
+class CephContext;
+
+/*
+ * Entry points a plugin shared library exports with C linkage.
+ * NOTE(review): presumably these are the symbols PluginRegistry::load()
+ * resolves with dlsym() -- confirm against the PLUGIN_*_FUNCTION macros
+ * in PluginRegistry.cc.
+ */
+extern "C" {
+  // Returns the version string the plugin was built with.
+  const char *__ceph_plugin_version();
+  // Initializes the plugin identified by (type, name) for the given
+  // context; returns an int status code.
+  int __ceph_plugin_init(CephContext *cct,
+			 const std::string& type,
+			 const std::string& name);
+}
+
+namespace ceph {
+
+  /**
+   * Base class for objects registered with a PluginRegistry.
+   *
+   * `library` starts out null; `cct` is the context passed to the
+   * constructor.  Both members are public and neither is released by
+   * this class.
+   */
+  class Plugin {
+  public:
+    void *library;      // opaque library handle; null until assigned
+    CephContext *cct;   // context passed at construction (not owned here)
+
+    explicit Plugin(CephContext *cct) : library(nullptr), cct(cct) {}
+    virtual ~Plugin() {}
+  };
+
+  /**
+   * Registry of Plugin objects addressed by a (type, name) pair.
+   *
+   * Only the declarations are visible here; see PluginRegistry.cc for
+   * the actual semantics of each method.
+   */
+  class PluginRegistry {
+  public:
+    CephContext *cct;
+    ceph::mutex lock = ceph::make_mutex("PluginRegistery::lock");
+    bool loading;
+    bool disable_dlclose;
+    // presumably keyed by plugin type, then plugin name -- TODO confirm
+    std::map<std::string,std::map<std::string,Plugin*> > plugins;
+
+    explicit PluginRegistry(CephContext *cct);
+    ~PluginRegistry();
+
+    // Register / unregister / look up a plugin by (type, name).
+    int add(const std::string& type, const std::string& name,
+	    Plugin *factory);
+    int remove(const std::string& type, const std::string& name);
+    Plugin *get(const std::string& type, const std::string& name);
+    // NOTE(review): by its name, looks up the plugin and loads it when it
+    // is not registered yet -- confirm in PluginRegistry.cc.
+    Plugin *get_with_load(const std::string& type, const std::string& name);
+
+    // Load the plugin for (type, name).  Returns 0 on success and a
+    // non-zero error code otherwise.
+    int load(const std::string& type,
+	     const std::string& name);
+    int preload();
+    int preload(const std::string& type);
+  };
+}
+
+#endif
diff --git a/src/common/Preforker.h b/src/common/Preforker.h
new file mode 100644
index 00000000..e92b2a44
--- /dev/null
+++ b/src/common/Preforker.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_COMMON_PREFORKER_H
+#define CEPH_COMMON_PREFORKER_H
+
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sstream>
+
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+/**
+ * pre-fork fork/daemonize helper class
+ *
+ * Hide the details of letting a process fork early, do a bunch of
+ * initialization work that may spam stdout or exit with an error, and
+ * then daemonize.  The exit() method will either exit directly (if we
+ * haven't forked) or pass a message to the parent with the error if
+ * we have.
+ */
+class Preforker {
+  pid_t childpid;  // fork() result; 0 in the child and before prefork()
+  bool forked;     // true once prefork() has forked successfully
+  int fd[2];  // parent's, child's
+
+public:
+  Preforker()
+    : childpid(0),
+      forked(false)
+  {}
+
+  /**
+   * Create the parent/child socketpair, ignore SIGHUP, and fork.
+   *
+   * On success both processes return 0 after closing the other side's
+   * end of the socketpair.  On failure the socketpair (if it was created)
+   * is closed again and the object stays un-forked.
+   *
+   * @param err set to a human-readable message on failure
+   * @return 0 on success, -1 with errno set on failure
+   */
+  int prefork(std::string &err) {
+    ceph_assert(!forked);
+    std::ostringstream oss;
+    int r = socketpair_cloexec(AF_UNIX, SOCK_STREAM, 0, fd);
+    if (r < 0) {
+      int e = errno;
+      oss << "[" << getpid() << "]: unable to create socketpair: " << cpp_strerror(e);
+      err = oss.str();
+      return (errno = e, -1);
+    }
+
+    struct sigaction sa;
+    sa.sa_handler = SIG_IGN;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = 0;
+    if (sigaction(SIGHUP, &sa, nullptr) != 0) {
+      int e = errno;
+      // don't leak the socketpair when we bail out
+      ::close(fd[0]);
+      ::close(fd[1]);
+      oss << "[" << getpid() << "]: unable to ignore SIGHUP: " << cpp_strerror(e);
+      err = oss.str();
+      return (errno = e, -1);
+    }
+
+    childpid = fork();
+    if (childpid < 0) {
+      int e = errno;
+      // no child exists: release the socketpair and stay un-forked
+      ::close(fd[0]);
+      ::close(fd[1]);
+      oss << "[" << getpid() << "]: unable to fork: " << cpp_strerror(e);
+      err = oss.str();
+      return (errno = e, -1);
+    }
+
+    // only mark ourselves forked once fork() has actually succeeded
+    forked = true;
+    if (is_child()) {
+      ::close(fd[0]);
+    } else {
+      ::close(fd[1]);
+    }
+    return 0;
+  }
+
+  // The child's end of the socketpair once forked, 0 otherwise.
+  int get_signal_fd() const {
+    return forked ? fd[1] : 0;
+  }
+
+  bool is_child() const {
+    return childpid == 0;
+  }
+
+  bool is_parent() const {
+    return childpid != 0;
+  }
+
+  /**
+   * Parent side: block until the child reports its outcome.
+   *
+   * Reads one int from the socketpair.  A value of -1 means the child
+   * daemonized: stdin/stdout/stderr are closed and 0 is returned.  A read
+   * error is returned as-is.  Any other value means the child is exiting,
+   * so it is reaped with waitpid().
+   *
+   * @param err_msg set to a description of the failure (empty on success)
+   * @return 0 on success; the read error; -1 if waitpid() failed; the
+   *         child's exit status if it exited normally; otherwise
+   *         waitpid()'s (non-zero) return value
+   */
+  int parent_wait(std::string &err_msg) {
+    ceph_assert(forked);
+
+    int r = -1;
+    std::ostringstream oss;
+    int err = safe_read_exact(fd[0], &r, sizeof(r));
+    if (err == 0 && r == -1) {
+      // daemonize
+      ::close(0);
+      ::close(1);
+      ::close(2);
+    } else if (err) {
+      oss << "[" << getpid() << "]: " << cpp_strerror(err);
+    } else {
+      // wait for child to exit
+      int status;
+      err = waitpid(childpid, &status, 0);
+      if (err < 0) {
+        // waitpid() reports the cause in errno, not in its return value
+        int e = errno;
+        oss << "[" << getpid() << "]" << " waitpid error: " << cpp_strerror(e);
+      } else if (WIFSIGNALED(status)) {
+        oss << "[" << getpid() << "]" << " exited with a signal";
+      } else if (!WIFEXITED(status)) {
+        oss << "[" << getpid() << "]" << " did not exit normally";
+      } else {
+        err = WEXITSTATUS(status);
+        if (err != 0)
+         oss << "[" << getpid() << "]" << " returned exit_status " << cpp_strerror(err);
+      }
+    }
+    err_msg = oss.str();
+    return err;
+  }
+
+  // Send the exit code r to the parent (if forked) and return r.
+  int signal_exit(int r) {
+    if (forked) {
+      /* If we get an error here, it's too late to do anything reasonable about it. */
+      [[maybe_unused]] auto n = safe_write(fd[1], &r, sizeof(r));
+    }
+    return r;
+  }
+
+  // Report r to the parent when running as the child, then exit with r.
+  void exit(int r) {
+    if (is_child())
+        signal_exit(r);
+    ::exit(r);
+  }
+
+  // Send the parent the -1 value that parent_wait() treats as "daemonize".
+  void daemonize() {
+    ceph_assert(forked);
+    const int r = -1;
+    /* Best effort: there is nothing useful to do if this write fails. */
+    [[maybe_unused]] auto n = ::write(fd[1], &r, sizeof(r));
+  }
+
+};
+
+#endif
diff --git a/src/common/PrioritizedQueue.h b/src/common/PrioritizedQueue.h
new file mode 100644
index 00000000..6d7de129
--- /dev/null
+++ b/src/common/PrioritizedQueue.h
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef PRIORITY_QUEUE_H
+#define PRIORITY_QUEUE_H
+
+#include "include/ceph_assert.h"
+
+#include "common/Formatter.h"
+#include "common/OpQueue.h"
+
+/**
+ * Manages queue for normal and strict priority items
+ *
+ * On dequeue, the queue will select the lowest priority queue
+ * such that the q has bucket > cost of front queue item.
+ *
+ * If there is no such queue, we choose the next queue item for
+ * the highest priority queue.
+ *
+ * Before returning a dequeued item, we place into each bucket
+ * cost * (priority/total_priority) tokens.
+ *
+ * enqueue_strict and enqueue_strict_front queue items into queues
+ * which are serviced in strict priority order before items queued
+ * with enqueue and enqueue_front
+ *
+ * Within a priority class, we schedule round robin based on the class
+ * of type K used to enqueue items.  e.g. you could use entity_inst_t
+ * to provide fairness for different clients.
+ */
+template <typename T, typename K>
+class PrioritizedQueue : public OpQueue <T, K> {
+  int64_t total_priority;           // sum of the priorities of all subqueues in `queue`
+  int64_t max_tokens_per_subqueue;  // token cap handed to each new subqueue
+  int64_t min_cost;                 // lower clamp applied to the cost of enqueued items
+
+  typedef std::list<std::pair<unsigned, T> > ListPairs;
+
+  /**
+   * All items of one priority: a (cost, item) list per class K, served
+   * round robin across classes via `cur`, plus a token bucket.
+   */
+  struct SubQueue {
+  private:
+    typedef std::map<K, ListPairs> Classes;
+    Classes q;
+    unsigned tokens, max_tokens;
+    int64_t size;
+    typename Classes::iterator cur;  // class to serve next; q.end() only while q is empty
+  public:
+    // Copies start over at the first class; `cur` must point into our own map.
+    SubQueue(const SubQueue &other)
+      : q(other.q),
+        tokens(other.tokens),
+        max_tokens(other.max_tokens),
+        size(other.size),
+        cur(q.begin()) {}
+    SubQueue()
+      : tokens(0),
+        max_tokens(0),
+        size(0), cur(q.begin()) {}
+    void set_max_tokens(unsigned mt) {
+      max_tokens = mt;
+    }
+    unsigned get_max_tokens() const {
+      return max_tokens;
+    }
+    unsigned num_tokens() const {
+      return tokens;
+    }
+    // Add t tokens, saturating at max_tokens (never wraps around).
+    void put_tokens(unsigned t) {
+      if (tokens >= max_tokens || t > max_tokens - tokens) {
+        tokens = max_tokens;
+      } else {
+        tokens += t;
+      }
+    }
+    // Remove t tokens, saturating at 0.
+    void take_tokens(unsigned t) {
+      if (tokens > t) {
+        tokens -= t;
+      } else {
+        tokens = 0;
+      }
+    }
+    void enqueue(K cl, unsigned cost, T &&item) {
+      q[cl].push_back(std::make_pair(cost, std::move(item)));
+      if (cur == q.end())
+        cur = q.begin();
+      size++;
+    }
+    void enqueue_front(K cl, unsigned cost, T &&item) {
+      q[cl].push_front(std::make_pair(cost, std::move(item)));
+      if (cur == q.end())
+        cur = q.begin();
+      size++;
+    }
+    // (cost, item) pair that pop_front() would remove next.
+    std::pair<unsigned, T> &front() const {
+      ceph_assert(!(q.empty()));
+      ceph_assert(cur != q.end());
+      return cur->second.front();
+    }
+    // Remove and return the next item, then advance to the following class.
+    T pop_front() {
+      ceph_assert(!(q.empty()));
+      ceph_assert(cur != q.end());
+      T ret = std::move(cur->second.front().second);
+      cur->second.pop_front();
+      if (cur->second.empty()) {
+        q.erase(cur++);
+      } else {
+        ++cur;
+      }
+      if (cur == q.end()) {
+        cur = q.begin();
+      }
+      size--;
+      return ret;
+    }
+    unsigned length() const {
+      ceph_assert(size >= 0);
+      return (unsigned)size;
+    }
+    bool empty() const {
+      return q.empty();
+    }
+    // Drop every item of class k; when `out` is given the items are
+    // prepended to it in their queued order.
+    void remove_by_class(K k, std::list<T> *out) {
+      typename Classes::iterator i = q.find(k);
+      if (i == q.end()) {
+        return;
+      }
+      size -= i->second.size();
+      if (i == cur) {
+        // step off the entry we are about to erase
+        ++cur;
+      }
+      if (out) {
+        for (typename ListPairs::reverse_iterator j =
+               i->second.rbegin();
+             j != i->second.rend();
+             ++j) {
+          out->push_front(std::move(j->second));
+        }
+      }
+      q.erase(i);
+      if (cur == q.end()) {
+        cur = q.begin();
+      }
+    }
+
+    void dump(ceph::Formatter *f) const {
+      f->dump_int("tokens", tokens);
+      f->dump_int("max_tokens", max_tokens);
+      f->dump_int("size", size);
+      f->dump_int("num_keys", q.size());
+      if (!empty()) {
+        f->dump_int("first_item_cost", front().first);
+      }
+    }
+  };
+
+  typedef std::map<unsigned, SubQueue> SubQueues;
+  SubQueues high_queue;  // strict-priority items, keyed by priority
+  SubQueues queue;       // token-scheduled items, keyed by priority
+
+  // Return the subqueue for `priority`, creating and accounting for it
+  // in total_priority if it does not exist yet.
+  SubQueue *create_queue(unsigned priority) {
+    typename SubQueues::iterator p = queue.find(priority);
+    if (p != queue.end()) {
+      return &p->second;
+    }
+    total_priority += priority;
+    SubQueue *sq = &queue[priority];
+    sq->set_max_tokens(max_tokens_per_subqueue);
+    return sq;
+  }
+
+  void remove_queue(unsigned priority) {
+    ceph_assert(queue.count(priority));
+    queue.erase(priority);
+    total_priority -= priority;
+    ceph_assert(total_priority >= 0);
+  }
+
+  // Give every subqueue cost * priority / total_priority (+1) tokens.
+  void distribute_tokens(unsigned cost) {
+    if (total_priority == 0) {
+      return;
+    }
+    for (typename SubQueues::iterator i = queue.begin();
+         i != queue.end();
+         ++i) {
+      // Multiply in 64 bits: priority * cost can exceed the range of
+      // unsigned before the division brings it back down.
+      const uint64_t share =
+        (static_cast<uint64_t>(i->first) * cost) / total_priority;
+      i->second.put_tokens(static_cast<unsigned>(share) + 1);
+    }
+  }
+
+public:
+  PrioritizedQueue(unsigned max_per, unsigned min_c)
+    : total_priority(0),
+      max_tokens_per_subqueue(max_per),
+      min_cost(min_c)
+  {}
+
+  // Total number of queued items (normal and strict).
+  unsigned length() const {
+    unsigned total = 0;
+    for (typename SubQueues::const_iterator i = queue.begin();
+         i != queue.end();
+         ++i) {
+      ceph_assert(i->second.length());
+      total += i->second.length();
+    }
+    for (typename SubQueues::const_iterator i = high_queue.begin();
+         i != high_queue.end();
+         ++i) {
+      ceph_assert(i->second.length());
+      total += i->second.length();
+    }
+    return total;
+  }
+
+  // Remove every item enqueued under class k, dropping subqueues that
+  // become empty; removed items are handed back through `out` if given.
+  void remove_by_class(K k, std::list<T> *out = nullptr) final {
+    for (typename SubQueues::iterator i = queue.begin();
+         i != queue.end();
+         ) {
+      i->second.remove_by_class(k, out);
+      if (i->second.empty()) {
+        // advance before remove_queue() erases the current entry
+        unsigned priority = i->first;
+        ++i;
+        remove_queue(priority);
+      } else {
+        ++i;
+      }
+    }
+    for (typename SubQueues::iterator i = high_queue.begin();
+         i != high_queue.end();
+         ) {
+      i->second.remove_by_class(k, out);
+      if (i->second.empty()) {
+        high_queue.erase(i++);
+      } else {
+        ++i;
+      }
+    }
+  }
+
+  void enqueue_strict(K cl, unsigned priority, T&& item) final {
+    high_queue[priority].enqueue(cl, 0, std::move(item));
+  }
+
+  void enqueue_strict_front(K cl, unsigned priority, T&& item) final {
+    high_queue[priority].enqueue_front(cl, 0, std::move(item));
+  }
+
+  // cost is clamped to [min_cost, max_tokens_per_subqueue] before queueing.
+  void enqueue(K cl, unsigned priority, unsigned cost, T&& item) final {
+    if (cost < min_cost)
+      cost = min_cost;
+    if (cost > max_tokens_per_subqueue)
+      cost = max_tokens_per_subqueue;
+    create_queue(priority)->enqueue(cl, cost, std::move(item));
+  }
+
+  // Same clamping as enqueue(), but the item goes to the front of its class.
+  void enqueue_front(K cl, unsigned priority, unsigned cost, T&& item) final {
+    if (cost < min_cost)
+      cost = min_cost;
+    if (cost > max_tokens_per_subqueue)
+      cost = max_tokens_per_subqueue;
+    create_queue(priority)->enqueue_front(cl, cost, std::move(item));
+  }
+
+  bool empty() const final {
+    ceph_assert(total_priority >= 0);
+    ceph_assert((total_priority == 0) || !(queue.empty()));
+    return queue.empty() && high_queue.empty();
+  }
+
+  // Remove and return the next item; the queue must not be empty.
+  T dequeue() final {
+    ceph_assert(!empty());
+
+    // strict items always go first, highest priority first
+    if (!(high_queue.empty())) {
+      T ret = std::move(high_queue.rbegin()->second.front().second);
+      high_queue.rbegin()->second.pop_front();
+      if (high_queue.rbegin()->second.empty()) {
+        high_queue.erase(high_queue.rbegin()->first);
+      }
+      return ret;
+    }
+
+    // if there are multiple buckets/subqueues with sufficient tokens,
+    // we behave like a strict priority queue among all subqueues that
+    // are eligible to run.
+    for (typename SubQueues::iterator i = queue.begin();
+         i != queue.end();
+         ++i) {
+      ceph_assert(!(i->second.empty()));
+      if (i->second.front().first < i->second.num_tokens()) {
+        unsigned cost = i->second.front().first;
+        i->second.take_tokens(cost);
+        T ret = std::move(i->second.front().second);
+        i->second.pop_front();
+        if (i->second.empty()) {
+          // invalidates i, which is not used again before returning
+          remove_queue(i->first);
+        }
+        distribute_tokens(cost);
+        return ret;
+      }
+    }
+
+    // if no subqueues have sufficient tokens, we behave like a strict
+    // priority queue.
+    unsigned cost = queue.rbegin()->second.front().first;
+    T ret = std::move(queue.rbegin()->second.front().second);
+    queue.rbegin()->second.pop_front();
+    if (queue.rbegin()->second.empty()) {
+      remove_queue(queue.rbegin()->first);
+    }
+    distribute_tokens(cost);
+    return ret;
+  }
+
+  void dump(ceph::Formatter *f) const final {
+    f->dump_int("total_priority", total_priority);
+    f->dump_int("max_tokens_per_subqueue", max_tokens_per_subqueue);
+    f->dump_int("min_cost", min_cost);
+    f->open_array_section("high_queues");
+    for (typename SubQueues::const_iterator p = high_queue.begin();
+         p != high_queue.end();
+         ++p) {
+      f->open_object_section("subqueue");
+      f->dump_int("priority", p->first);
+      p->second.dump(f);
+      f->close_section();
+    }
+    f->close_section();
+    f->open_array_section("queues");
+    for (typename SubQueues::const_iterator p = queue.begin();
+         p != queue.end();
+         ++p) {
+      f->open_object_section("subqueue");
+      f->dump_int("priority", p->first);
+      p->second.dump(f);
+      f->close_section();
+    }
+    f->close_section();
+  }
+};
+
+#endif
diff --git a/src/common/PriorityCache.cc b/src/common/PriorityCache.cc
new file mode 100644
index 00000000..bb4366b6
--- /dev/null
+++ b/src/common/PriorityCache.cc
@@ -0,0 +1,398 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "PriorityCache.h"
+#include "common/dout.h"
+#include "perfglue/heap_profiler.h"
+#define dout_context cct
+#define dout_subsys ceph_subsys_prioritycache
+#undef dout_prefix
+#define dout_prefix *_dout << "prioritycache "
+
+namespace PriorityCache
+{
+  /**
+   * Return `usage` plus 16 chunks of headroom, rounded up to a whole
+   * number of chunks.
+   *
+   * The chunk size is 1/256 of `total_bytes` rounded up to a power of 2,
+   * bounded to the range [4MB, 16MB].
+   */
+  int64_t get_chunk(uint64_t usage, uint64_t total_bytes)
+  {
+    const uint64_t min_chunk = 4ul*1024*1024;
+    const uint64_t max_chunk = 16ul*1024*1024;
+
+    // Round total_bytes up to a power of 2: smear the highest set bit of
+    // (total_bytes - 1) into every lower bit, then add one.
+    uint64_t rounded = total_bytes - 1;
+    for (unsigned shift = 1; shift <= 32; shift <<= 1) {
+      rounded |= rounded >> shift;
+    }
+    rounded += 1;
+
+    // A chunk is 1/256 of the rounded up cache size, kept within bounds.
+    uint64_t chunk = rounded / 256;
+    if (chunk <= min_chunk) {
+      chunk = min_chunk;
+    }
+    if (chunk >= max_chunk) {
+      chunk = max_chunk;
+    }
+
+    /* Add 16 chunks of headroom and round up to the near chunk.  Note that
+     * if RocksDB is used, it's a good idea to have N MB of headroom where
+     * N is the target_file_size_base value.  RocksDB will read SST files
+     * into the block cache during compaction which potentially can force out
+     * all existing cached data.  Once compaction is finished, the SST data is
+     * released leaving an empty cache.  Having enough headroom to absorb
+     * compaction reads allows the kv cache grow even during extremely heavy
+     * compaction workloads.
+     */
+    uint64_t padded = usage + (16 * chunk);
+    const uint64_t remainder = padded % chunk;
+    if (remainder > 0) {
+      padded += chunk - remainder;
+    }
+    return padded;
+  }
+
+  /**
+   * Set up the manager's memory bounds, register the process-wide
+   * "prioritycache" perf counters, and run an initial tune_memory().
+   *
+   * The tuned memory size starts out at `min`.
+   */
+  Manager::Manager(CephContext *c,
+                   uint64_t min,
+                   uint64_t max,
+                   uint64_t target,
+                   bool reserve_extra) :
+      cct(c),
+      caches{},
+      min_mem(min),
+      max_mem(max),
+      target_mem(target),
+      tuned_mem(min),
+      reserve_extra(reserve_extra)
+  {
+    PerfCountersBuilder builder(cct, "prioritycache",
+                                MallocStats::M_FIRST, MallocStats::M_LAST);
+
+    // Every counter here is a byte count with the same display priority.
+    auto add_bytes_counter = [&builder](int key, const char *name,
+                                        const char *description,
+                                        const char *nick) {
+      builder.add_u64(key, name, description, nick,
+                      PerfCountersBuilder::PRIO_INTERESTING,
+                      unit_t(UNIT_BYTES));
+    };
+
+    add_bytes_counter(MallocStats::M_TARGET_BYTES, "target_bytes",
+                      "target process memory usage in bytes", "t");
+    add_bytes_counter(MallocStats::M_MAPPED_BYTES, "mapped_bytes",
+                      "total bytes mapped by the process", "m");
+    add_bytes_counter(MallocStats::M_UNMAPPED_BYTES, "unmapped_bytes",
+                      "unmapped bytes that the kernel has yet to reclaimed", "u");
+    add_bytes_counter(MallocStats::M_HEAP_BYTES, "heap_bytes",
+                      "aggregate bytes in use by the heap", "h");
+    add_bytes_counter(MallocStats::M_CACHE_BYTES, "cache_bytes",
+                      "current memory available for caches.", "c");
+
+    logger = builder.create_perf_counters();
+    cct->get_perfcounters_collection()->add(logger);
+
+    tune_memory();
+  }
+
+  // Drop all registered caches and their counters, then unregister and
+  // free the manager's own perf counters.
+  Manager::~Manager()
+  {
+    clear();
+
+    auto *collection = cct->get_perfcounters_collection();
+    collection->remove(logger);
+    delete logger;
+  }
+
+  /**
+   * Recompute tuned_mem from the heap statistics.
+   *
+   * Releases free heap memory, reads the heap size and unmapped byte
+   * count, and moves tuned_mem (clamped to [min_mem, max_mem]) towards
+   * max_mem while mapped memory is below target_mem, or towards min_mem
+   * otherwise.  The malloc perf counters are updated with the result.
+   */
+  void Manager::tune_memory()
+  {
+    size_t heap_size = 0;
+    size_t unmapped = 0;
+    uint64_t mapped = 0;
+
+    ceph_heap_release_free_memory();
+    ceph_heap_get_numeric_property("generic.heap_size", &heap_size);
+    ceph_heap_get_numeric_property("tcmalloc.pageheap_unmapped_bytes", &unmapped);
+    mapped = heap_size - unmapped;
+
+    uint64_t new_size = tuned_mem;
+    new_size = (new_size < max_mem) ? new_size : max_mem;
+    new_size = (new_size > min_mem) ? new_size : min_mem;
+
+    // Approach the min/max slowly, but bounce away quickly.
+    if (mapped < target_mem) {
+      double ratio = 1 - ((double) mapped / target_mem);
+      new_size += ratio * (max_mem - new_size);
+    } else if (mapped > 0) {
+      double ratio = 1 - ((double) target_mem / mapped);
+      new_size -= ratio * (new_size - min_mem);
+    }
+    // Otherwise mapped and target_mem are both 0: there is nothing to
+    // scale against (and 0/0 would yield NaN), so keep the clamped size.
+
+    ldout(cct, 5) << __func__
+                  << " target: " << target_mem
+                  << " mapped: " << mapped
+                  << " unmapped: " << unmapped
+                  << " heap: " << heap_size
+                  << " old mem: " << tuned_mem
+                  << " new mem: " << new_size << dendl;
+
+    tuned_mem = new_size;
+
+    logger->set(MallocStats::M_TARGET_BYTES, target_mem);
+    logger->set(MallocStats::M_MAPPED_BYTES, mapped);
+    logger->set(MallocStats::M_UNMAPPED_BYTES, unmapped);
+    logger->set(MallocStats::M_HEAP_BYTES, heap_size);
+    logger->set(MallocStats::M_CACHE_BYTES, new_size);
+  }
+
+  /**
+   * Register cache `c` under `name`; the name must not be in use yet.
+   *
+   * When enable_perf_counters is set, a "prioritycache:<name>" perf
+   * counter set is also created with one counter per priority plus the
+   * reserved and committed byte counts, and the counter keys are recorded
+   * in indexes[name].
+   */
+  void Manager::insert(const std::string& name, std::shared_ptr<PriCache> c,
+                       bool enable_perf_counters)
+  {
+    ceph_assert(!caches.count(name));
+    ceph_assert(!indexes.count(name));
+
+    caches.emplace(name, c);
+
+    if (!enable_perf_counters) {
+      return;
+    }
+
+    // TODO: If we ever assign more than
+    // PERF_COUNTER_MAX_BOUND - PERF_COUNTER_LOWER_BOUND perf counters for
+    // priority caching we could run out of slots.  Recycle them some day?
+    // Also note that start and end are *exclusive*.
+    // After the post-increment cur_index is start + 1, i.e. the key of the
+    // first counter registered below.
+    int start = cur_index++;
+    int end = cur_index + Extra::E_LAST + 1;
+
+    ceph_assert(end < PERF_COUNTER_MAX_BOUND);
+    indexes.emplace(name, std::vector<int>(Extra::E_LAST + 1));
+
+    PerfCountersBuilder b(cct, "prioritycache:" + name, start, end);
+
+    b.add_u64(cur_index + Priority::PRI0, "pri0_bytes",
+              "bytes allocated to pri0", "p0",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI1, "pri1_bytes",
+              "bytes allocated to pri1", "p1",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI2, "pri2_bytes",
+              "bytes allocated to pri2", "p2",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI3, "pri3_bytes",
+              "bytes allocated to pri3", "p3",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI4, "pri4_bytes",
+              "bytes allocated to pri4", "p4",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI5, "pri5_bytes",
+              "bytes allocated to pri5", "p5",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI6, "pri6_bytes",
+              "bytes allocated to pri6", "p6",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI7, "pri7_bytes",
+              "bytes allocated to pri7", "p7",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI8, "pri8_bytes",
+              "bytes allocated to pri8", "p8",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI9, "pri9_bytes",
+              "bytes allocated to pri9", "p9",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI10, "pri10_bytes",
+              "bytes allocated to pri10", "p10",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Priority::PRI11, "pri11_bytes",
+              "bytes allocated to pri11", "p11",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Extra::E_RESERVED, "reserved_bytes",
+              "bytes reserved for future growth.", "r",
+              PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
+
+    b.add_u64(cur_index + Extra::E_COMMITTED, "committed_bytes",
+              "total bytes committed,", "c",
+              PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
+
+    // Remember the counter key for every Priority / Extra slot so that
+    // balance() can look it up by enum value.
+    for (int i = 0; i < Extra::E_LAST+1; i++) {
+      indexes[name][i] = cur_index + i;
+    }
+
+    auto l = b.create_perf_counters();
+    loggers.emplace(name, l);
+    cct->get_perfcounters_collection()->add(l);
+
+    // The next insert() starts at this set's exclusive upper bound.
+    cur_index = end;
+  }
+
+  // Forget the cache registered under `name`, unregistering and freeing
+  // its perf counters when it has any.
+  void Manager::erase(const std::string& name)
+  {
+    if (auto found = loggers.find(name); found != loggers.end()) {
+      auto counters = found->second;
+      cct->get_perfcounters_collection()->remove(counters);
+      delete counters;
+      loggers.erase(found);
+    }
+
+    indexes.erase(name);
+    caches.erase(name);
+  }
+
+  // Unregister and free every per-cache perf counter set, then forget
+  // all indexes and caches.
+  void Manager::clear()
+  {
+    for (auto &entry : loggers) {
+      cct->get_perfcounters_collection()->remove(entry.second);
+      delete entry.second;
+    }
+    loggers.clear();
+
+    indexes.clear();
+    caches.clear();
+  }
+
+  /**
+   * Distribute tuned_mem among the registered caches.
+   *
+   * Memory is handed out priority by priority via balance_priority(),
+   * then every cache is asked to commit its new size.  Perf counters are
+   * updated for the caches that have them.
+   */
+  void Manager::balance()
+  {
+    int64_t mem_avail = tuned_mem;
+    // Each cache is going to get a little extra from get_chunk, so shrink the
+    // available memory here to compensate.
+    if (reserve_extra) {
+      mem_avail -= get_chunk(1, tuned_mem) * caches.size();
+    }
+
+    if (mem_avail < 0) {
+      // There's so little memory available that just assigning a chunk per
+      // cache pushes us over the limit. Set mem_avail to 0 and continue to
+      // ensure each priority's byte counts are zeroed in balance_priority.
+      mem_avail = 0;
+    }
+
+    // Assign memory for each priority level
+    for (int i = 0; i < Priority::LAST+1; i++) {
+      ldout(cct, 10) << __func__ << " assigning cache bytes for PRI: " << i << dendl;
+
+      auto pri = static_cast<Priority>(i);
+      balance_priority(&mem_avail, pri);
+
+      // Update the per-priority perf counters
+      for (auto &l : loggers) {
+        auto it = caches.find(l.first);
+        ceph_assert(it != caches.end());
+
+        auto bytes = it->second->get_cache_bytes(pri);
+        l.second->set(indexes[it->first][pri], bytes);
+      }
+    }
+    // assert if we assigned more memory than is available.
+    ceph_assert(mem_avail >= 0);
+
+    // Commit the new size for every cache, not only for those that were
+    // registered with perf counters.
+    for (auto &c : caches) {
+      int64_t committed = c.second->commit_cache_size(tuned_mem);
+
+      auto li = loggers.find(c.first);
+      if (li == loggers.end()) {
+        // no perf counters were requested for this cache
+        continue;
+      }
+
+      // Update the perf counters
+      int64_t alloc = c.second->get_cache_bytes();
+
+      li->second->set(indexes[c.first][Extra::E_RESERVED], committed - alloc);
+      li->second->set(indexes[c.first][Extra::E_COMMITTED], committed);
+    }
+  }
+
+  /**
+   * Hand out memory for a single priority level.
+   *
+   * Every cache's bytes for `pri` are zeroed first.  Then, in rounds,
+   * each cache that still wants memory is offered a share of *mem_avail
+   * proportional to its cache ratio; caches that are satisfied drop out
+   * of later rounds.  *mem_avail is reduced by whatever gets assigned.
+   */
+  void Manager::balance_priority(int64_t *mem_avail, Priority pri)
+  {
+    // Working copy: caches are erased from it once they are satisfied.
+    std::unordered_map<std::string, std::shared_ptr<PriCache>> tmp_caches = caches;
+    double cur_ratios = 0;
+    double new_ratios = 0;
+    uint64_t round = 0;
+
+    // First, zero this priority's bytes, sum the initial ratios.
+    for (auto it = caches.begin(); it != caches.end(); it++) {
+      it->second->set_cache_bytes(pri, 0);
+      cur_ratios += it->second->get_cache_ratio();
+    }
+
+    // Loop until caches are satisfied or we run out of memory (stop if we
+    // can't guarantee a full byte allocation).
+    while (!tmp_caches.empty() && *mem_avail > static_cast<int64_t>(tmp_caches.size())) {
+      uint64_t total_assigned = 0;
+      for (auto it = tmp_caches.begin(); it != tmp_caches.end();) {
+        int64_t cache_wants = it->second->request_cache_bytes(pri, tuned_mem);
+        // Usually the ratio should be set to the fraction of the current caches'
+        // assigned ratio compared to the total ratio of all caches that still
+        // want memory.  There is a special case where the only caches left are
+        // all assigned 0% ratios but still want memory.  In that case, give
+        // them an equal shot at the remaining memory for this priority.
+        double ratio = 1.0 / tmp_caches.size();
+        if (cur_ratios > 0) {
+          ratio = it->second->get_cache_ratio() / cur_ratios;
+        }
+        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
+
+        ldout(cct, 10) << __func__ << " " << it->first
+                       << " pri: " << (int) pri
+                       << " round: " << round
+                       << " wanted: " << cache_wants
+                       << " ratio: " << it->second->get_cache_ratio()
+                       << " cur_ratios: " << cur_ratios
+                       << " fair_share: " << fair_share
+                       << " mem_avail: " << *mem_avail
+                       << dendl;
+
+        if (cache_wants > fair_share) {
+          // If we want too much, take what we can get but stick around for more
+          it->second->add_cache_bytes(pri, fair_share);
+          total_assigned += fair_share;
+          new_ratios += it->second->get_cache_ratio();
+          ++it;
+        } else {
+          // Otherwise assign only what we want
+          if (cache_wants > 0) {
+            it->second->add_cache_bytes(pri, cache_wants);
+            total_assigned += cache_wants;
+          }
+          // Either the cache didn't want anything or got what it wanted, so
+          // remove it from the tmp list.
+          it = tmp_caches.erase(it);
+        }
+      }
+      // Account for this round's assignments and reset the ratios so the
+      // next round only considers the caches that still want memory.
+      *mem_avail -= total_assigned;
+      cur_ratios = new_ratios;
+      new_ratios = 0;
+      ++round;
+    }
+
+    // If this is the last priority, divide up any remaining memory based
+    // solely on the ratios.
+    // NOTE(review): set_cache_bytes() replaces whatever the rounds above
+    // added for Priority::LAST while *mem_avail was already reduced by
+    // it -- confirm that caches never request bytes at the last priority.
+    if (pri == Priority::LAST) {
+      uint64_t total_assigned = 0;
+      for (auto it = caches.begin(); it != caches.end(); it++) {
+        double ratio = it->second->get_cache_ratio();
+        int64_t fair_share = static_cast<int64_t>(*mem_avail * ratio);
+        it->second->set_cache_bytes(Priority::LAST, fair_share);
+        total_assigned += fair_share;
+      }
+      *mem_avail -= total_assigned;
+      return;
+    }
+  }
+
+  // Out-of-line definition of the interface's virtual destructor.
+  PriCache::~PriCache() = default;
+}
diff --git a/src/common/PriorityCache.h b/src/common/PriorityCache.h
new file mode 100644
index 00000000..6ac60702
--- /dev/null
+++ b/src/common/PriorityCache.h
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_PRIORITY_CACHE_H
+#define CEPH_PRIORITY_CACHE_H
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+#include <memory>
+#include <unordered_map>
+#include "common/perf_counters.h"
+#include "include/ceph_assert.h"
+
+namespace PriorityCache {
+  // Reserve 16384 slots for PriorityCache perf counters
+  // (1073741824 == 2^30, 1073758208 == 2^30 + 16384).
+  const int PERF_COUNTER_LOWER_BOUND = 1073741824;
+  const int PERF_COUNTER_MAX_BOUND = 1073758208;
+
+  // Perf counter slots for the malloc stats.  They start at the bottom of
+  // the reserved range; Manager::cur_index starts handing out further slots
+  // at M_LAST.
+  enum MallocStats {
+    M_FIRST = PERF_COUNTER_LOWER_BOUND,
+    M_TARGET_BYTES,
+    M_MAPPED_BYTES,
+    M_UNMAPPED_BYTES,
+    M_HEAP_BYTES,
+    M_CACHE_BYTES,
+    M_LAST,
+  };
+
+  // Priority levels used when assigning memory to caches.  LAST is an alias
+  // for PRI11 (not an extra enumerator); Manager::balance_priority() divides
+  // any remaining memory by cache ratio when it is called with LAST.
+  enum Priority {
+    PRI0,
+    PRI1,
+    PRI2,
+    PRI3,
+    PRI4,
+    PRI5,
+    PRI6,
+    PRI7,
+    PRI8,
+    PRI9,
+    PRI10,
+    PRI11,
+    LAST = PRI11,
+  };
+
+  // Extra indexes numbered directly after the priorities.
+  // NOTE(review): presumably per-cache perf counter slots for reserved and
+  // committed bytes -- confirm against the Manager implementation.
+  enum Extra {
+    E_RESERVED = Priority::LAST+1,
+    E_COMMITTED,
+    E_LAST = E_COMMITTED,
+  };
+
+  int64_t get_chunk(uint64_t usage, uint64_t total_bytes);
+
+  // Abstract interface implemented by a cache that wants its memory to be
+  // assigned by a PriorityCache::Manager.  The manager holds caches through
+  // std::shared_ptr<PriCache> and drives them via the per-priority byte
+  // accessors and the cache ratio declared here.
+  struct PriCache {
+    virtual ~PriCache();
+
+    /* Ask the cache to request memory for the given priority. Note that the
+     * cache may ultimately be allocated less memory than it requests here.
+     */
+    virtual int64_t request_cache_bytes(PriorityCache::Priority pri, uint64_t total_cache) const = 0;
+
+    // Get the number of bytes currently allocated to the given priority.
+    virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const = 0;
+
+    // Get the number of bytes currently allocated to all priorities.
+    virtual int64_t get_cache_bytes() const = 0;
+
+    // Allocate bytes for a given priority.
+    virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) = 0;
+
+    // Allocate additional bytes for a given priority.
+    virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) = 0;
+
+    /* Commit the current number of bytes allocated to the cache.  Space is
+     * allocated in chunks based on the allocation size and current total size
+     * of memory available for caches. */
+    virtual int64_t commit_cache_size(uint64_t total_cache) = 0;
+
+    /* Get the current number of bytes allocated to the cache. This may be
+     * larger than the value returned by get_cache_bytes as it includes extra
+     * space for future growth. */
+    virtual int64_t get_committed_size() const = 0;
+
+    // Get the ratio of available memory this cache should target.
+    virtual double get_cache_ratio() const = 0;
+
+    // Set the ratio of available memory this cache should target.
+    virtual void set_cache_ratio(double ratio) = 0;
+
+    // Get the name of this cache.
+    virtual std::string get_cache_name() const = 0;
+  };
+
+  // Owns a set of named PriCache instances and distributes memory between
+  // them, priority by priority (see balance_priority()).
+  class Manager {
+    CephContext* cct = nullptr;
+    // Every data member carries a default initializer so that no constructor
+    // path can leave one indeterminate.
+    PerfCounters* logger = nullptr;
+    // Per-cache state, keyed by the name passed to insert().
+    std::unordered_map<std::string, PerfCounters*> loggers;
+    std::unordered_map<std::string, std::vector<int>> indexes;
+    std::unordered_map<std::string, std::shared_ptr<PriCache>> caches;
+
+    // Start perf counter slots after the malloc stats.
+    int cur_index = MallocStats::M_LAST;
+
+    uint64_t min_mem = 0;
+    uint64_t max_mem = 0;
+    uint64_t target_mem = 0;
+    uint64_t tuned_mem = 0;
+    bool reserve_extra = false;
+
+  public:
+    // NOTE(review): the constructor is defined in the .cc file; it is assumed
+    // to store min/max/target in min_mem/max_mem/target_mem -- confirm there.
+    Manager(CephContext *c, uint64_t min, uint64_t max, uint64_t target,
+            bool reserve_extra);
+    ~Manager();
+    // Replace the stored minimum memory value.
+    void set_min_memory(uint64_t min) {
+      min_mem = min;
+    }
+    // Replace the stored maximum memory value.
+    void set_max_memory(uint64_t max) {
+      max_mem = max;
+    }
+    // Replace the stored target memory value.
+    void set_target_memory(uint64_t target) {
+      target_mem = target;
+    }
+    // Return the current value of tuned_mem.
+    uint64_t get_tuned_mem() const {
+      return tuned_mem;
+    }
+    // Register cache `c` under `name`; `enable_perf_counters` selects whether
+    // perf counters are set up for it (implementation in the .cc file).
+    void insert(const std::string& name, const std::shared_ptr<PriCache> c,
+                bool enable_perf_counters);
+    // Remove the cache registered under `name`.
+    void erase(const std::string& name);
+    // Remove all registered caches.
+    void clear();
+    void tune_memory();
+    void balance();
+
+  private:
+    // Hand out memory from *mem_avail to the caches for priority `pri`;
+    // *mem_avail is reduced by the amount assigned.
+    void balance_priority(int64_t *mem_avail, Priority pri);
+  };
+}
+
+#endif
diff --git a/src/common/QueueRing.h b/src/common/QueueRing.h
new file mode 100644
index 00000000..af5c47be
--- /dev/null
+++ b/src/common/QueueRing.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef QUEUE_RING_H
+#define QUEUE_RING_H
+
+#include "common/ceph_mutex.h"
+
+#include <list>
+#include <atomic>
+#include <vector>
+
+// A ring of independently locked FIFO buckets.  Writers and readers each walk
+// the ring round-robin through their own atomic cursor.
+template <class T>
+class QueueRing {
+  struct QueueBucket {
+    ceph::mutex lock = ceph::make_mutex("QueueRing::QueueBucket::lock");
+    ceph::condition_variable cond;
+    typename std::list<T> entries;
+
+    QueueBucket() {}
+    // Copies only the queued entries; the mutex and condition variable of the
+    // new bucket are fresh.  rhs is read without taking its lock.
+    QueueBucket(const QueueBucket& rhs) {
+      entries = rhs.entries;
+    }
+
+    // Append an entry, waking waiters when the bucket was empty.
+    void enqueue(const T& entry) {
+      // Scoped guard: push_back() may throw (allocation or T's copy
+      // constructor) and must not leave the bucket locked forever.
+      std::lock_guard l(lock);
+      if (entries.empty()) {
+        cond.notify_all();
+      }
+      entries.push_back(entry);
+    }
+
+    // Block until the bucket is non-empty, then pop its front into *entry.
+    void dequeue(T *entry) {
+      std::unique_lock l(lock);
+      while (entries.empty()) {
+        cond.wait(l);
+      }
+      ceph_assert(!entries.empty());
+      *entry = entries.front();
+      entries.pop_front();
+    }
+  };
+
+  std::vector<QueueBucket> buckets;
+  int num_buckets;
+
+  std::atomic<int64_t> cur_read_bucket = { 0 };
+  std::atomic<int64_t> cur_write_bucket = { 0 };
+
+public:
+  // Creates a ring of n buckets.  n must be positive: the bucket index is
+  // computed modulo num_buckets.
+  QueueRing(int n) : buckets(n), num_buckets(n) {
+  }
+
+  // Store entry in the next bucket of the write cursor.
+  void enqueue(const T& entry) {
+    buckets[++cur_write_bucket % num_buckets].enqueue(entry);
+  }
+
+  // Take an entry from the next bucket of the read cursor, blocking while
+  // that particular bucket is empty.
+  void dequeue(T *entry) {
+    buckets[++cur_read_bucket % num_buckets].dequeue(entry);
+  }
+};
+
+#endif
diff --git a/src/common/RWLock.h b/src/common/RWLock.h
new file mode 100644
index 00000000..94ba7a00
--- /dev/null
+++ b/src/common/RWLock.h
@@ -0,0 +1,267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+
+#ifndef CEPH_RWLock_Posix__H
+#define CEPH_RWLock_Posix__H
+
+#include <pthread.h>
+#include <string>
+#include "include/ceph_assert.h"
+#include "acconfig.h"
+#include "lockdep.h"
+#include "common/valgrind.h"
+
+#include <atomic>
+
+// Wrapper around a POSIX pthread_rwlock_t.  When `track` is set it counts the
+// current read/write holders (nrlock/nwlock) so that is_locked() and
+// is_wlocked() can be queried; when `lockdep` is set and g_lockdep is enabled,
+// lock and unlock events are reported to lockdep under `name`.
+class RWLock final
+{
+  mutable pthread_rwlock_t L;
+  std::string name;
+  mutable int id;  // lockdep id; -1 until registered
+  mutable std::atomic<unsigned> nrlock = { 0 }, nwlock = { 0 };
+  bool track, lockdep;
+
+  std::string unique_name(const char* name) const;
+
+public:
+  RWLock(const RWLock& other) = delete;
+  const RWLock& operator=(const RWLock& other) = delete;
+
+  // n:                lock name, used for lockdep registration
+  // track_lock:       maintain the nrlock/nwlock holder counters
+  // ld:               report this lock to lockdep
+  // prioritize_write: request the writer-preferring lock kind; only honoured
+  //                   when HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP is defined
+  RWLock(const std::string &n, bool track_lock=true, bool ld=true, bool prioritize_write=false)
+    : name(n), id(-1), track(track_lock),
+      lockdep(ld) {
+#if defined(HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP)
+    if (prioritize_write) {
+      pthread_rwlockattr_t attr;
+      pthread_rwlockattr_init(&attr);
+      // PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP
+      //   Setting the lock kind to this avoids writer starvation as long as
+      //   any read locking is not done in a recursive fashion.
+      pthread_rwlockattr_setkind_np(&attr,
+          PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+      pthread_rwlock_init(&L, &attr);
+      pthread_rwlockattr_destroy(&attr);
+    } else 
+#endif 
+    // Next block is in {} to possibly connect to the above if when code is used.
+    {
+      pthread_rwlock_init(&L, NULL);
+    }
+    ANNOTATE_BENIGN_RACE_SIZED(&id, sizeof(id), "RWLock lockdep id");
+    ANNOTATE_BENIGN_RACE_SIZED(&nrlock, sizeof(nrlock), "RWlock nrlock");
+    ANNOTATE_BENIGN_RACE_SIZED(&nwlock, sizeof(nwlock), "RWlock nwlock");
+    if (lockdep && g_lockdep) id = lockdep_register(name.c_str());
+  }
+
+  // True when at least one reader or writer is counted.  Asserts that holder
+  // tracking is enabled.
+  bool is_locked() const {
+    ceph_assert(track);
+    return (nrlock > 0) || (nwlock > 0);
+  }
+
+  // True when a writer is counted.  Asserts that holder tracking is enabled.
+  bool is_wlocked() const {
+    ceph_assert(track);
+    return (nwlock > 0);
+  }
+  ~RWLock() {
+    // The following check is racy but we are about to destroy
+    // the object and we assume that there are no other users.
+    if (track)
+      ceph_assert(!is_locked());
+    pthread_rwlock_destroy(&L);
+    if (lockdep && g_lockdep) {
+      lockdep_unregister(id);
+    }
+  }
+
+  // Release the lock, whether it was held for read or for write.  Note that
+  // the `lockdep` parameter shadows the member: lockdep is only notified when
+  // both the parameter and the member are set.
+  void unlock(bool lockdep=true) const {
+    if (track) {
+      // A counted writer takes precedence; otherwise a reader must exist.
+      if (nwlock > 0) {
+        nwlock--;
+      } else {
+        ceph_assert(nrlock > 0);
+        nrlock--;
+      }
+    }
+    if (lockdep && this->lockdep && g_lockdep)
+      id = lockdep_will_unlock(name.c_str(), id);
+    int r = pthread_rwlock_unlock(&L);
+    ceph_assert(r == 0);
+  }
+
+  // read
+  // Acquire the lock for reading, blocking as needed.
+  void get_read() const {
+    if (lockdep && g_lockdep) id = lockdep_will_lock(name.c_str(), id);
+    int r = pthread_rwlock_rdlock(&L);
+    ceph_assert(r == 0);
+    if (lockdep && g_lockdep) id = lockdep_locked(name.c_str(), id);
+    if (track)
+      nrlock++;
+  }
+  // Try to acquire the lock for reading without blocking; returns true on
+  // success.  Only lockdep_locked() is reported (no lockdep_will_lock()).
+  bool try_get_read() const {
+    if (pthread_rwlock_tryrdlock(&L) == 0) {
+      if (track)
+         nrlock++;
+      if (lockdep && g_lockdep) id = lockdep_locked(name.c_str(), id);
+      return true;
+    }
+    return false;
+  }
+  // Release a read lock (same as unlock()).
+  void put_read() const {
+    unlock();
+  }
+
+  // write
+  // Acquire the lock for writing, blocking as needed.  The `lockdep`
+  // parameter shadows the member, as in unlock().
+  void get_write(bool lockdep=true) {
+    if (lockdep && this->lockdep && g_lockdep)
+      id = lockdep_will_lock(name.c_str(), id);
+    int r = pthread_rwlock_wrlock(&L);
+    ceph_assert(r == 0);
+    if (lockdep && this->lockdep && g_lockdep)
+      id = lockdep_locked(name.c_str(), id);
+    if (track)
+      nwlock++;
+
+  }
+  // Try to acquire the lock for writing without blocking; returns true on
+  // success.
+  bool try_get_write(bool lockdep=true) {
+    if (pthread_rwlock_trywrlock(&L) == 0) {
+      if (lockdep && this->lockdep && g_lockdep)
+	id = lockdep_locked(name.c_str(), id);
+      if (track)
+         nwlock++;
+      return true;
+    }
+    return false;
+  }
+  // Release a write lock (same as unlock()).
+  void put_write() {
+    unlock();
+  }
+
+  // Acquire for write when for_write is true, otherwise for read.
+  void get(bool for_write) {
+    if (for_write) {
+      get_write();
+    } else {
+      get_read();
+    }
+  }
+
+public:
+  // Scoped read lock: acquires in the constructor and releases in the
+  // destructor unless unlock() was already called.
+  class RLocker {
+    const RWLock &m_lock;
+
+    bool locked;
+
+  public:
+   explicit  RLocker(const RWLock& lock) : m_lock(lock) {
+      m_lock.get_read();
+      locked = true;
+    }
+    // Release early; asserts that the lock is still held by this guard.
+    void unlock() {
+      ceph_assert(locked);
+      m_lock.unlock();
+      locked = false;
+    }
+    ~RLocker() {
+      if (locked) {
+        m_lock.unlock();
+      }
+    }
+  };
+
+  // Scoped write lock: acquires in the constructor and releases in the
+  // destructor unless unlock() was already called.
+  class WLocker {
+    RWLock &m_lock;
+
+    bool locked;
+
+  public:
+    explicit WLocker(RWLock& lock) : m_lock(lock) {
+      m_lock.get_write();
+      locked = true;
+    }
+    // Release early; asserts that the lock is still held by this guard.
+    void unlock() {
+      ceph_assert(locked);
+      m_lock.unlock();
+      locked = false;
+    }
+    ~WLocker() {
+      if (locked) {
+        m_lock.unlock();
+      }
+    }
+  };
+
+  // Remembers how a given RWLock is currently held (not at all, for read, or
+  // for write).  Unlike RLocker/WLocker it has no destructor, so it never
+  // releases the lock on its own.
+  class Context {
+    RWLock& lock;
+
+  public:
+    enum LockState {
+      Untaken = 0,
+      TakenForRead = 1,
+      TakenForWrite = 2,
+    };
+
+  private:
+    LockState state;
+
+  public:
+    explicit Context(RWLock& l) : lock(l), state(Untaken) {}
+    // Adopt a lock whose state `s` was established elsewhere.
+    Context(RWLock& l, LockState s) : lock(l), state(s) {}
+
+    void get_write() {
+      ceph_assert(state == Untaken);
+
+      lock.get_write();
+      state = TakenForWrite;
+    }
+
+    void get_read() {
+      ceph_assert(state == Untaken);
+
+      lock.get_read();
+      state = TakenForRead;
+    }
+
+    void unlock() {
+      ceph_assert(state != Untaken);
+      lock.unlock();
+      state = Untaken;
+    }
+
+    // Switch from read to write.  The read lock is released before the write
+    // lock is requested, so the lock is not held in between.
+    void promote() {
+      ceph_assert(state == TakenForRead);
+      unlock();
+      get_write();
+    }
+
+    LockState get_state() { return state; }
+    // Overwrite the recorded state without touching the lock itself.
+    void set_state(LockState s) {
+      state = s;
+    }
+
+    bool is_locked() {
+      return (state != Untaken);
+    }
+
+    bool is_rlocked() {
+      return (state == TakenForRead);
+    }
+
+    bool is_wlocked() {
+      return (state == TakenForWrite);
+    }
+  };
+};
+
+#endif // !CEPH_RWLock_Posix__H
diff --git a/src/common/Readahead.cc b/src/common/Readahead.cc
new file mode 100644
index 00000000..2d2b35ff
--- /dev/null
+++ b/src/common/Readahead.cc
@@ -0,0 +1,194 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/Readahead.h"
+#include "common/Cond.h"
+
+// Initial state: readahead triggers after 10 sequential requests, has no
+// minimum size, an unlimited (NO_LIMIT) maximum size, no alignments, and no
+// read stream observed yet.
+Readahead::Readahead()
+  : m_trigger_requests(10),
+    m_readahead_min_bytes(0),
+    m_readahead_max_bytes(NO_LIMIT),
+    m_alignments(),
+    m_nr_consec_read(0),
+    m_consec_read_bytes(0),
+    m_last_pos(0),
+    m_readahead_pos(0),
+    m_readahead_trigger_pos(0),
+    m_readahead_size(0),
+    m_pending(0) {
+}
+
+// Nothing to release explicitly.
+Readahead::~Readahead() = default;
+
+// Feed a batch of (offset, length) reads into the tracker and return the
+// readahead extent to issue next.  Returns (0, 0) when either the read stream
+// or the readahead stream has already reached `limit`.
+Readahead::extent_t Readahead::update(const std::vector<extent_t>& extents, uint64_t limit) {
+  // Scoped guard: m_lock is released on every return path.
+  std::lock_guard locker(m_lock);
+  for (const auto& extent : extents) {
+    _observe_read(extent.first, extent.second);
+  }
+  if (m_readahead_pos >= limit || m_last_pos >= limit) {
+    return extent_t(0, 0);
+  }
+  return _compute_readahead(limit);
+}
+
+// Feed a single read into the tracker and return the readahead extent to
+// issue next.  Returns (0, 0) when either the read stream or the readahead
+// stream has already reached `limit`.
+Readahead::extent_t Readahead::update(uint64_t offset, uint64_t length, uint64_t limit) {
+  // Scoped guard: m_lock is released on every return path.
+  std::lock_guard locker(m_lock);
+  _observe_read(offset, length);
+  if (m_readahead_pos >= limit || m_last_pos >= limit) {
+    return extent_t(0, 0);
+  }
+  return _compute_readahead(limit);
+}
+
+// Account for one read.  A read that starts exactly where the previous one
+// ended extends the sequential stream; anything else restarts the tracking.
+void Readahead::_observe_read(uint64_t offset, uint64_t length) {
+  const bool continues_stream = (offset == m_last_pos);
+  if (!continues_stream) {
+    // not sequential: forget the stream and any readahead in progress
+    m_nr_consec_read = 0;
+    m_consec_read_bytes = 0;
+    m_readahead_trigger_pos = 0;
+    m_readahead_size = 0;
+    m_readahead_pos = 0;
+  } else {
+    ++m_nr_consec_read;
+    m_consec_read_bytes += length;
+  }
+  m_last_pos = offset + length;
+}
+
+// Compute the next readahead extent as (offset, length); a length of 0 means
+// no readahead should be issued.  Updates the readahead position, trigger
+// position and size as a side effect.  m_lock must be held by the caller.
+Readahead::extent_t Readahead::_compute_readahead(uint64_t limit) {
+  uint64_t readahead_offset = 0;
+  uint64_t readahead_length = 0;
+  if (m_nr_consec_read >= m_trigger_requests) {
+    // currently reading sequentially
+    if (m_last_pos >= m_readahead_trigger_pos) {
+      // need to read ahead
+      if (m_readahead_size == 0) {
+	// initial readahead trigger
+	m_readahead_size = m_consec_read_bytes;
+	m_readahead_pos = m_last_pos;
+      } else {
+	// continuing readahead trigger
+	m_readahead_size *= 2;
+	if (m_last_pos > m_readahead_pos) {
+	  m_readahead_pos = m_last_pos;
+	}
+      }
+      m_readahead_size = std::max(m_readahead_size, m_readahead_min_bytes);
+      m_readahead_size = std::min(m_readahead_size, m_readahead_max_bytes);
+      readahead_offset = m_readahead_pos;
+      readahead_length = m_readahead_size;
+
+      // Snap to the first alignment possible
+      uint64_t readahead_end = readahead_offset + readahead_length;
+      for (const uint64_t alignment : m_alignments) {
+	if (alignment == 0) {
+	  // A zero alignment unit is meaningless and would divide by zero
+	  // below; skip it.
+	  continue;
+	}
+	// Align the readahead, if possible.
+	uint64_t align_prev = readahead_end / alignment * alignment;
+	uint64_t align_next = align_prev + alignment;
+	uint64_t dist_prev = readahead_end - align_prev;
+	uint64_t dist_next = align_next - readahead_end;
+	if (dist_prev < readahead_length / 2 && dist_prev < dist_next) {
+	  // we can snap to the previous alignment point by a less than 50% reduction in size
+	  ceph_assert(align_prev > readahead_offset);
+	  readahead_length = align_prev - readahead_offset;
+	  break;
+	} else if(dist_next < readahead_length / 2) {
+	  // we can snap to the next alignment point by a less than 50% increase in size
+	  ceph_assert(align_next > readahead_offset);
+	  readahead_length = align_next - readahead_offset;
+	  break;
+	}
+	// Note that m_readahead_size should remain unadjusted.
+      }
+
+      // Never read past the end of the target.
+      if (m_readahead_pos + readahead_length > limit) {
+	readahead_length = limit - m_readahead_pos;
+      }
+
+      // The next readahead is triggered once the read stream is halfway
+      // through the extent returned now.
+      m_readahead_trigger_pos = m_readahead_pos + readahead_length / 2;
+      m_readahead_pos += readahead_length;
+    }
+  }
+  return extent_t(readahead_offset, readahead_length);
+}
+
+// Add `count` (which must be positive) to the pending counter.
+void Readahead::inc_pending(int count) {
+  ceph_assert(count > 0);
+  std::lock_guard pending_locker(m_pending_lock);
+  m_pending += count;
+}
+
+// Subtract `count` (which must be positive and not exceed the current value)
+// from the pending counter.  When the counter reaches zero, all registered
+// waiters are completed with 0.
+void Readahead::dec_pending(int count) {
+  ceph_assert(count > 0);
+
+  // Waiters are detached while holding the lock and completed only after it
+  // has been released.
+  std::list<Context *> ready;
+  {
+    std::lock_guard pending_locker(m_pending_lock);
+    ceph_assert(m_pending >= count);
+    m_pending -= count;
+    if (m_pending == 0) {
+      ready = std::move(m_pending_waiting);
+    }
+  }
+
+  for (Context *waiter : ready) {
+    waiter->complete(0);
+  }
+}
+
+// Blocking variant: registers a C_SaferCond through the callback overload and
+// waits on it.
+void Readahead::wait_for_pending() {
+  C_SaferCond on_drained;
+  wait_for_pending(&on_drained);
+  on_drained.wait();
+}
+
+// Callback variant: completes ctx with 0 once the pending counter is zero.
+// If nothing is pending, ctx is completed immediately on the calling thread;
+// otherwise it is queued and completed later by dec_pending().
+void Readahead::wait_for_pending(Context *ctx) {
+  {
+    std::lock_guard locker(m_pending_lock);
+    if (m_pending > 0) {
+      // Queue while still holding m_pending_lock.  dec_pending() detaches
+      // m_pending_waiting under the same lock, so adding the waiter after
+      // unlocking could race with it and either corrupt the list or leave
+      // ctx queued with nothing left to complete it.
+      m_pending_waiting.push_back(ctx);
+      return;
+    }
+  }
+
+  // Complete outside the lock.
+  ctx->complete(0);
+}
+// Set the number of sequential requests needed to trigger readahead.
+void Readahead::set_trigger_requests(int trigger_requests) {
+  std::lock_guard locker(m_lock);
+  m_trigger_requests = trigger_requests;
+}
+
+// Read the minimum readahead size under m_lock.
+uint64_t Readahead::get_min_readahead_size(void) {
+  std::lock_guard locker(m_lock);
+  const uint64_t min_bytes = m_readahead_min_bytes;
+  return min_bytes;
+}
+
+// Read the maximum readahead size under m_lock.
+uint64_t Readahead::get_max_readahead_size(void) {
+  std::lock_guard locker(m_lock);
+  const uint64_t max_bytes = m_readahead_max_bytes;
+  return max_bytes;
+}
+
+// Set the minimum readahead size, in bytes.
+void Readahead::set_min_readahead_size(uint64_t min_readahead_size) {
+  std::lock_guard locker(m_lock);
+  m_readahead_min_bytes = min_readahead_size;
+}
+
+// Set the maximum readahead size, in bytes.
+void Readahead::set_max_readahead_size(uint64_t max_readahead_size) {
+  std::lock_guard locker(m_lock);
+  m_readahead_max_bytes = max_readahead_size;
+}
+
+// Replace the list of alignment units (copied under m_lock).
+void Readahead::set_alignments(const vector<uint64_t> &alignments) {
+  std::lock_guard locker(m_lock);
+  m_alignments = alignments;
+}
diff --git a/src/common/Readahead.h b/src/common/Readahead.h
new file mode 100644
index 00000000..716e58cd
--- /dev/null
+++ b/src/common/Readahead.h
@@ -0,0 +1,167 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_READAHEAD_H
+#define CEPH_READAHEAD_H
+
+#include <list>
+#include <vector>
+
+#include "include/Context.h"
+#include "common/ceph_mutex.h"
+
+/**
+   This class provides common state and logic for code that needs to perform readahead
+   on linear things such as RBD images or files.
+   Unless otherwise specified, all methods are thread-safe.
+
+   Minimum and maximum readahead sizes may be violated by up to 50\% if alignment is enabled.
+   Minimum readahead size may be violated if the end of the readahead target is reached.
+ */
+class Readahead {
+public:
+  /// An extent expressed as (offset, length).
+  typedef std::pair<uint64_t, uint64_t> extent_t;
+
+  // equal to UINT64_MAX
+  static const uint64_t NO_LIMIT = 18446744073709551615ULL;
+
+  Readahead();
+
+  ~Readahead();
+
+  /**
+     Update state with new reads and return readahead to be performed.
+     If the length of the returned extent is 0, no readahead should be performed.
+     The readahead extent is guaranteed not to pass \c limit.
+
+     Note that passing in NO_LIMIT as the limit and truncating the returned extent
+     is not the same as passing in the correct limit, because the internal state
+     will differ in the two cases.
+
+     @param extents read operations since last call to update
+     @param limit size of the thing readahead is being applied to
+   */
+  extent_t update(const std::vector<extent_t>& extents, uint64_t limit);
+
+  /**
+     Update state with a new read and return readahead to be performed.
+     If the length of the returned extent is 0, no readahead should be performed.
+     The readahead extent is guaranteed not to pass \c limit.
+
+     Note that passing in NO_LIMIT as the limit and truncating the returned extent
+     is not the same as passing in the correct limit, because the internal state
+     will differ in the two cases.
+
+     @param offset offset of the read operation
+     @param length length of the read operation
+     @param limit size of the thing readahead is being applied to
+   */
+  extent_t update(uint64_t offset, uint64_t length, uint64_t limit);
+
+  /**
+     Increment the pending counter.
+   */
+  void inc_pending(int count = 1);
+
+  /**
+     Decrement the pending counter.
+     The counter must not be decremented below 0.
+   */
+  void dec_pending(int count = 1);
+
+  /**
+     Waits until the pending count reaches 0.
+     The overload taking a Context does not block: it completes \c ctx with 0
+     once the count is 0.
+   */
+  void wait_for_pending();
+  void wait_for_pending(Context *ctx);
+
+  /**
+     Sets the number of sequential requests necessary to trigger readahead.
+   */
+  void set_trigger_requests(int trigger_requests);
+
+  /**
+     Gets the minimum size of a readahead request, in bytes.
+   */
+  uint64_t get_min_readahead_size(void);
+
+  /**
+     Gets the maximum size of a readahead request, in bytes.
+   */
+  uint64_t get_max_readahead_size(void);
+
+  /**
+     Sets the minimum size of a readahead request, in bytes.
+   */
+  void set_min_readahead_size(uint64_t min_readahead_size);
+
+  /**
+     Sets the maximum size of a readahead request, in bytes.
+   */
+  void set_max_readahead_size(uint64_t max_readahead_size);
+
+  /**
+     Sets the alignment units.
+     If the end point of a readahead request can be aligned to an alignment unit
+     by increasing or decreasing the size of the request by 50\% or less, it will.
+     Alignments are tested in order, so larger numbers should almost always come first.
+   */
+  void set_alignments(const std::vector<uint64_t> &alignments);
+
+private:
+  /**
+     Records that a read request has been received.
+     m_lock must be held while calling.
+   */
+  void _observe_read(uint64_t offset, uint64_t length);
+
+  /**
+     Computes the next readahead request.
+     m_lock must be held while calling.
+  */
+  extent_t _compute_readahead(uint64_t limit);
+
+  /// Number of sequential requests necessary to trigger readahead
+  int m_trigger_requests;
+
+  /// Minimum size of a readahead request, in bytes
+  uint64_t m_readahead_min_bytes;
+
+  /// Maximum size of a readahead request, in bytes
+  uint64_t m_readahead_max_bytes;
+
+  /// Alignment units, in bytes
+  std::vector<uint64_t> m_alignments;
+
+  /// Held while reading/modifying any state except m_pending
+  ceph::mutex m_lock = ceph::make_mutex("Readahead::m_lock");
+
+  /// Number of consecutive read requests in the current sequential stream
+  int m_nr_consec_read;
+
+  /// Number of bytes read in the current sequential stream
+  uint64_t m_consec_read_bytes;
+
+  /// Position of the read stream
+  uint64_t m_last_pos;
+
+  /// Position of the readahead stream
+  uint64_t m_readahead_pos;
+
+  /// When readahead is already triggered and the read stream crosses this point, readahead is continued
+  uint64_t m_readahead_trigger_pos;
+
+  /// Size of the next readahead request (barring changes due to alignment, etc.)
+  uint64_t m_readahead_size;
+
+  /// Number of pending readahead requests, as determined by inc_pending() and dec_pending()
+  int m_pending;
+
+  /// Lock for m_pending
+  ceph::mutex m_pending_lock = ceph::make_mutex("Readahead::m_pending_lock");
+
+  /// Waiters for pending readahead
+  std::list<Context *> m_pending_waiting;
+};
+
+#endif
diff --git a/src/common/RefCountedObj.h b/src/common/RefCountedObj.h
new file mode 100644
index 00000000..7596936a
--- /dev/null
+++ b/src/common/RefCountedObj.h
@@ -0,0 +1,179 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_REFCOUNTEDOBJ_H
+#define CEPH_REFCOUNTEDOBJ_H
+ 
+#include "common/ceph_mutex.h"
+#include "common/ceph_context.h"
+#include "common/valgrind.h"
+#include "common/debug.h"
+
+#include <boost/smart_ptr/intrusive_ptr.hpp>
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+// Intrusively reference-counted base class.  The object deletes itself when
+// put() drops the count to zero.  If a CephContext is set, every get()/put()
+// is logged to the "refs" subsystem at level 1.
+struct RefCountedObject {
+private:
+  mutable std::atomic<uint64_t> nref;
+  CephContext *cct;
+public:
+  // c: context used for ref logging (may be null); n: initial ref count.
+  RefCountedObject(CephContext *c = NULL, int n=1) : nref(n), cct(c) {}
+  virtual ~RefCountedObject() {
+    ceph_assert(nref == 0);
+  }
+
+  // Take a reference; returns this for call chaining.
+  const RefCountedObject *get() const {
+    // Keep the full 64-bit width of the counter instead of narrowing to int.
+    uint64_t v = ++nref;
+    if (cct)
+      lsubdout(cct, refs, 1) << "RefCountedObject::get " << this << " "
+			     << (v - 1) << " -> " << v
+			     << dendl;
+    return this;
+  }
+  RefCountedObject *get() {
+    uint64_t v = ++nref;
+    if (cct)
+      lsubdout(cct, refs, 1) << "RefCountedObject::get " << this << " "
+			     << (v - 1) << " -> " << v
+			     << dendl;
+    return this;
+  }
+  // Drop a reference; deletes the object when the last one is released.
+  void put() const {
+    // Read cct before decrementing: once our reference is gone, members of
+    // *this must not be touched unless we turn out to be the last holder.
+    CephContext *local_cct = cct;
+    // Compare the untruncated value: narrowing to int would make any count
+    // whose low 32 bits are zero look like "last reference".
+    uint64_t v = --nref;
+    if (local_cct)
+      lsubdout(local_cct, refs, 1) << "RefCountedObject::put " << this << " "
+				   << (v + 1) << " -> " << v
+				   << dendl;
+    if (v == 0) {
+      ANNOTATE_HAPPENS_AFTER(&nref);
+      ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(&nref);
+      delete this;
+    } else {
+      ANNOTATE_HAPPENS_BEFORE(&nref);
+    }
+  }
+  void set_cct(CephContext *c) {
+    cct = c;
+  }
+
+  // Current reference count.
+  uint64_t get_nref() const {
+    return nref;
+  }
+};
+
+#ifndef WITH_SEASTAR
+
+/**
+ * RefCountedCond
+ *
+ *  a refcounted condition, will be removed when all references are dropped
+ */
+
+struct RefCountedCond : public RefCountedObject {
+  bool complete = false;
+  ceph::mutex lock = ceph::make_mutex("RefCountedCond::lock");
+  ceph::condition_variable cond;
+  int rval = 0;
+
+  RefCountedCond() = default;
+
+  // Block until done() has been called, then return the value passed to it.
+  int wait() {
+    std::unique_lock locker(lock);
+    while (!complete)
+      cond.wait(locker);
+    return rval;
+  }
+
+  // Record the result r, mark the condition complete and wake all waiters.
+  void done(int r) {
+    std::lock_guard locker(lock);
+    rval = r;
+    complete = true;
+    cond.notify_all();
+  }
+
+  // Same as done(0).
+  void done() {
+    done(0);
+  }
+};
+
+/**
+ * RefCountedWaitObject
+ *
+ * refcounted object that allows waiting for the object's last reference.
+ * Any referrer can either put or put_wait(). A simple put() will return
+ * immediately, a put_wait() will return only when the object is destroyed.
+ * e.g., useful when we want to wait for a specific event completion. We
+ * use RefCountedCond, as the condition can be referenced after the object
+ * destruction. 
+ *    
+ */
+struct RefCountedWaitObject {
+  std::atomic<uint64_t> nref = { 1 };
+  // Separately refcounted condition; it can outlive this object so that
+  // put_wait() callers may still wait on it after `delete this`.
+  RefCountedCond *c;
+
+  RefCountedWaitObject() {
+    c = new RefCountedCond;
+  }
+  virtual ~RefCountedWaitObject() {
+    // Drop the reference taken by the constructor.
+    c->put();
+  }
+
+  RefCountedWaitObject *get() {
+    nref++;
+    return this;
+  }
+
+  // Drop a reference without waiting.  Returns true when this call released
+  // the last reference and destroyed the object.
+  bool put() {
+    bool ret = false;
+    // Pin the condition through a local pointer and an extra reference:
+    // `this` (and therefore the member c) may be gone after the decrement.
+    RefCountedCond *cond = c;
+    cond->get();
+    if (--nref == 0) {
+      cond->done();
+      delete this;
+      ret = true;
+    }
+    cond->put();
+    return ret;
+  }
+
+  // Drop a reference and return only once the last reference has been
+  // released (by this caller or by another one).
+  void put_wait() {
+    RefCountedCond *cond = c;
+
+    // Same pinning as in put(): keep the condition alive while we use it.
+    cond->get();
+    if (--nref == 0) {
+      cond->done();
+      delete this;
+    } else {
+      cond->wait();
+    }
+    cond->put();
+  }
+};
+
+#endif // WITH_SEASTAR
+
+// boost::intrusive_ptr hook: take a reference on p.
+static inline void intrusive_ptr_add_ref(const RefCountedObject *p) {
+  p->get();
+}
+// boost::intrusive_ptr hook: drop a reference on p (p deletes itself when the
+// count reaches zero).
+static inline void intrusive_ptr_release(const RefCountedObject *p) {
+  p->put();
+}
+
+using RefCountedPtr = boost::intrusive_ptr<RefCountedObject>;
+
+#endif
diff --git a/src/common/Semaphore.h b/src/common/Semaphore.h
new file mode 100644
index 00000000..88aa9c84
--- /dev/null
+++ b/src/common/Semaphore.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_Sem_Posix__H
+#define CEPH_Sem_Posix__H
+
+#include "common/ceph_mutex.h"
+
+// Counting semaphore built from a mutex and a condition variable.
+class Semaphore
+{
+  ceph::mutex m = ceph::make_mutex("Semaphore::m");
+  ceph::condition_variable c;
+  int count = 0;
+
+public:
+  // Add one unit and wake every thread blocked in Get().
+  void Put() {
+    std::lock_guard guard(m);
+    ++count;
+    c.notify_all();
+  }
+
+  // Block until at least one unit is available, then consume it.
+  void Get() {
+    std::unique_lock guard(m);
+    while (count < 1) {
+      c.wait(guard);
+    }
+    --count;
+  }
+};
+
+#endif // !CEPH_Sem_Posix__H
diff --git a/src/common/SloppyCRCMap.cc b/src/common/SloppyCRCMap.cc
new file mode 100644
index 00000000..102a6399
--- /dev/null
+++ b/src/common/SloppyCRCMap.cc
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "common/SloppyCRCMap.h"
+#include "common/Formatter.h"
+
+using namespace std;
+
+// Update the map for a write of `len` bytes of `bl` at `offset`.  Blocks that
+// the write covers completely get a freshly computed CRC; blocks it covers
+// only partially (at its head or tail) are invalidated.  When `out` is
+// non-null every change is logged to it.
+void SloppyCRCMap::write(uint64_t offset, uint64_t len, const bufferlist& bl,
+			 std::ostream *out)
+{
+  if (block_size == 0) {
+    // No block size set (e.g. a default-constructed map): nothing can be
+    // tracked, and the modulo below would divide by zero.
+    return;
+  }
+  int64_t left = len;
+  uint64_t pos = offset;
+  unsigned o = offset % block_size;
+  if (o) {
+    // unaligned head: the first block is only partially overwritten
+    crc_map.erase(offset - o);
+    if (out)
+      *out << "write invalidate " << (offset - o) << "\n";
+    pos += (block_size - o);
+    left -= (block_size - o);
+  }
+  while (left >= block_size) {
+    bufferlist t;
+    t.substr_of(bl, pos - offset, block_size);
+    const uint32_t crc = t.crc32c(crc_iv);
+    crc_map[pos] = crc;
+    if (out)
+      *out << "write set " << pos << " " << crc << "\n";
+    pos += block_size;
+    left -= block_size;
+  }
+  if (left > 0) {
+    // unaligned tail: the last block is only partially overwritten
+    crc_map.erase(pos);
+    if (out)
+      *out << "write invalidate " << pos << "\n";
+  }
+}
+
+// Verify the data `bl` read from [offset, offset + len) against the stored
+// CRCs.  Only blocks that the read covers completely and that have a stored
+// CRC are checked.  Returns the number of mismatching blocks; when `err` is
+// non-null each mismatch is described on it.
+int SloppyCRCMap::read(uint64_t offset, uint64_t len, const bufferlist& bl,
+		       std::ostream *err)
+{
+  if (block_size == 0) {
+    // No block size set: no CRCs can have been recorded, and the modulo
+    // below would divide by zero.
+    return 0;
+  }
+  int errors = 0;
+  int64_t left = len;
+  uint64_t pos = offset;
+  unsigned o = offset % block_size;
+  if (o) {
+    // skip the partially covered first block
+    pos += (block_size - o);
+    left -= (block_size - o);
+  }
+  while (left >= block_size) {
+    // FIXME: this could be more efficient if we avoid doing a find()
+    // on each iteration
+    std::map<uint64_t,uint32_t>::iterator p = crc_map.find(pos);
+    if (p != crc_map.end()) {
+      bufferlist t;
+      t.substr_of(bl, pos - offset, block_size);
+      uint32_t crc = t.crc32c(crc_iv);
+      if (p->second != crc) {
+	errors++;
+	if (err)
+	  *err << "offset " << pos << " len " << block_size
+	       << " has crc " << crc << " expected " << p->second << "\n";
+      }
+    }
+    pos += block_size;
+    left -= block_size;
+  }
+  return errors;
+}
+
+// Drop every CRC entry at or after the block containing `offset`.
+void SloppyCRCMap::truncate(uint64_t offset)
+{
+  if (block_size) {
+    // Round down to the start of the containing block.  Skipped when no
+    // block size is set, where the modulo would divide by zero.
+    offset -= offset % block_size;
+  }
+  crc_map.erase(crc_map.lower_bound(offset), crc_map.end());
+}
+
+// Update the map for zeroing [offset, offset + len).  Completely covered
+// blocks are set to the CRC of an all-zero block; partially covered blocks at
+// the head or tail are invalidated.
+void SloppyCRCMap::zero(uint64_t offset, uint64_t len)
+{
+  if (block_size == 0) {
+    // No block size set: nothing can be tracked, and the modulo below would
+    // divide by zero.
+    return;
+  }
+  int64_t left = len;
+  uint64_t pos = offset;
+  unsigned o = offset % block_size;
+  if (o) {
+    crc_map.erase(offset - o);
+    pos += (block_size - o);
+    left -= (block_size - o);
+  }
+  while (left >= block_size) {
+    crc_map[pos] = zero_crc;
+    pos += block_size;
+    left -= block_size;
+  }
+  if (left > 0)
+    crc_map.erase(pos);
+}
+
+// Update the map for cloning `len` bytes from `src` at `srcoff` into this
+// object at `offset`.  For each completely covered destination block the CRC
+// is copied from `src` when both maps use the same block size and `src` has
+// an entry at the corresponding position; otherwise the destination block is
+// invalidated.  Partially covered head/tail blocks are always invalidated.
+// When `out` is non-null every change is logged to it.
+void SloppyCRCMap::clone_range(uint64_t offset, uint64_t len,
+			       uint64_t srcoff, const SloppyCRCMap& src,
+			       std::ostream *out)
+{
+  if (block_size == 0) {
+    // No block size set: nothing can be tracked, and the modulo below would
+    // divide by zero.
+    return;
+  }
+  int64_t left = len;
+  uint64_t pos = offset;
+  uint64_t srcpos = srcoff;
+  unsigned o = offset % block_size;
+  if (o) {
+    crc_map.erase(offset - o);
+    pos += (block_size - o);
+    srcpos += (block_size - o);
+    left -= (block_size - o);
+    if (out)
+      *out << "clone_range invalidate " << (offset - o) << "\n";
+  }
+  while (left >= block_size) {
+    // FIXME: this could be more efficient.
+    if (block_size == src.block_size) {
+      map<uint64_t,uint32_t>::const_iterator p = src.crc_map.find(srcpos);
+      if (p != src.crc_map.end()) {
+	crc_map[pos] = p->second;
+	if (out)
+	  *out << "clone_range copy " << pos << " " << p->second << "\n";
+      } else {
+	crc_map.erase(pos);
+	if (out)
+	  *out << "clone_range invalidate " << pos << "\n";
+      }
+    } else {
+      crc_map.erase(pos);
+      if (out)
+	*out << "clone_range invalidate " << pos << "\n";
+    }
+    pos += block_size;
+    srcpos += block_size;
+    left -= block_size;
+  }
+  if (left > 0) {
+    crc_map.erase(pos);
+    if (out)
+      *out << "clone_range invalidate " << pos << "\n";
+  }
+}
+
+// Serialize the map: block_size followed by crc_map, wrapped in the
+// ENCODE_START/ENCODE_FINISH envelope with version 1 and compat version 1.
+void SloppyCRCMap::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(block_size, bl);
+  encode(crc_map, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Deserialize the map, reading the fields in the order encode() wrote them.
+void SloppyCRCMap::decode(bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  uint32_t bs;
+  decode(bs, bl);
+  // Go through set_block_size() so that zero_crc is recomputed as well.
+  set_block_size(bs);
+  decode(crc_map, bl);
+  DECODE_FINISH(bl);
+}
+
+// Write block_size and every (offset, crc) entry of the map to the formatter.
+void SloppyCRCMap::dump(ceph::Formatter *f) const
+{
+  f->dump_unsigned("block_size", block_size);
+  f->open_array_section("crc_map");
+  for (const auto& [block_offset, block_crc] : crc_map) {
+    f->open_object_section("crc");
+    f->dump_unsigned("offset", block_offset);
+    f->dump_unsigned("crc", block_crc);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Append two heap-allocated sample instances to ls.
+void SloppyCRCMap::generate_test_instances(list<SloppyCRCMap*>& ls)
+{
+  // first instance: default-constructed
+  ls.push_back(new SloppyCRCMap);
+
+  // second instance: block size 2, filled by two writes and one zero
+  SloppyCRCMap *populated = new SloppyCRCMap(2);
+  ls.push_back(populated);
+  bufferlist payload;
+  payload.append("some data");
+  populated->write(1, payload.length(), payload);
+  populated->write(10, payload.length(), payload);
+  populated->zero(4, 2);
+}
diff --git a/src/common/SloppyCRCMap.h b/src/common/SloppyCRCMap.h
new file mode 100644
index 00000000..0c2d646f
--- /dev/null
+++ b/src/common/SloppyCRCMap.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_SLOPPYCRCMAP_H
+#define CEPH_COMMON_SLOPPYCRCMAP_H
+
+#include "include/encoding.h"
+
+namespace ceph {
+class Formatter;
+}
+
+/**
+ * SloppyCRCMap
+ *
+ * Opportunistically track CRCs on any reads or writes that cover full
+ * blocks.  Verify read results when we have CRC data available for
+ * the given extent.
+ */
+class SloppyCRCMap {
+  static const int crc_iv = 0xffffffff;  // seed passed to bufferlist::crc32c() in set_block_size()
+
+  std::map<uint64_t, uint32_t> crc_map;  // offset -> crc(-1)
+  uint32_t block_size;  // 0 unless a size is given to the constructor or set_block_size()
+  uint32_t zero_crc;    // crc of block_size zero bytes; crc_iv itself when block_size is 0
+
+public:
+  SloppyCRCMap(uint32_t b=0) {
+    set_block_size(b);
+  }
+
+  void set_block_size(uint32_t b) {
+    block_size = b;
+    //zero_crc = ceph_crc32c(0xffffffff, NULL, block_size);
+    if (b) {
+      bufferlist bl;
+      bl.append_zero(block_size);
+      zero_crc = bl.crc32c(crc_iv);
+    } else {
+      zero_crc = crc_iv;
+    }
+  }
+
+  /// update based on a write
+  void write(uint64_t offset, uint64_t len, const bufferlist& bl,
+	     std::ostream *out = NULL);
+
+  /// update based on a truncate
+  void truncate(uint64_t offset);
+
+  /// update based on a zero/punch_hole
+  void zero(uint64_t offset, uint64_t len);
+
+  /// update based on a clone_range from another map (src, starting at srcoff)
+  void clone_range(uint64_t offset, uint64_t len, uint64_t srcoff, const SloppyCRCMap& src,
+		   std::ostream *out = NULL);
+
+  /**
+   * validate a read result
+   *
+   * @param offset offset
+   * @param len length
+   * @param bl data read
+   * @param err optional ostream to describe errors in detail
+   * @returns error count, 0 for success
+   */
+  int read(uint64_t offset, uint64_t len, const bufferlist& bl, std::ostream *err);
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<SloppyCRCMap*>& ls);
+};
+WRITE_CLASS_ENCODER(SloppyCRCMap)
+
+#endif
diff --git a/src/common/StackStringStream.h b/src/common/StackStringStream.h
new file mode 100644
index 00000000..046e37d7
--- /dev/null
+++ b/src/common/StackStringStream.h
@@ -0,0 +1,189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef COMMON_STACKSTRINGSTREAM_H
+#define COMMON_STACKSTRINGSTREAM_H
+
+#include <boost/container/small_vector.hpp>
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <sstream>
+#include <string_view>
+#include <vector>
+
+#include "include/inline_memory.h"
+
+template<std::size_t SIZE>
+class StackStringBuf : public std::basic_streambuf<char>  // write-only buffer backed by a small_vector of SIZE inline chars
+{
+public:
+  StackStringBuf()
+    : vec{SIZE, boost::container::default_init_t{}}
+  {
+    setp(vec.data(), vec.data() + vec.size());  // the whole vector is the put area
+  }
+  StackStringBuf(const StackStringBuf&) = delete;
+  StackStringBuf& operator=(const StackStringBuf&) = delete;
+  StackStringBuf(StackStringBuf&& o) = delete;
+  StackStringBuf& operator=(StackStringBuf&& o) = delete;
+  ~StackStringBuf() override = default;
+  /// Forget everything written so far and go back to a SIZE-char put area.
+  void clear()
+  {
+    vec.resize(SIZE);
+    setp(vec.data(), vec.data() + SIZE);
+  }
+  /// View of the chars written so far; a later write or clear() may invalidate it.
+  std::string_view strv() const
+  {
+    return std::string_view(pbase(), pptr() - pbase());
+  }
+
+protected:
+  std::streamsize xsputn(const char *s, std::streamsize n) override  // bulk write; always consumes all n chars
+  {
+    std::streamsize capacity = epptr() - pptr();
+    std::streamsize left = n;
+    if (capacity >= left) {
+      maybe_inline_memcpy(pptr(), s, left, 32);
+      pbump(left);
+    } else {
+      maybe_inline_memcpy(pptr(), s, capacity, 64);  // fill what is left of the put area...
+      s += capacity;
+      left -= capacity;
+      vec.insert(vec.end(), s, s + left);            // ...and append the remainder
+      setp(vec.data(), vec.data() + vec.size());     // insert() may have reallocated
+      pbump(vec.size());                             // put area is completely used
+    }
+    return n;
+  }
+  /// Single-char write, reached once the put area is full.
+  int overflow(int c) override
+  {
+    // Compare with eof() explicitly: testing not_eof(c) for truth rejects '\0'.
+    if (traits_type::eq_int_type(c, traits_type::eof()))
+      return traits_type::eof();
+    vec.push_back(traits_type::to_char_type(c));
+    setp(vec.data(), vec.data() + vec.size());  // push_back() may reallocate; keep strv() in sync
+    pbump(vec.size());
+    return c;
+  }
+
+private:
+  // Backing store; its first pptr() - pbase() chars are the stream contents.
+  boost::container::small_vector<char, SIZE> vec;
+};
+
+template<std::size_t SIZE>
+class StackStringStream : public std::basic_ostream<char>  // ostream that writes into its own StackStringBuf<SIZE>
+{
+public:
+  StackStringStream() : basic_ostream<char>(&ssb), default_fmtflags(flags()) {}  // remembers the initial fmtflags for reset()
+  StackStringStream(const StackStringStream& o) = delete;
+  StackStringStream& operator=(const StackStringStream& o) = delete;
+  StackStringStream(StackStringStream&& o) = delete;
+  StackStringStream& operator=(StackStringStream&& o) = delete;
+  ~StackStringStream() override = default;
+
+  void reset() {  // make the stream reusable: state flags, fmtflags and buffer contents
+    clear(); /* reset state flags */
+    flags(default_fmtflags); /* reset fmtflags to constructor defaults */
+    ssb.clear();
+  }
+
+  std::string_view strv() const {  // view into the buffer; see StackStringBuf::strv()
+    return ssb.strv();
+  }
+
+private:
+  StackStringBuf<SIZE> ssb;
+  fmtflags const default_fmtflags;  // captured once in the constructor
+};
+
+/* In an ideal world, we could use StackStringStream indiscriminately, but alas
+ * it's very expensive to construct/destruct. So, we cache them in a
+ * thread_local vector. DO NOT share these with other threads. The copy/move
+ * constructors are deliberately restrictive to make this more difficult to
+ * accidentally do.
+ */
+class CachedStackStringStream {
+public:
+  using sss = StackStringStream<4096>;
+  using osptr = std::unique_ptr<sss>;
+
+  CachedStackStringStream() {  // take a stream from this thread's cache, or allocate a new one
+    if (cache.destructed || cache.c.empty()) {
+      osp = std::make_unique<sss>();
+    } else {
+      osp = std::move(cache.c.back());
+      cache.c.pop_back();
+      osp->reset();  // cached streams still hold their previous contents and flags
+    }
+  }
+  CachedStackStringStream(const CachedStackStringStream&) = delete;
+  CachedStackStringStream& operator=(const CachedStackStringStream&) = delete;
+  CachedStackStringStream(CachedStackStringStream&&) = delete;
+  CachedStackStringStream& operator=(CachedStackStringStream&&) = delete;
+  ~CachedStackStringStream() {  // hand the stream back unless the cache is gone or already holds max_elems
+    if (!cache.destructed && cache.c.size() < max_elems) {
+      cache.c.emplace_back(std::move(osp));
+    }
+  }
+
+  sss& operator*() {
+    return *osp;
+  }
+  sss const& operator*() const {
+    return *osp;
+  }
+  sss* operator->() {
+    return osp.get();
+  }
+  sss const* operator->() const {
+    return osp.get();
+  }
+
+  sss const* get() const {
+    return osp.get();
+  }
+  sss* get() {
+    return osp.get();
+  }
+
+private:
+  static constexpr std::size_t max_elems = 8;  // upper bound on streams kept per thread
+
+  /* The thread_local cache may be destructed before other static structures.
+   * If those destructors try to create a CachedStackStringStream (e.g. for
+   * logging) and access this cache, that access will be undefined. So note if
+   * the cache has been destructed and check before use.
+   */
+  struct Cache {
+    using container = std::vector<osptr>;
+
+    Cache() {}
+    ~Cache() { destructed = true; }
+
+    container c;
+    bool destructed = false;
+  };
+
+  inline static thread_local Cache cache;
+  osptr osp;  // the stream this object currently owns
+};
+
+#endif
diff --git a/src/common/SubProcess.cc b/src/common/SubProcess.cc
new file mode 100644
index 00000000..b7d6ca4c
--- /dev/null
+++ b/src/common/SubProcess.cc
@@ -0,0 +1,394 @@
+#include "SubProcess.h"
+
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#include <sys/types.h>
+#include <signal.h>
+#endif
+#include <stdarg.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <iostream>
+
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+
+SubProcess::SubProcess(const char *cmd_, std_fd_op stdin_op_, std_fd_op stdout_op_, std_fd_op stderr_op_) :
+  cmd(cmd_),
+  cmd_args(),
+  stdin_op(stdin_op_),
+  stdout_op(stdout_op_),
+  stderr_op(stderr_op_),
+  stdin_pipe_out_fd(-1),   // -1: no pipe end held
+  stdout_pipe_in_fd(-1),
+  stderr_pipe_in_fd(-1),
+  pid(-1),                 // -1: nothing spawned yet (is_spawned() tests pid > 0)
+  errstr() {
+}
+
+SubProcess::~SubProcess() {  // asserts only: pid and all three pipe fds must already be reset, as join() does
+  ceph_assert(!is_spawned());
+  ceph_assert(stdin_pipe_out_fd == -1);
+  ceph_assert(stdout_pipe_in_fd == -1);
+  ceph_assert(stderr_pipe_in_fd == -1);
+}
+
+void SubProcess::add_cmd_args(const char *arg, ...) {  // the argument list must end with a NULL sentinel
+  ceph_assert(!is_spawned());
+
+  va_list ap;
+  va_start(ap, arg);
+  // Test before use: a NULL first argument would otherwise reach
+  // add_cmd_arg() and be turned into a std::string, which is undefined.
+  for (const char *p = arg; p != NULL; p = va_arg(ap, const char*)) {
+    add_cmd_arg(p);
+  }
+  va_end(ap);
+}
+
+void SubProcess::add_cmd_arg(const char *arg) {  // append one argument; only allowed before spawn()
+  ceph_assert(!is_spawned());
+
+  cmd_args.push_back(arg);  // stored as a std::string copy, so arg must not be NULL
+}
+
+int SubProcess::get_stdin() const {  // parent-side fd of the child's stdin pipe (spawn() stores ipipe[OUT] here)
+  ceph_assert(is_spawned());
+  ceph_assert(stdin_op == PIPE);
+
+  return stdin_pipe_out_fd;  // -1 once close_stdin() has been called
+}
+
+int SubProcess::get_stdout() const {  // parent-side fd of the child's stdout pipe (spawn() stores opipe[IN] here)
+  ceph_assert(is_spawned());
+  ceph_assert(stdout_op == PIPE);
+
+  return stdout_pipe_in_fd;  // -1 once close_stdout() has been called
+}
+
+int SubProcess::get_stderr() const {  // parent-side fd of the child's stderr pipe (spawn() stores epipe[IN] here)
+  ceph_assert(is_spawned());
+  ceph_assert(stderr_op == PIPE);
+
+  return stderr_pipe_in_fd;  // -1 once close_stderr() has been called
+}
+
+void SubProcess::close(int &fd) {
+  // -1 marks an fd that is not open; otherwise close it and reset to -1.
+  if (fd != -1) {
+    ::close(fd);
+    fd = -1;
+  }
+}
+
+void SubProcess::close_stdin() {  // close the parent's end of the stdin pipe; a repeated call is a no-op
+  ceph_assert(is_spawned());
+  ceph_assert(stdin_op == PIPE);
+
+  close(stdin_pipe_out_fd);  // member close(int&): also resets the fd to -1
+}
+
+void SubProcess::close_stdout() {  // close the parent's end of the stdout pipe; a repeated call is a no-op
+  ceph_assert(is_spawned());
+  ceph_assert(stdout_op == PIPE);
+
+  close(stdout_pipe_in_fd);  // member close(int&): also resets the fd to -1
+}
+
+void SubProcess::close_stderr() {  // close the parent's end of the stderr pipe; a repeated call is a no-op
+  ceph_assert(is_spawned());
+  ceph_assert(stderr_op == PIPE);
+
+  close(stderr_pipe_in_fd);  // member close(int&): also resets the fd to -1
+}
+
+void SubProcess::kill(int signo) const {  // send signo to the spawned process
+  ceph_assert(is_spawned());
+
+  int ret = ::kill(pid, signo);
+  ceph_assert(ret == 0);  // a failing ::kill() is treated as fatal rather than reported
+}
+
+const std::string SubProcess::err() const {  // text accumulated in errstr by spawn()/join()
+  return errstr.str();
+}
+
+class fd_buf : public std::streambuf {  // unbuffered streambuf: each write goes straight to fd via write(2)
+  int fd;  // not owned: never closed by this class
+public:
+  fd_buf (int fd) : fd(fd)
+  {}
+protected:
+  int_type overflow (int_type c) override {  // single-character path
+    if (c == EOF) return EOF;
+    char buf = c;
+    if (write (fd, &buf, 1) != 1) {
+      return EOF;
+    }
+    return c;
+  }
+  std::streamsize xsputn (const char* s, std::streamsize count) override {  // bulk path
+    return write(fd, s, count);  // passed through as is: may be a short count, or -1 on error
+  }
+};
+
+int SubProcess::spawn() {  // fork a child that runs exec(); the parent gets 0, or -errno if pipe/fork failed
+  ceph_assert(!is_spawned());
+  ceph_assert(stdin_pipe_out_fd == -1);
+  ceph_assert(stdout_pipe_in_fd == -1);
+  ceph_assert(stderr_pipe_in_fd == -1);
+
+  enum { IN = 0, OUT = 1 };  // indices into the fd pairs; presumably read end / write end as with pipe(2)
+
+  int ipipe[2], opipe[2], epipe[2];
+
+  ipipe[0] = ipipe[1] = opipe[0] = opipe[1] = epipe[0] = epipe[1] = -1;
+
+  int ret = 0;
+
+  if ((stdin_op == PIPE  && pipe_cloexec(ipipe) == -1) ||
+      (stdout_op == PIPE && pipe_cloexec(opipe) == -1) ||
+      (stderr_op == PIPE && pipe_cloexec(epipe) == -1)) {
+    ret = -errno;
+    errstr << "pipe failed: " << cpp_strerror(errno);
+    goto fail;
+  }
+
+  pid = fork();
+
+  if (pid > 0) { // Parent
+    stdin_pipe_out_fd = ipipe[OUT]; close(ipipe[IN ]);
+    stdout_pipe_in_fd = opipe[IN ]; close(opipe[OUT]);
+    stderr_pipe_in_fd = epipe[IN ]; close(epipe[OUT]);
+    return 0;
+  }
+
+  if (pid == 0) { // Child
+    close(ipipe[OUT]);
+    close(opipe[IN ]);
+    close(epipe[IN ]);
+
+    if (ipipe[IN] >= 0) {  // only true when a stdin pipe was created above
+      if (ipipe[IN] == STDIN_FILENO) {
+        ::fcntl(STDIN_FILENO, F_SETFD, 0); /* clear FD_CLOEXEC */
+      } else {
+        ::dup2(ipipe[IN], STDIN_FILENO);
+        ::close(ipipe[IN]);
+      }
+    }
+    if (opipe[OUT] >= 0) {
+      if (opipe[OUT] == STDOUT_FILENO) {
+        ::fcntl(STDOUT_FILENO, F_SETFD, 0); /* clear FD_CLOEXEC */
+      } else {
+        ::dup2(opipe[OUT], STDOUT_FILENO);
+        ::close(opipe[OUT]);
+        static fd_buf buf(STDOUT_FILENO);
+        std::cout.rdbuf(&buf);  // std::cout now writes directly to fd 1 through fd_buf
+      }
+    }
+    if (epipe[OUT] >= 0) {
+      if (epipe[OUT] == STDERR_FILENO) {
+        ::fcntl(STDERR_FILENO, F_SETFD, 0); /* clear FD_CLOEXEC */
+      } else {
+        ::dup2(epipe[OUT], STDERR_FILENO);
+        ::close(epipe[OUT]);
+        static fd_buf buf(STDERR_FILENO);
+        std::cerr.rdbuf(&buf);  // std::cerr now writes directly to fd 2 through fd_buf
+      }
+    }
+
+    int maxfd = sysconf(_SC_OPEN_MAX);
+    if (maxfd == -1)
+      maxfd = 16384;
+    for (int fd = 0; fd <= maxfd; fd++) {  // close everything except the std fds whose op is not CLOSE
+      if (fd == STDIN_FILENO && stdin_op != CLOSE)
+	continue;
+      if (fd == STDOUT_FILENO && stdout_op != CLOSE)
+	continue;
+      if (fd == STDERR_FILENO && stderr_op != CLOSE)
+	continue;
+      ::close(fd);
+    }
+
+    exec();
+    ceph_abort(); // Never reached
+  }
+
+  ret = -errno;  // only reached when fork() returned -1
+  errstr << "fork failed: " << cpp_strerror(errno);
+
+fail:
+  close(ipipe[0]);  // member close(int&): entries still at -1 are skipped
+  close(ipipe[1]);
+  close(opipe[0]);
+  close(opipe[1]);
+  close(epipe[0]);
+  close(epipe[1]);
+
+  return ret;
+}
+
+void SubProcess::exec() {
+  ceph_assert(is_child());
+
+  // Build argv: command name, each configured argument, terminating NULL.
+  // The pointers borrow from cmd/cmd_args, which stay alive until execvp().
+  std::vector<const char *> argv{cmd.c_str()};
+  for (const std::string &a : cmd_args) {
+    argv.push_back(a.c_str());
+  }
+  argv.push_back(NULL);
+
+  // execvp() only returns on failure, and then always with -1.
+  int rc = execvp(cmd.c_str(), (char * const *)argv.data());
+  ceph_assert(rc == -1);
+
+  std::cerr << cmd << ": exec failed: " << cpp_strerror(errno) << "\n";
+  _exit(EXIT_FAILURE);
+}
+
+int SubProcess::join() {  // close our pipe ends, wait for the child, and map its fate to a shell-style code
+  ceph_assert(is_spawned());
+
+  close(stdin_pipe_out_fd);
+  close(stdout_pipe_in_fd);
+  close(stderr_pipe_in_fd);
+
+  int status;
+
+  while (waitpid(pid, &status, 0) == -1)
+    ceph_assert(errno == EINTR);  // retry on EINTR; any other waitpid() failure is fatal
+
+  pid = -1;  // is_spawned() is false from here on
+
+  if (WIFEXITED(status)) {
+    if (WEXITSTATUS(status) != EXIT_SUCCESS)
+      errstr << cmd << ": exit status: " << WEXITSTATUS(status);
+    return WEXITSTATUS(status);
+  }
+  if (WIFSIGNALED(status)) {
+    errstr << cmd << ": got signal: " << WTERMSIG(status);
+    return 128 + WTERMSIG(status);
+  }
+  errstr << cmd << ": waitpid: unknown status returned\n";
+  return EXIT_FAILURE;
+}
+
+SubProcessTimed::SubProcessTimed(const char *cmd, std_fd_op stdin_op,
+				 std_fd_op stdout_op, std_fd_op stderr_op,
+				 int timeout_, int sigkill_) :
+  SubProcess(cmd, stdin_op, stdout_op, stderr_op),
+  timeout(timeout_),   // handed to alarm() in exec(); <= 0 disables the timeout logic
+  sigkill(sigkill_) {  // signal sent with killpg() when the alarm fires
+}
+
+static volatile sig_atomic_t timedout = 0; // only used after fork; sig_atomic_t because a signal handler writes it
+void timeout_sighandler(int sig) {  // handler installed for SIGALRM by SubProcessTimed::exec()
+  timedout = 1;
+}
+static void dummy_sighandler(int sig) {}  // installed for SIGCHLD so that it has a handler while masked
+
+void SubProcessTimed::exec() {  // runs in the forked child: supervise a grandchild that execs cmd; never returns
+  ceph_assert(is_child());
+
+  if (timeout <= 0) {
+    SubProcess::exec();
+    ceph_abort(); // Never reached
+  }
+
+  sigset_t mask, oldmask;
+  int pid;  // pid of the grandchild; deliberately shadows the member, which is 0 here
+
+  // Restore default action for SIGTERM in case the parent process decided
+  // to ignore it.
+  if (signal(SIGTERM, SIG_DFL) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Because SIGCHLD is ignored by default, setup dummy handler for it,
+  // so we can mask it.
+  if (signal(SIGCHLD, dummy_sighandler) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Setup timeout handler.
+  if (signal(SIGALRM, timeout_sighandler) == SIG_ERR) {
+    std::cerr << cmd << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Block interesting signals.
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGINT);
+  sigaddset(&mask, SIGTERM);
+  sigaddset(&mask, SIGCHLD);
+  sigaddset(&mask, SIGALRM);
+  if (sigprocmask(SIG_SETMASK, &mask, &oldmask) == -1) {
+    std::cerr << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  pid = fork();
+
+  if (pid == -1) {
+    std::cerr << cmd << ": fork failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  if (pid == 0) { // Child
+    // Restore old sigmask.
+    if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) {
+      std::cerr << cmd << ": sigprocmask failed: " << cpp_strerror(errno) << "\n";
+      goto fail_exit;
+    }
+    (void)setpgid(0, 0); // Become process group leader.
+    SubProcess::exec();
+    ceph_abort(); // Never reached
+  }
+
+  // Parent
+  (void)alarm(timeout);
+
+  for (;;) {
+    int signo, err;
+    if ((err = sigwait(&mask, &signo)) != 0) {  // sigwait() returns the error number; it does not set errno
+      std::cerr << cmd << ": sigwait failed: " << cpp_strerror(err) << "\n";
+      goto fail_exit;
+    }
+    switch (signo) {
+    case SIGCHLD:
+      int status, w;
+      if ((w = waitpid(pid, &status, WNOHANG)) == -1) {
+	std::cerr << cmd << ": waitpid failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      if (w == 0) continue;  // child has not terminated (e.g. it was stopped): status is not set
+      if (WIFEXITED(status))
+	_exit(WEXITSTATUS(status));
+      if (WIFSIGNALED(status))
+	_exit(128 + WTERMSIG(status));
+      std::cerr << cmd << ": unknown status returned\n";
+      goto fail_exit;
+    case SIGINT:
+    case SIGTERM:
+      // Pass SIGINT and SIGTERM, which are usually used to terminate
+      // a process, to the child.
+      if (::kill(pid, signo) == -1) {
+	std::cerr << cmd << ": kill failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      continue;
+    case SIGALRM:
+      std::cerr << cmd << ": timed out (" << timeout << " sec)\n";
+      if (::killpg(pid, sigkill) == -1) {
+	std::cerr << cmd << ": kill failed: " << cpp_strerror(errno) << "\n";
+	goto fail_exit;
+      }
+      continue;
+    default:
+      std::cerr << cmd << ": sigwait: invalid signal: " << signo << "\n";
+      goto fail_exit;
+    }
+  }
+
+fail_exit:
+  _exit(EXIT_FAILURE);
+}
diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h
new file mode 100644
index 00000000..763822af
--- /dev/null
+++ b/src/common/SubProcess.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph distributed storage system
+ *
+ * Copyright (C) 2015 Mirantis Inc
+ *
+ * Author: Mykola Golub <mgolub@mirantis.com>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef SUB_PROCESS_H
+#define SUB_PROCESS_H
+
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#include <signal.h>
+#endif
+
+#include <sys/wait.h>
+#include <sstream>
+#include <vector>
+
+/**
+ * SubProcess:
+ * A helper class to spawn a subprocess.
+ *
+ * Example:
+ *
+ *   SubProcess cat("cat", SubProcess::PIPE, SubProcess::PIPE);
+ *   if (cat.spawn() != 0) {
+ *     std::cerr << "cat failed: " << cat.err() << std::endl;
+ *     return false;
+ *   }
+ *   write_to_fd(cat.get_stdin(), "hello world!\n");
+ *   cat.close_stdin();
+ *   read_from_fd(cat.get_stdout(), buf);
+ *   if (cat.join() != 0) {
+ *     std::cerr << cat.err() << std::endl;
+ *     return false;
+ *   }
+ */
+
+class SubProcess {
+public:
+  enum std_fd_op{  // what spawn() does with each of the child's standard descriptors
+    KEEP,
+    CLOSE,
+    PIPE
+  };
+public:
+  SubProcess(const char *cmd,
+             std_fd_op stdin_op = CLOSE,
+             std_fd_op stdout_op = CLOSE,
+             std_fd_op stderr_op = CLOSE);
+  virtual ~SubProcess();
+
+  void add_cmd_args(const char *arg, ...);  // variadic list of const char*, terminated by NULL
+  void add_cmd_arg(const char *arg);
+
+  virtual int spawn(); // Returns 0 on success or -errno on failure.
+  virtual int join();  // Returns exit code (0 on success).
+
+  bool is_spawned() const { return pid > 0; }
+
+  int get_stdin() const;   // these three assert is_spawned() and that the matching op is PIPE
+  int get_stdout() const;
+  int get_stderr() const;
+
+  void close_stdin();
+  void close_stdout();
+  void close_stderr();
+
+  void kill(int signo = SIGTERM) const;
+
+  const std::string err() const;  // error text collected by spawn()/join()
+
+protected:
+  bool is_child() const { return pid == 0; }  // pid is 0 only in the process fork() created
+  virtual void exec();
+
+private:
+  void close(int &fd);  // closes fd unless it is -1, then sets it to -1
+
+protected:
+  std::string cmd;
+  std::vector<std::string> cmd_args;
+  std_fd_op stdin_op;
+  std_fd_op stdout_op;
+  std_fd_op stderr_op;
+  int stdin_pipe_out_fd;  // parent-side pipe ends; -1 when not open
+  int stdout_pipe_in_fd;
+  int stderr_pipe_in_fd;
+  int pid;
+  std::ostringstream errstr;
+};
+
+class SubProcessTimed : public SubProcess {  // SubProcess whose exec() can enforce a timeout on the command
+public:
+  SubProcessTimed(const char *cmd, std_fd_op stdin_op = CLOSE,
+		  std_fd_op stdout_op = CLOSE, std_fd_op stderr_op = CLOSE,
+		  int timeout = 0, int sigkill = SIGKILL);
+
+protected:
+  void exec() override;
+
+private:
+  int timeout;  // passed to alarm(), so presumably seconds; exec() skips the timeout logic when <= 0
+  int sigkill;  // signal delivered with killpg() once the timeout expires
+};
+
+void timeout_sighandler(int sig);
+
+#endif
diff --git a/src/common/TextTable.cc b/src/common/TextTable.cc
new file mode 100644
index 00000000..c94719a9
--- /dev/null
+++ b/src/common/TextTable.cc
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "TextTable.h"
+
+using namespace std;
+
+void TextTable::define_column(const string &heading,
+			      enum TextTable::Align hd_align,
+			      enum TextTable::Align col_align)
+{
+  // A column starts out exactly as wide as its heading.
+  col.emplace_back(heading, heading.length(), hd_align, col_align);
+}
+
+void TextTable::clear() {
+  // Column definitions survive; everything else returns to its initial state.
+  row.clear();
+  curcol = 0;
+  currow = 0;
+  indent = 0;
+  for (auto &c : col)
+    c.width = c.heading.size();  // back to the heading width
+}
+
+/**
+ * Pad s with space to appropriate alignment
+ *
+ * @param s string to pad
+ * @param width width of field to contain padded string
+ * @param align desired alignment (LEFT, CENTER, RIGHT)
+ *
+ * @return padded string; s unchanged when it is already at least width long
+ */
+static string
+pad(string s, int width, TextTable::Align align)
+{
+  // Clamp at zero: a negative count given to string(n, ' ') becomes a huge size_t and throws.
+  const int len = s.length();
+  const int slack = width > len ? width - len : 0;
+  int lpad = 0, rpad = 0;
+  switch (align) {
+    case TextTable::LEFT:
+      rpad = slack;
+      break;
+    case TextTable::CENTER:
+      lpad = slack ? width / 2 - len / 2 : 0;
+      rpad = slack - lpad;
+      break;
+    case TextTable::RIGHT:
+      lpad = slack;
+      break;
+  }
+  return string(lpad, ' ') + s + string(rpad, ' ');
+}
+
+std::ostream &operator<<(std::ostream &out, const TextTable &t)
+{
+  // The indent is written in front of every cell, not just once per line.
+  const string margin(t.indent, ' ');
+
+  // Heading line.
+  for (const auto &c : t.col) {
+    out << margin << pad(c.heading, c.width, c.hd_align) << ' ';
+  }
+  out << endl;
+
+  // Data lines; cell j of a row uses the width/alignment of column j.
+  for (const auto &cells : t.row) {
+    for (unsigned int j = 0; j < cells.size(); j++) {
+      const auto &c = t.col[j];
+      out << margin << pad(cells[j], c.width, c.col_align) << ' ';
+    }
+    out << endl;
+  }
+  return out;
+}
diff --git a/src/common/TextTable.h b/src/common/TextTable.h
new file mode 100644
index 00000000..2d6ad3cc
--- /dev/null
+++ b/src/common/TextTable.h
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef TEXT_TABLE_H_
+#define TEXT_TABLE_H_
+
+#include <vector>
+#include <sstream>
+#include "include/ceph_assert.h"
+
+/**
+ * TextTable:
+ * Manage tabular output of data.  Caller defines heading of each column
+ * and alignment of heading and column data,
+ * then inserts rows of data including tuples of
+ * length (ncolumns) terminated by TextTable::endrow.  When all rows
+ * are inserted, caller asks for output with ostream <<
+ * which sizes/pads/dumps the table to ostream.
+ *
+ * Columns autosize to largest heading or datum.  One space is printed
+ * between columns.
+ */
+
+class TextTable {
+
+public:
+  enum Align {LEFT = 1, CENTER, RIGHT};
+
+private:
+  struct TextTableColumn {
+    std::string heading;
+    int width;          // current rendered width: heading length, grown by operator<< as data arrives
+    Align hd_align;
+    Align col_align;
+
+    TextTableColumn() {}  // note: leaves width and both alignments uninitialized
+    TextTableColumn(const std::string &h, int w, Align ha, Align ca) :
+		    heading(h), width(w), hd_align(ha), col_align(ca) { }
+    ~TextTableColumn() {}
+  };
+
+  std::vector<TextTableColumn> col;	// column definitions
+  unsigned int curcol, currow;		// col, row being inserted into
+  unsigned int indent;			// indent width when rendering
+
+protected:
+  std::vector<std::vector<std::string> > row;	// row data array
+
+public:
+  TextTable(): curcol(0), currow(0), indent(0) {}
+  ~TextTable() {}
+
+  /**
+   * Define a column in the table.
+   *
+   * @param heading Column heading string (or "")
+   * @param hd_align Alignment for heading in column
+   * @param col_align Data alignment
+   *
+   * @note alignment is of type TextTable::Align; values are
+   * TextTable::LEFT, TextTable::CENTER, or TextTable::RIGHT
+   *
+   */
+  void define_column(const std::string& heading, Align hd_align,
+		     Align col_align);
+
+  /**
+   * Set indent for table.  Only affects table output.
+   *
+   * @param i Number of spaces to indent
+   */
+  void set_indent(int i) { indent = i; }
+
+  /**
+   * Add item to table, perhaps on new row.
+   * table << val1 << val2 << TextTable::endrow;
+   *
+   * @param item value to output; rendered with operator<< into a string.
+   *
+   * @note: Numerics are output in decimal; strings are not truncated.
+   * Output formatting choice is limited to alignment in define_column().
+   *
+   * @return TextTable& for chaining.
+   */
+
+  template<typename T> TextTable& operator<<(const T& item)
+  {
+    if (row.size() < currow + 1)
+      row.resize(currow + 1);
+
+    /**
+     * col.size() is a good guess for how big row[currow] needs to be,
+     * so just expand it out now
+     */
+    if (row[currow].size() < col.size()) {
+      row[currow].resize(col.size());
+    }
+
+    // inserting more items than defined columns is a coding error
+    ceph_assert(curcol + 1 <= col.size());
+
+    // get rendered width of item alone
+    std::ostringstream oss;
+    oss << item;
+    int width = oss.str().length();
+    oss.seekp(0);  // moves the put position only; str() below still returns the full text
+
+    // expand column width if necessary
+    if (width > col[curcol].width) {
+      col[curcol].width = width;
+    }
+
+    // store the rendered item as is; padding to the column width happens at output time
+    row[currow][curcol] = oss.str();
+
+    curcol++;
+    return *this;
+  }
+
+  /**
+   * Degenerate type/variable here is just to allow selection of the
+   * following operator<< for "<< TextTable::endrow"
+   */
+
+  struct endrow_t {};
+  static constexpr endrow_t endrow{};
+
+  /**
+   * Implements TextTable::endrow
+   */
+
+  TextTable &operator<<(endrow_t)
+  {
+    curcol = 0;
+    currow++;
+    return *this;
+  }
+
+  /**
+   * Render table to ostream (i.e. cout << table)
+   */
+
+  friend std::ostream &operator<<(std::ostream &out, const TextTable &t);
+
+  /**
+   * clear: Reset everything in a TextTable except column defs:
+   * drop all rows, resize cols to heading widths, clear indent
+   */
+
+  void clear();
+};
+
+#endif
+
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
new file mode 100644
index 00000000..edbd7d34
--- /dev/null
+++ b/src/common/Thread.cc
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <signal.h>
+#include <unistd.h>
+#ifdef __linux__
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#endif
+
+#include "common/Thread.h"
+#include "common/code_environment.h"
+#include "common/debug.h"
+#include "common/signal.h"
+
+#ifdef HAVE_SCHED
+#include <sched.h>
+#endif
+
+
+pid_t ceph_gettid(void)  // kernel thread id of the caller on Linux; -ENOSYS elsewhere
+{
+#ifdef __linux__
+  return syscall(SYS_gettid);
+#else
+  return -ENOSYS;
+#endif
+}
+
+static int _set_affinity(int id)  // pin the calling thread to cpu `id`; returns 0 or -errno
+{
+#ifdef HAVE_SCHED
+  if (id >= 0 && id < CPU_SETSIZE) {  // ids outside this range are ignored and 0 is returned
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+
+    CPU_SET(id, &cpuset);
+
+    if (sched_setaffinity(0, sizeof(cpuset), &cpuset) < 0)
+      return -errno;
+    /* guaranteed to take effect immediately */
+    sched_yield();
+  }
+#endif
+  return 0;  // also the result when HAVE_SCHED is not defined
+}
+
+Thread::Thread()
+  : thread_id(0),     // 0 means "not started" (see is_started())
+    pid(0),           // filled in by entry_wrapper() once the thread runs
+    cpuid(-1),        // -1: no affinity requested
+    thread_name(NULL) // set by create()
+{
+}
+
+Thread::~Thread()
+{
+}
+
+void *Thread::_entry_func(void *arg) {
+  // pthread start routine: arg is the Thread passed to pthread_create().
+  return static_cast<Thread*>(arg)->entry_wrapper();
+}
+
+void *Thread::entry_wrapper()  // runs first in the new thread: record tid, apply affinity and name, then entry()
+{
+  int p = ceph_gettid(); // may return -ENOSYS on other platforms
+  if (p > 0)
+    pid = p;
+  if (pid && cpuid >= 0)
+    _set_affinity(cpuid);  // affinity requested via set_affinity() before the thread ran
+  if (thread_name)  // still NULL when started through try_create() without create()
+    ceph_pthread_setname(pthread_self(), thread_name);
+  return entry();
+}
+
+const pthread_t &Thread::get_thread_id() const  // reference to the stored pthread id (0 when not started)
+{
+  return thread_id;
+}
+
+bool Thread::is_started() const  // true between a successful create and join(), which resets thread_id to 0
+{
+  return thread_id != 0;
+}
+
+bool Thread::am_self() const  // is the caller the thread this object represents?
+{
+  return (pthread_self() == thread_id);  // NOTE(review): compares pthread_t with == instead of pthread_equal()
+}
+
+int Thread::kill(int signal)  // deliver `signal` to the thread with pthread_kill()
+{
+  if (thread_id)
+    return pthread_kill(thread_id, signal);  // pthread_kill()'s own return value
+  else
+    return -EINVAL;  // never started (or already joined)
+}
+
+int Thread::try_create(size_t stacksize)  // start the thread; returns pthread_create()'s result
+{
+  pthread_attr_t *thread_attr = NULL;  // stays NULL (default attributes) unless a stack size is requested
+  pthread_attr_t thread_attr_loc;
+  
+  stacksize &= CEPH_PAGE_MASK;  // must be multiple of page
+  if (stacksize) {
+    thread_attr = &thread_attr_loc;
+    pthread_attr_init(thread_attr);
+    pthread_attr_setstacksize(thread_attr, stacksize);
+  }
+
+  int r;
+
+  // The child thread will inherit our signal mask.  Set our signal mask to
+  // the set of signals we want to block.  (It's ok to block more signals
+  // than usual for a little while -- they will just be delivered to
+  // another thread or delivered to this thread later.)
+  sigset_t old_sigset;
+  if (g_code_env == CODE_ENVIRONMENT_LIBRARY) {
+    block_signals(NULL, &old_sigset);
+  }
+  else {
+    int to_block[] = { SIGPIPE , 0 };
+    block_signals(to_block, &old_sigset);
+  }
+  r = pthread_create(&thread_id, thread_attr, _entry_func, (void*)this);
+  restore_sigset(&old_sigset);  // the creating thread gets its original mask back
+
+  if (thread_attr) {
+    pthread_attr_destroy(thread_attr);	
+  }
+
+  return r;
+}
+
+void Thread::create(const char *name, size_t stacksize)  // name the thread and start it; failure to start is fatal
+{
+  ceph_assert(strlen(name) < 16);
+  thread_name = name;  // pointer is stored, not copied: name must outlive the thread's start-up
+
+  int ret = try_create(stacksize);
+  if (ret != 0) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "Thread::try_create(): pthread_create "
+	     "failed with error %d", ret);
+    dout_emergency(buf);
+    ceph_assert(ret == 0);  // always fails here: report through the assert machinery after logging
+  }
+}
+
+int Thread::join(void **prval)  // wait for the thread; its return value goes to *prval via pthread_join()
+{
+  if (thread_id == 0) {
+    ceph_abort_msg("join on thread that was never started");
+    return -EINVAL;  // presumably unreachable, assuming ceph_abort_msg() does not return
+  }
+
+  int status = pthread_join(thread_id, prval);
+  if (status != 0) {
+    char buf[256];
+    snprintf(buf, sizeof(buf), "Thread::join(): pthread_join "
+             "failed with error %d\n", status);
+    dout_emergency(buf);
+    ceph_assert(status == 0);
+  }
+
+  thread_id = 0;  // is_started() is false again
+  return status;
+}
+
+int Thread::detach()  // returns pthread_detach()'s result; thread_id is left unchanged
+{
+  return pthread_detach(thread_id);
+}
+
+int Thread::set_affinity(int id)  // remember the cpu; apply it now only when called from the thread itself
+{
+  int r = 0;
+  cpuid = id;  // entry_wrapper() applies a stored cpuid >= 0 when the thread starts
+  if (pid && ceph_gettid() == pid)
+    r = _set_affinity(id);
+  return r;
+}
+
+// Functions for std::thread
+// =========================
+
+void set_thread_name(std::thread& t, const std::string& s) {  // name t's native thread; throws std::system_error on failure
+  int r = ceph_pthread_setname(t.native_handle(), s.c_str());
+  if (r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+}
+std::string get_thread_name(const std::thread& t) {  // read t's name back; throws std::system_error on failure
+  std::string s(256, '\0');  // scratch buffer for the name, zero-filled
+
+  int r = ceph_pthread_getname(const_cast<std::thread&>(t).native_handle(),
+			       s.data(), s.length());
+  if (r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+  s.resize(std::strlen(s.data()));  // trim to the first NUL
+  return s;
+}
+
+void kill(std::thread& t, int signal)  // pthread_kill() on t's native handle; throws std::system_error on failure
+{
+  auto r = pthread_kill(t.native_handle(), signal);
+  if (r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+}
diff --git a/src/common/Thread.h b/src/common/Thread.h
new file mode 100644
index 00000000..0ab65fca
--- /dev/null
+++ b/src/common/Thread.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_THREAD_H
+#define CEPH_THREAD_H
+
+#include <functional>
+#include <string_view>
+#include <system_error>
+#include <thread>
+
+#include <pthread.h>
+#include <sys/types.h>
+
+#include "include/compat.h"
+
+extern pid_t ceph_gettid();
+
+// Base class for a pthread-backed thread. Subclasses supply the thread
+// body by overriding entry(). Instances are neither copyable nor
+// copy-assignable.
+class Thread {
+ private:
+  // Handle used for the pthread_* calls; join() treats 0 as "never started".
+  pthread_t thread_id;
+  // Compared against ceph_gettid() by set_affinity().
+  pid_t pid;
+  // CPU id most recently passed to set_affinity().
+  int cpuid;
+  // Pointer handed to create(); stored as-is, not copied.
+  const char *thread_name;
+
+  void *entry_wrapper();
+
+ public:
+  Thread(const Thread&) = delete;
+  Thread& operator=(const Thread&) = delete;
+
+  Thread();
+  virtual ~Thread();
+
+ protected:
+  // Thread body, implemented by the subclass.
+  virtual void *entry() = 0;
+
+ private:
+  static void *_entry_func(void *arg);
+
+ public:
+  const pthread_t &get_thread_id() const;
+  pid_t get_pid() const { return pid; }
+  bool is_started() const;
+  bool am_self() const;
+  int kill(int signal);
+  // Returns an error code instead of asserting; create() builds on it.
+  int try_create(size_t stacksize);
+  // Asserts that name is shorter than 16 chars and that the start succeeds.
+  void create(const char *name, size_t stacksize = 0);
+  // Aborts when the thread was never started; asserts on pthread_join failure.
+  int join(void **prval = 0);
+  int detach();
+  // Records cpuid; applied immediately only when called from the thread itself.
+  int set_affinity(int cpuid);
+};
+
+// Functions for use with std::thread
+
+void set_thread_name(std::thread& t, const std::string& s);
+std::string get_thread_name(const std::thread& t);
+void kill(std::thread& t, int signal);
+
+// Spawn a std::thread that first names itself `n` and then invokes `fun`
+// with `args`.
+//
+// The name is copied into the closure as a std::string, so the
+// string_view only needs to stay valid for the duration of this call.
+// The result of ceph_pthread_setname() is not checked here.
+template<typename Fun, typename... Args>
+std::thread make_named_thread(std::string_view n,
+			      Fun&& fun,
+			      Args&& ...args) {
+
+  return std::thread([n = std::string(n)](auto&& fun, auto&& ...args) {
+		       // runs on the new thread, so pthread_self() is the thread being named
+		       ceph_pthread_setname(pthread_self(), n.data());
+		       std::invoke(std::forward<Fun>(fun),
+				   std::forward<Args>(args)...);
+		     }, std::forward<Fun>(fun), std::forward<Args>(args)...);
+}
+#endif
diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc
new file mode 100644
index 00000000..40a8e6ed
--- /dev/null
+++ b/src/common/Throttle.cc
@@ -0,0 +1,843 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/scope_guard.h"
+
+#include "common/Throttle.h"
+#include "common/ceph_time.h"
+#include "common/perf_counters.h"
+
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_throttle
+
+#undef dout_prefix
+#define dout_prefix *_dout << "throttle(" << name << " " << (void*)this << ") "
+
+using ceph::mono_clock;
+using ceph::mono_time;
+
+// Perf counter indices used by Throttle. l_throttle_first and
+// l_throttle_last are the bounds handed to PerfCountersBuilder; the
+// entries in between are the individual counters registered in the
+// Throttle constructor.
+enum {
+  l_throttle_first = 532430,
+  l_throttle_val,
+  l_throttle_max,
+  l_throttle_get_started,
+  l_throttle_get,
+  l_throttle_get_sum,
+  l_throttle_get_or_fail_fail,
+  l_throttle_get_or_fail_success,
+  l_throttle_take,
+  l_throttle_take_sum,
+  l_throttle_put,
+  l_throttle_put_sum,
+  l_throttle_wait,
+  l_throttle_last,
+};
+
+// Build a throttle called `n` with an initial limit of `m` (asserted to
+// be non-negative).
+//
+// Perf counters are created and registered only when both `_use_perf`
+// and the throttler_perf_counter config value are set; otherwise
+// `logger` stays unset and the `if (logger)` blocks elsewhere are skipped.
+Throttle::Throttle(CephContext *cct, const std::string& n, int64_t m,
+		   bool _use_perf)
+  : cct(cct), name(n), max(m),
+    use_perf(_use_perf)
+{
+  ceph_assert(m >= 0);
+
+  if (!use_perf || !cct->_conf->throttler_perf_counter)
+    return;
+
+  PerfCountersBuilder builder(cct, string("throttle-") + name,
+			      l_throttle_first, l_throttle_last);
+  builder.add_u64(l_throttle_val, "val", "Currently available throttle");
+  builder.add_u64(l_throttle_max, "max", "Max value for throttle");
+  builder.add_u64_counter(l_throttle_get_started, "get_started",
+			  "Number of get calls, increased before wait");
+  builder.add_u64_counter(l_throttle_get, "get", "Gets");
+  builder.add_u64_counter(l_throttle_get_sum, "get_sum", "Got data");
+  builder.add_u64_counter(l_throttle_get_or_fail_fail, "get_or_fail_fail",
+			  "Get blocked during get_or_fail");
+  builder.add_u64_counter(l_throttle_get_or_fail_success,
+			  "get_or_fail_success",
+			  "Successful get during get_or_fail");
+  builder.add_u64_counter(l_throttle_take, "take", "Takes");
+  builder.add_u64_counter(l_throttle_take_sum, "take_sum", "Taken data");
+  builder.add_u64_counter(l_throttle_put, "put", "Puts");
+  builder.add_u64_counter(l_throttle_put_sum, "put_sum", "Put data");
+  builder.add_time_avg(l_throttle_wait, "wait", "Waiting latency");
+
+  logger = { builder.create_perf_counters(), cct };
+  cct->get_perfcounters_collection()->add(logger.get());
+  logger->set(l_throttle_max, max);
+}
+
+// Destroying a throttle while callers are still queued in `conds` is a
+// bug; this is asserted under the lock.
+Throttle::~Throttle()
+{
+  std::lock_guard guard(lock);
+  ceph_assert(conds.empty());
+}
+
+// Install a new limit `m`. The caller must already hold `lock`.
+//
+// Nothing happens when the limit is unchanged. Otherwise the first
+// queued waiter (if any) is notified so it re-checks against the new
+// limit, and the "max" perf gauge is updated when counters are enabled.
+void Throttle::_reset_max(int64_t m)
+{
+  if (m == max) {
+    return;
+  }
+
+  if (!conds.empty()) {
+    conds.front().notify_one();
+  }
+  if (logger) {
+    logger->set(l_throttle_max, m);
+  }
+  max = m;
+}
+
+// Block until `c` slots may be taken and this caller is at the head of
+// the waiter queue. `l` must own `lock` on entry and owns it again on
+// return.
+//
+// Returns true if the caller had to queue, false if it could proceed
+// immediately. This function does not modify `count`; the caller does.
+bool Throttle::_wait(int64_t c, std::unique_lock<std::mutex>& l)
+{
+  mono_time start;
+  bool waited = false;
+  if (_should_wait(c) || !conds.empty()) { // always wait behind other waiters.
+    {
+      // Append our own condition variable; the guard removes it again
+      // when this inner scope is left.
+      auto cv = conds.emplace(conds.end());
+      auto w = make_scope_guard([this, cv]() {
+	  conds.erase(cv);
+	});
+      waited = true;
+      ldout(cct, 2) << "_wait waiting..." << dendl;
+      if (logger)
+	start = mono_clock::now();
+
+      // Proceed only when there is room AND we are first in line.
+      cv->wait(l, [this, c, cv]() { return (!_should_wait(c) &&
+					    cv == conds.begin()); });
+      ldout(cct, 2) << "_wait finished waiting" << dendl;
+      if (logger) {
+	logger->tinc(l_throttle_wait, mono_clock::now() - start);
+      }
+    }
+    // Our entry has been erased by now, so front() is the next waiter.
+    // wake up the next guy
+    if (!conds.empty())
+      conds.front().notify_one();
+  }
+  return waited;
+}
+
+// Optionally install a new limit `m` (ignored when 0, otherwise asserted
+// positive) and then block until the throttle no longer requires
+// waiting for a request of zero slots.
+//
+// Returns false straight away when the throttle is disabled and no new
+// limit is given; otherwise returns whether the call had to block.
+bool Throttle::wait(int64_t m)
+{
+  if (max == 0 && m == 0) {
+    return false;
+  }
+
+  std::unique_lock guard(lock);
+  if (m != 0) {
+    ceph_assert(m > 0);
+    _reset_max(m);
+  }
+  ldout(cct, 10) << "wait" << dendl;
+  const bool blocked = _wait(0, guard);
+  return blocked;
+}
+
+/**
+ * Take `c` slots unconditionally, without waiting and without checking
+ * the limit.
+ *
+ * A disabled throttle (max == 0) is left untouched and 0 is returned.
+ *
+ * @param c number of slots to take; must be >= 0
+ * @returns the number of taken slots right after this call's increment
+ */
+int64_t Throttle::take(int64_t c)
+{
+  if (0 == max) {
+    return 0;
+  }
+  ceph_assert(c >= 0);
+  ldout(cct, 10) << "take " << c << dendl;
+  int64_t taken = 0;
+  {
+    std::lock_guard l(lock);
+    // Remember the post-increment value while the lock is held. Reading
+    // `count` again after unlocking could pick up another thread's
+    // get()/put() and report a total this call never produced.
+    taken = (count += c);
+  }
+  if (logger) {
+    logger->inc(l_throttle_take);
+    logger->inc(l_throttle_take_sum, c);
+    logger->set(l_throttle_val, taken);
+  }
+  return taken;
+}
+
+// Take `c` slots, blocking in _wait() until that is allowed.
+//
+// `m`, when non-zero, is installed as the new limit first (asserted
+// positive). Returns false immediately, taking nothing, when the
+// throttle is disabled and no new limit is supplied. Otherwise returns
+// whether the call had to wait.
+bool Throttle::get(int64_t c, int64_t m)
+{
+  if (0 == max && 0 == m) {
+    return false;
+  }
+
+  ceph_assert(c >= 0);
+  // Logged before the lock is taken, so the printed values are a snapshot.
+  ldout(cct, 10) << "get " << c << " (" << count.load() << " -> " << (count.load() + c) << ")" << dendl;
+  if (logger) {
+    logger->inc(l_throttle_get_started);
+  }
+  bool waited = false;
+  {
+    std::unique_lock l(lock);
+    if (m) {
+      ceph_assert(m > 0);
+      _reset_max(m);
+    }
+    waited = _wait(c, l);
+    // _wait() returns with the lock held, so the increment is done under it.
+    count += c;
+  }
+  if (logger) {
+    logger->inc(l_throttle_get);
+    logger->inc(l_throttle_get_sum, c);
+    logger->set(l_throttle_val, count);
+  }
+  return waited;
+}
+
+/**
+ * Non-blocking variant of get().
+ *
+ * A disabled throttle (max == 0) always succeeds without taking
+ * anything. Otherwise `c` slots are taken only if that does not require
+ * waiting and no other caller is already queued in `conds`.
+ *
+ * @param c number of slots to take; must be >= 0
+ * @returns true if it successfully got the requested amount,
+ * or false if it would block.
+ */
+bool Throttle::get_or_fail(int64_t c)
+{
+  if (0 == max) {
+    return true;
+  }
+
+  // ceph_assert, as in take()/get()/put(), instead of the bare assert macro.
+  ceph_assert(c >= 0);
+  std::lock_guard l(lock);
+  if (_should_wait(c) || !conds.empty()) {
+    ldout(cct, 10) << "get_or_fail " << c << " failed" << dendl;
+    if (logger) {
+      logger->inc(l_throttle_get_or_fail_fail);
+    }
+    return false;
+  }
+
+  ldout(cct, 10) << "get_or_fail " << c << " success (" << count.load()
+		 << " -> " << (count.load() + c) << ")" << dendl;
+  count += c;
+  if (logger) {
+    logger->inc(l_throttle_get_or_fail_success);
+    logger->inc(l_throttle_get);
+    logger->inc(l_throttle_get_sum, c);
+    logger->set(l_throttle_val, count);
+  }
+  return true;
+}
+
+// Return `c` slots to the throttle and wake the first queued waiter.
+//
+// A disabled throttle (max == 0) is left untouched and 0 is returned.
+// Otherwise returns the number of slots still taken. Putting back more
+// than is currently taken trips an assert.
+int64_t Throttle::put(int64_t c)
+{
+  if (0 == max) {
+    return 0;
+  }
+
+  ceph_assert(c >= 0);
+  // Logged before the lock is taken, so the printed values are a snapshot.
+  ldout(cct, 10) << "put " << c << " (" << count.load() << " -> "
+		 << (count.load()-c) << ")" << dendl;
+  std::lock_guard l(lock);
+  if (c) {
+    // The notification happens before the decrement, but the lock is
+    // still held here, so the waiter re-checks only after we unlock.
+    if (!conds.empty())
+      conds.front().notify_one();
+    // if count goes negative, we failed somewhere!
+    ceph_assert(count >= c);
+    count -= c;
+    if (logger) {
+      logger->inc(l_throttle_put);
+      logger->inc(l_throttle_put_sum, c);
+      logger->set(l_throttle_val, count);
+    }
+  }
+  return count;
+}
+
+// Drop the number of taken slots to zero, notifying the first queued
+// waiter (if any) and resetting the "val" perf gauge when counters are
+// enabled.
+void Throttle::reset()
+{
+  std::lock_guard guard(lock);
+
+  if (!conds.empty()) {
+    conds.front().notify_one();
+  }
+
+  count = 0;
+
+  if (logger) {
+    logger->set(l_throttle_val, 0);
+  }
+}
+
+// Perf counter indices used by BackoffThrottle. The range starts right
+// after l_throttle_last; l_backoff_throttle_first and
+// l_backoff_throttle_last are the bounds handed to PerfCountersBuilder.
+enum {
+  l_backoff_throttle_first = l_throttle_last + 1,
+  l_backoff_throttle_val,
+  l_backoff_throttle_max,
+  l_backoff_throttle_get,
+  l_backoff_throttle_get_sum,
+  l_backoff_throttle_take,
+  l_backoff_throttle_take_sum,
+  l_backoff_throttle_put,
+  l_backoff_throttle_put_sum,
+  l_backoff_throttle_wait,
+  l_backoff_throttle_last,
+};
+
+// Build a backoff throttle called `n`. `expected_concurrency` fixes the
+// number of pre-allocated condition variables in `conds`.
+//
+// Perf counters are created and registered only when both `_use_perf`
+// and the throttler_perf_counter config value are set.
+BackoffThrottle::BackoffThrottle(CephContext *cct, const std::string& n,
+				 unsigned expected_concurrency, bool _use_perf)
+  : cct(cct), name(n),
+    conds(expected_concurrency),///< [in] determines size of conds
+    use_perf(_use_perf)
+{
+  if (!use_perf || !cct->_conf->throttler_perf_counter)
+    return;
+
+  PerfCountersBuilder builder(cct, string("throttle-") + name,
+			      l_backoff_throttle_first,
+			      l_backoff_throttle_last);
+  builder.add_u64(l_backoff_throttle_val, "val",
+		  "Currently available throttle");
+  builder.add_u64(l_backoff_throttle_max, "max", "Max value for throttle");
+  builder.add_u64_counter(l_backoff_throttle_get, "get", "Gets");
+  builder.add_u64_counter(l_backoff_throttle_get_sum, "get_sum", "Got data");
+  builder.add_u64_counter(l_backoff_throttle_take, "take", "Takes");
+  builder.add_u64_counter(l_backoff_throttle_take_sum, "take_sum",
+			  "Taken data");
+  builder.add_u64_counter(l_backoff_throttle_put, "put", "Puts");
+  builder.add_u64_counter(l_backoff_throttle_put_sum, "put_sum", "Put data");
+  builder.add_time_avg(l_backoff_throttle_wait, "wait", "Waiting latency");
+
+  logger = { builder.create_perf_counters(), cct };
+  cct->get_perfcounters_collection()->add(logger.get());
+  logger->set(l_backoff_throttle_max, max);
+}
+
+// No caller may still be queued in `waiters` at destruction time; this
+// is asserted under the lock.
+BackoffThrottle::~BackoffThrottle()
+{
+  std::lock_guard guard(lock);
+  ceph_assert(waiters.empty());
+}
+
+// Validate and install a new set of backoff parameters.
+//
+// All checks are run (none short-circuits the others) so that every
+// problem is written to `errstream` when one is supplied. If any check
+// fails, false is returned and the current parameters are left alone.
+// On success the thresholds, per-count delays, slopes (s0, s1) and max
+// are updated under the lock, the first waiter is kicked, and true is
+// returned.
+bool BackoffThrottle::set_params(
+  double _low_threshold,
+  double _high_threshold,
+  double _expected_throughput,
+  double _high_multiple,
+  double _max_multiple,
+  uint64_t _throttle_max,
+  ostream *errstream)
+{
+  bool valid = true;
+  if (_low_threshold > _high_threshold) {
+    valid = false;
+    if (errstream) {
+      *errstream << "low_threshold (" << _low_threshold
+		 << ") > high_threshold (" << _high_threshold
+		 << ")" << std::endl;
+    }
+  }
+
+  if (_high_multiple > _max_multiple) {
+    valid = false;
+    if (errstream) {
+      *errstream << "_high_multiple (" << _high_multiple
+		 << ") > _max_multiple (" << _max_multiple
+		 << ")" << std::endl;
+    }
+  }
+
+  // both thresholds are ratios and must lie in [0, 1]
+  if (_low_threshold > 1 || _low_threshold < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid low_threshold (" << _low_threshold << ")"
+		 << std::endl;
+    }
+  }
+
+  if (_high_threshold > 1 || _high_threshold < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid high_threshold (" << _high_threshold << ")"
+		 << std::endl;
+    }
+  }
+
+  if (_max_multiple < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid _max_multiple ("
+		 << _max_multiple << ")"
+		 << std::endl;
+    }
+  }
+
+  if (_high_multiple < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid _high_multiple ("
+		 << _high_multiple << ")"
+		 << std::endl;
+    }
+  }
+
+  // NOTE(review): only negative values are rejected here, yet the value
+  // is used as a divisor below -- confirm callers never pass 0.
+  if (_expected_throughput < 0) {
+    valid = false;
+    if (errstream) {
+      *errstream << "invalid _expected_throughput("
+		 << _expected_throughput << ")"
+		 << std::endl;
+    }
+  }
+
+  if (!valid)
+    return false;
+
+  locker l(lock);
+  low_threshold = _low_threshold;
+  high_threshold = _high_threshold;
+  high_delay_per_count = _high_multiple / _expected_throughput;
+  max_delay_per_count = _max_multiple / _expected_throughput;
+  max = _throttle_max;
+
+  if (logger)
+    logger->set(l_backoff_throttle_max, max);
+
+  // s0: slope of the delay line between low_threshold and high_threshold.
+  if (high_threshold - low_threshold > 0) {
+    s0 = high_delay_per_count / (high_threshold - low_threshold);
+  } else {
+    low_threshold = high_threshold;
+    s0 = 0;
+  }
+
+  // s1: slope of the delay line between high_threshold and 1.
+  if (1 - high_threshold > 0) {
+    s1 = (max_delay_per_count - high_delay_per_count)
+      / (1 - high_threshold);
+  } else {
+    high_threshold = 1;
+    s1 = 0;
+  }
+
+  // the new parameters may let the head waiter proceed
+  _kick_waiters();
+  return true;
+}
+
+// Compute the delay to impose on a request for `c` units at the current
+// fill ratio (current / max).
+//
+// No delay when max is 0 or the ratio is below low_threshold. Between
+// the thresholds the per-unit delay grows with slope s0; from
+// high_threshold on it is high_delay_per_count plus slope s1 times the
+// excess. The per-unit delay is multiplied by `c`.
+ceph::timespan BackoffThrottle::_get_delay(uint64_t c) const
+{
+  if (max == 0) {
+    return ceph::timespan(0);
+  }
+
+  const double r = ((double)current) / ((double)max);
+  if (r < low_threshold) {
+    return ceph::timespan(0);
+  }
+
+  if (r < high_threshold) {
+    return c * ceph::make_timespan((r - low_threshold) * s0);
+  }
+
+  return c * ceph::make_timespan(
+    high_delay_per_count + ((r - high_threshold) * s1));
+}
+
+// Acquire `c` units, sleeping as dictated by _get_delay() and by `max`.
+//
+// Fast path: with no computed delay, no queued waiters and room for `c`,
+// the units are taken immediately and a zero timespan is returned.
+// Otherwise the caller queues in `waiters`, waits to reach the head,
+// then waits until there is room and the delay has elapsed. The return
+// value is the time elapsed since the caller reached the head of the
+// queue.
+ceph::timespan BackoffThrottle::get(uint64_t c)
+{
+  locker l(lock);
+  auto delay = _get_delay(c);
+
+  if (logger) {
+    logger->inc(l_backoff_throttle_get);
+    logger->inc(l_backoff_throttle_get_sum, c);
+  }
+
+  // fast path
+  if (delay.count() == 0 &&
+      waiters.empty() &&
+      ((max == 0) || (current == 0) || ((current + c) <= max))) {
+    current += c;
+
+    if (logger) {
+      logger->set(l_backoff_throttle_val, current);
+    }
+
+    return ceph::make_timespan(0);
+  }
+
+  auto ticket = _push_waiter();
+  auto wait_from = mono_clock::now();
+  bool waited = false;
+
+  // phase 1: wait until our ticket is at the head of the queue
+  while (waiters.begin() != ticket) {
+    (*ticket)->wait(l);
+    waited = true;
+  }
+
+  // phase 2: we are the head; wait for room and for the delay to pass
+  auto start = mono_clock::now();
+  delay = _get_delay(c);
+  while (true) {
+    if (max != 0 && current != 0 && (current + c) > max) {
+      // over the limit: wait until kicked
+      (*ticket)->wait(l);
+      waited = true;
+    } else if (delay.count() > 0) {
+      (*ticket)->wait_for(l, delay);
+      waited = true;
+    } else {
+      break;
+    }
+    ceph_assert(ticket == waiters.begin());
+    // recompute the delay and subtract the time already spent since `start`
+    delay = _get_delay(c);
+    auto elapsed = mono_clock::now() - start;
+    if (delay <= elapsed) {
+      delay = timespan::zero();
+    } else {
+      delay -= elapsed;
+    }
+  }
+  // leave the queue and let the next waiter become the head
+  waiters.pop_front();
+  _kick_waiters();
+
+  current += c;
+
+  if (logger) {
+    logger->set(l_backoff_throttle_val, current);
+    if (waited) {
+      logger->tinc(l_backoff_throttle_wait, mono_clock::now() - wait_from);
+    }
+  }
+
+  return mono_clock::now() - start;
+}
+
+// Give back `c` units (asserted not to exceed what is currently held),
+// kick the head waiter and return the amount still held.
+uint64_t BackoffThrottle::put(uint64_t c)
+{
+  locker guard(lock);
+
+  ceph_assert(current >= c);
+  current -= c;
+  _kick_waiters();
+
+  if (logger) {
+    logger->inc(l_backoff_throttle_put);
+    logger->inc(l_backoff_throttle_put_sum, c);
+    logger->set(l_backoff_throttle_val, current);
+  }
+
+  const uint64_t held = current;
+  return held;
+}
+
+// Add `c` units unconditionally, with no waiting and no limit check,
+// and return the new amount held.
+uint64_t BackoffThrottle::take(uint64_t c)
+{
+  locker guard(lock);
+
+  current += c;
+
+  if (logger) {
+    logger->inc(l_backoff_throttle_take);
+    logger->inc(l_backoff_throttle_take_sum, c);
+    logger->set(l_backoff_throttle_val, current);
+  }
+
+  const uint64_t held = current;
+  return held;
+}
+
+// Read `current` under the lock.
+uint64_t BackoffThrottle::get_current()
+{
+  locker guard(lock);
+  const uint64_t snapshot = current;
+  return snapshot;
+}
+
+// Read `max` under the lock.
+uint64_t BackoffThrottle::get_max()
+{
+  locker guard(lock);
+  const uint64_t snapshot = max;
+  return snapshot;
+}
+
+// `max` bounds the number of concurrently started ops; with
+// `ignore_enoent` set, end_op() does not record -ENOENT as an error.
+SimpleThrottle::SimpleThrottle(uint64_t max, bool ignore_enoent)
+  : m_max(max), m_ignore_enoent(ignore_enoent) {}
+
+// All ops must have ended and nobody may still be blocked in start_op()
+// or wait_for_ret() when the throttle is destroyed; both are asserted.
+SimpleThrottle::~SimpleThrottle()
+{
+  std::lock_guard guard(m_lock);
+  ceph_assert(m_current == 0);
+  ceph_assert(waiters == 0);
+}
+
+// Block while the number of running ops equals the limit, then account
+// for one more. `waiters` counts callers currently blocked here.
+void SimpleThrottle::start_op()
+{
+  std::unique_lock guard(m_lock);
+
+  ++waiters;
+  while (m_max == m_current) {
+    m_cond.wait(guard);
+  }
+  --waiters;
+
+  ++m_current;
+}
+
+// Account for one finished op and wake everyone waiting on the throttle.
+//
+// The first negative `r` is kept in m_ret; later errors do not replace
+// it. -ENOENT is skipped when the throttle was built with ignore_enoent.
+void SimpleThrottle::end_op(int r)
+{
+  std::lock_guard guard(m_lock);
+  --m_current;
+
+  const bool ignored_enoent = (r == -ENOENT) && m_ignore_enoent;
+  if (r < 0 && m_ret == 0 && !ignored_enoent) {
+    m_ret = r;
+  }
+
+  m_cond.notify_all();
+}
+
+// True once end_op() has recorded a negative result.
+bool SimpleThrottle::pending_error() const
+{
+  std::lock_guard guard(m_lock);
+  const bool failed = (m_ret < 0);
+  return failed;
+}
+
+// Block until no op is running any more, then return the recorded
+// result (0, or the first error stored by end_op()).
+int SimpleThrottle::wait_for_ret()
+{
+  std::unique_lock guard(m_lock);
+
+  ++waiters;
+  while (m_current != 0) {
+    m_cond.wait(guard);
+  }
+  --waiters;
+
+  return m_ret;
+}
+
+// Completion hook: forward the result for this context's tid to the
+// owning OrderedThrottle.
+void C_OrderedThrottle::finish(int r) {
+  m_ordered_throttle->finish_op(m_tid, r);
+}
+
+// `max` bounds the number of concurrently started ops; with
+// `ignore_enoent` set, end_op() does not record -ENOENT as an error.
+OrderedThrottle::OrderedThrottle(uint64_t max, bool ignore_enoent)
+  : m_max(max), m_ignore_enoent(ignore_enoent) {}
+
+// Nobody may still be blocked in start_op() or wait_for_ret() when the
+// throttle is destroyed; this is asserted under the lock.
+OrderedThrottle::~OrderedThrottle() {
+  std::lock_guard guard(m_lock);
+  ceph_assert(waiters == 0);
+}
+
+// Register a new op and block until a slot is free.
+//
+// A fresh tid is allocated and `on_finish` (asserted non-null) is
+// stored under it. While waiting for a slot, callbacks of finished ops
+// are completed in tid order via complete_pending_ops(). Returns a
+// heap-allocated context bound to the new tid; ownership passes to the
+// caller (release() is called on the unique_ptr).
+C_OrderedThrottle *OrderedThrottle::start_op(Context *on_finish) {
+  ceph_assert(on_finish);
+
+  std::unique_lock l(m_lock);
+  uint64_t tid = m_next_tid++;
+  m_tid_result[tid] = Result(on_finish);
+  auto ctx = std::make_unique<C_OrderedThrottle>(this, tid);
+
+  complete_pending_ops(l);
+  while (m_max == m_current) {
+    ++waiters;
+    m_cond.wait(l);
+    --waiters;
+    // something changed: flush whatever can now be completed in order
+    complete_pending_ops(l);
+  }
+  ++m_current;
+
+  return ctx.release();
+}
+
+// Account for one finished op (at least one must be running) and wake
+// everyone waiting on the throttle.
+//
+// The first negative `r` is kept in m_ret_val; -ENOENT is skipped when
+// the throttle was built with ignore_enoent.
+void OrderedThrottle::end_op(int r) {
+  std::lock_guard guard(m_lock);
+  ceph_assert(m_current > 0);
+
+  const bool ignored_enoent = (r == -ENOENT) && m_ignore_enoent;
+  if (r < 0 && m_ret_val == 0 && !ignored_enoent) {
+    m_ret_val = r;
+  }
+
+  --m_current;
+  m_cond.notify_all();
+}
+
+// Mark the op registered under `tid` (which must exist) as finished
+// with result `r`, and wake the waiters so the stored callback can be
+// completed in order.
+void OrderedThrottle::finish_op(uint64_t tid, int r) {
+  std::lock_guard guard(m_lock);
+
+  auto it = m_tid_result.find(tid);
+  ceph_assert(it != m_tid_result.end());
+
+  Result& entry = it->second;
+  entry.finished = true;
+  entry.ret_val = r;
+
+  m_cond.notify_all();
+}
+
+// True once end_op() has recorded a negative result.
+bool OrderedThrottle::pending_error() const {
+  std::lock_guard guard(m_lock);
+  const bool failed = (m_ret_val < 0);
+  return failed;
+}
+
+// Block until no op is running any more, completing finished callbacks
+// in tid order along the way, then return the recorded result.
+int OrderedThrottle::wait_for_ret() {
+  std::unique_lock guard(m_lock);
+
+  for (complete_pending_ops(guard); m_current > 0;
+       complete_pending_ops(guard)) {
+    ++waiters;
+    m_cond.wait(guard);
+    --waiters;
+  }
+
+  return m_ret_val;
+}
+
+// Complete the callbacks of finished ops strictly in tid order.
+//
+// `l` must own m_lock on entry and owns it again on return. The loop
+// stops as soon as the lowest stored tid is not the next one to
+// complete or has not finished yet.
+void OrderedThrottle::complete_pending_ops(std::unique_lock<std::mutex>& l) {
+  while (true) {
+    auto it = m_tid_result.begin();
+    if (it == m_tid_result.end() || it->first != m_complete_tid ||
+        !it->second.finished) {
+      break;
+    }
+
+    // copy the entry out before erasing it from the map
+    Result result = it->second;
+    m_tid_result.erase(it);
+
+    // the callback runs without the lock held
+    l.unlock();
+    result.on_finish->complete(result.ret_val);
+    l.lock();
+
+    // advanced only after the callback has returned and the lock is
+    // re-acquired; until then the next tid does not match m_complete_tid
+    ++m_complete_tid;
+  }
+}
+
+#undef dout_prefix
+#define dout_prefix *_dout << "TokenBucketThrottle(" << m_name << " " \
+                           << (void*)this << ") "
+
+// Remove up to `c` tokens from the bucket and report how many were
+// actually removed: `c` when enough remain, otherwise everything left.
+// A bucket with max == 0 hands out nothing.
+uint64_t TokenBucketThrottle::Bucket::get(uint64_t c) {
+  if (0 == max) {
+    return 0;
+  }
+
+  const uint64_t got = (remain >= c) ? c : remain;
+  remain -= got;
+  return got;
+}
+
+/**
+ * Add `c` tokens to the bucket, never exceeding `max`.
+ *
+ * A bucket with max == 0 is left untouched and 0 is returned.
+ *
+ * @param c number of tokens to add
+ * @returns the number of tokens in the bucket afterwards
+ */
+uint64_t TokenBucketThrottle::Bucket::put(uint64_t c) {
+  if (0 == max) {
+    return 0;
+  }
+
+  if (c) {
+    // Compare against the free space rather than computing remain + c:
+    // that sum wraps around for very large c and could then pass a
+    // "<= max" test, leaving the bucket with a bogus small value.
+    const uint64_t space = (remain < max) ? (max - remain) : 0;
+    if (c <= space) {
+      remain += c;
+    } else {
+      remain = max;
+    }
+  }
+  return remain;
+}
+
+// Change the bucket capacity to `m`, clamping the current fill level
+// down to it (a capacity of 0 therefore empties the bucket).
+void TokenBucketThrottle::Bucket::set_max(uint64_t m) {
+  const bool must_clamp = (remain > m) || (0 == m);
+  if (must_clamp) {
+    remain = m;
+  }
+  max = m;
+}
+
+// Build a token bucket throttle called `name`.
+//
+// `capacity` is the initial size of the internal bucket and `avg` the
+// initial average rate. `timer` and `timer_lock` are stored as plain
+// pointers; this constructor does not schedule anything.
+TokenBucketThrottle::TokenBucketThrottle(
+    CephContext *cct,
+    const std::string &name,
+    uint64_t capacity,
+    uint64_t avg,
+    SafeTimer *timer,
+    Mutex *timer_lock)
+  : m_cct(cct), m_name(name),
+    m_throttle(m_cct, name + "_bucket", capacity),
+    m_avg(avg), m_timer(timer), m_timer_lock(timer_lock),
+    m_lock(name + "_lock")
+{}
+
+// Cancel the pending timer event, then complete every request that is
+// still blocked with result 0.
+TokenBucketThrottle::~TokenBucketThrottle() {
+  // cancel the timer events.
+  {
+    std::lock_guard timer_locker(*m_timer_lock);
+    cancel_timer();
+  }
+
+  // Move the blockers out under m_lock so the callbacks below run
+  // without that lock held.
+  list<Blocker> tmp_blockers;
+  {
+    std::lock_guard blockers_lock(m_lock);
+    tmp_blockers.splice(tmp_blockers.begin(), m_blockers, m_blockers.begin(), m_blockers.end());
+  }
+
+  for (auto b : tmp_blockers) {
+    b.ctx->complete(0);
+  }
+}
+
+// Set the average rate and burst size, then restart the fill timer.
+//
+// Returns -EINVAL, changing nothing, when a non-zero `burst` is smaller
+// than `average`; returns 0 otherwise. A `burst` of 0 makes the bucket
+// capacity default to `average`.
+int TokenBucketThrottle::set_limit(uint64_t average, uint64_t burst) {
+  {
+    std::lock_guard<Mutex> lock(m_lock);
+
+    if (0 < burst && burst < average) {
+      // the burst should never less than the average.
+      return -EINVAL;
+    }
+
+    m_avg = average;
+    m_burst = burst;
+
+    if (0 == average) {
+      // The limit is not set, and no tokens will be put into the bucket.
+      // So, we can schedule the timer slowly, or even cancel it.
+      // (In this branch the bucket max and m_ticks_per_second are left
+      // as they were.)
+      m_tick = 1000;
+    } else {
+      // calculate the tick(ms), don't less than the minimum.
+      m_tick = 1000 / average;
+      if (m_tick < m_tick_min) {
+        m_tick = m_tick_min;
+      }
+
+      // this is for the number(avg) can not be divisible.
+      // NOTE(review): integer division; this becomes 0 if m_tick ever
+      // exceeds 1000 (i.e. m_tick_min > 1000) -- confirm that cannot happen.
+      m_ticks_per_second = 1000 / m_tick;
+      m_current_tick = 0;
+
+      // for the default configuration of burst.
+      m_throttle.set_max(0 == burst ? average : burst);
+    }
+    // turn millisecond to second
+    m_schedule_tick = m_tick / 1000.0;
+  }
+
+  // The schedule period will be changed when the average rate is set.
+  {
+    std::lock_guard<Mutex> timer_locker(*m_timer_lock);
+    cancel_timer();
+    schedule_timer();
+  }
+  return 0;
+}
+
+// Set the lower bound of the fill period; a value of 0 is ignored.
+void TokenBucketThrottle::set_schedule_tick_min(uint64_t tick) {
+  std::lock_guard guard(m_lock);
+  if (tick == 0) {
+    return;
+  }
+  m_tick_min = tick;
+}
+
+// Cumulative number of tokens due after `tick` ticks of the current
+// cycle: tick / m_ticks_per_second * m_avg, truncated to an integer.
+// Always 0 when no average rate is set.
+uint64_t TokenBucketThrottle::tokens_filled(double tick) {
+  if (0 == m_avg) {
+    return 0;
+  }
+  return tick / m_ticks_per_second * m_avg;
+}
+
+// Advance the tick counter by one and return the number of tokens to
+// add for that tick, computed as the difference between the cumulative
+// totals of the new and the previous tick. Returns 0, without advancing,
+// when no average rate is set.
+uint64_t TokenBucketThrottle::tokens_this_tick() {
+  if (m_avg == 0) {
+    return 0;
+  }
+
+  // restart the cycle once m_ticks_per_second ticks have been counted
+  if (m_current_tick >= m_ticks_per_second) {
+    m_current_tick = 0;
+  }
+  ++m_current_tick;
+
+  const uint64_t filled_now = tokens_filled(m_current_tick);
+  const uint64_t filled_before = tokens_filled(m_current_tick - 1);
+  return filled_now - filled_before;
+}
+
+// Refill the bucket for the current tick and release every blocked
+// request that can now be satisfied, in queue order.
+void TokenBucketThrottle::add_tokens() {
+  list<Blocker> tmp_blockers;
+  {
+    std::lock_guard lock(m_lock);
+    // put tokens into bucket.
+    m_throttle.put(tokens_this_tick());
+    // with no rate or no capacity configured, release all blockers at once
+    if (0 == m_avg || 0 == m_throttle.max)
+      tmp_blockers.swap(m_blockers);
+    // check the m_blockers from head to tail, if blocker can get
+    // enough tokens, let it go.
+    while (!m_blockers.empty()) {
+      Blocker &blocker = m_blockers.front();
+      uint64_t got = m_throttle.get(blocker.tokens_requested);
+      if (got == blocker.tokens_requested) {
+        // got enough tokens for front.
+        tmp_blockers.splice(tmp_blockers.end(), m_blockers, m_blockers.begin());
+      } else {
+        // there is no more tokens.
+        // The partial grant is kept: the blocker now needs only the rest.
+        blocker.tokens_requested -= got;
+        break;
+      }
+    }
+  }
+
+  // callbacks run after m_lock has been released
+  for (auto b : tmp_blockers) {
+    b.ctx->complete(0);
+  }
+}
+
+// Queue a timer event that calls schedule_timer() again after
+// m_schedule_tick, then add this tick's tokens. The pending event is
+// remembered in m_token_ctx so cancel_timer() can cancel it.
+// NOTE(review): set_limit() calls this with *m_timer_lock held;
+// presumably that is required for every caller -- confirm.
+void TokenBucketThrottle::schedule_timer() {
+  m_token_ctx = new FunctionContext(
+      [this](int r) {
+        schedule_timer();
+      });
+  m_timer->add_event_after(m_schedule_tick, m_token_ctx);
+
+  add_tokens();
+}
+
+// Ask the timer to cancel the event stored in m_token_ctx. The pointer
+// itself is not reset here. Both callers in this file hold
+// *m_timer_lock while calling.
+void TokenBucketThrottle::cancel_timer() {
+  m_timer->cancel_event(m_token_ctx);
+}
diff --git a/src/common/Throttle.h b/src/common/Throttle.h
new file mode 100644
index 00000000..1a6acac5
--- /dev/null
+++ b/src/common/Throttle.h
@@ -0,0 +1,464 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_THROTTLE_H
+#define CEPH_THROTTLE_H
+
+#include <atomic>
+#include <chrono>
+#include <iostream>
+#include <list>
+#include <map>
+
+#include "common/ceph_mutex.h"
+#include "include/Context.h"
+#include "common/ThrottleInterface.h"
+#include "common/Timer.h"
+#include "common/convenience.h"
+#include "common/perf_counters_collection.h"
+
+/**
+ * @class Throttle
+ * Throttles the maximum number of active requests.
+ *
+ * This class defines the maximum number of slots currently taken away. The
+ * excessive requests for more of them are delayed, until some slots are put
+ * back, so @p get_current() drops below the limit after fulfills the requests.
+ */
+class Throttle final : public ThrottleInterface {
+  CephContext *cct;
+  const std::string name;
+  // perf counters; only set when enabled in the constructor
+  PerfCountersRef logger;
+  // count: slots currently taken; max: the limit, where 0 disables throttling
+  std::atomic<int64_t> count = { 0 }, max = { 0 };
+  std::mutex lock;
+  // one condition variable per blocked caller, in arrival order
+  std::list<std::condition_variable> conds;
+  const bool use_perf;
+
+public:
+  Throttle(CephContext *cct, const std::string& n, int64_t m = 0, bool _use_perf = true);
+  ~Throttle() override;
+
+private:
+  void _reset_max(int64_t m);
+  /**
+   * decide whether taking @p c more slots has to wait
+   *
+   * Never waits when max is 0. A request that fits within max waits if
+   * it would push the count above max; a request of at least max waits
+   * only while the count is already above max.
+   */
+  bool _should_wait(int64_t c) const {
+    int64_t m = max;
+    int64_t cur = count;
+    return
+      m &&
+      ((c <= m && cur + c > m) || // normally stay under max
+       (c >= m && cur > m));     // except for large c
+  }
+
+  bool _wait(int64_t c, std::unique_lock<std::mutex>& l);
+
+public:
+  /**
+   * gets the number of currently taken slots
+   * @returns the number of taken slots
+   */
+  int64_t get_current() const {
+    return count;
+  }
+
+  /**
+   * get the max number of slots
+   * @returns the max number of slots
+   */
+  int64_t get_max() const { return max; }
+
+  /**
+   * return true if past midpoint, i.e. the number of taken slots is at
+   * least half of max
+   */
+  bool past_midpoint() const {
+    return count >= max / 2;
+  }
+
+  /**
+   * set the new max number, and wait until the number of taken slots drains
+   * and drops below this limit.
+   *
+   * @param m the new max number, ignored if it is 0
+   * @returns true if this method is blocked, false if it returns immediately
+   */
+  bool wait(int64_t m = 0);
+
+  /**
+   * take the specified number of slots from the stock regardless of the
+   * throttling
+   * @param c number of slots to take
+   * @returns the total number of taken slots
+   */
+  int64_t take(int64_t c = 1) override;
+
+  /**
+   * get the specified amount of slots from the stock, but will wait if the
+   * total number taken by consumer would exceed the maximum number.
+   * @param c number of slots to get
+   * @param m new maximum number to set, ignored if it is 0
+   * @returns true if this request is blocked due to the throttling, false 
+   * otherwise
+   */
+  bool get(int64_t c = 1, int64_t m = 0);
+
+  /**
+   * the unblocked version of @p get()
+   * @returns true if it successfully got the requested amount,
+   * or false if it would block.
+   */
+  bool get_or_fail(int64_t c = 1);
+
+  /**
+   * put slots back to the stock
+   * @param c number of slots to return
+   * @returns number of slots still taken after this
+   */
+  int64_t put(int64_t c = 1) override;
+   /**
+   * reset the number of taken slots to zero
+   */
+  void reset();
+
+  /// public wrapper around _should_wait()
+  bool should_wait(int64_t c) const {
+    return _should_wait(c);
+  }
+  /// takes the lock and installs @p m as the new max
+  void reset_max(int64_t m) {
+    std::lock_guard l(lock);
+    _reset_max(m);
+  }
+};
+
+/**
+ * BackoffThrottle
+ *
+ * Creates a throttle which gradually induces delays when get() is called
+ * based on params low_threshold, high_threshold, expected_throughput,
+ * high_multiple, and max_multiple.
+ *
+ * In [0, low_threshold), we want no delay.
+ *
+ * In [low_threshold, high_threshold), delays should be injected based
+ * on a line from 0 at low_threshold to
+ * high_multiple * (1/expected_throughput) at high_threshold.
+ *
+ * In [high_threshold, 1), we want delays injected based on a line from
+ * (high_multiple * (1/expected_throughput)) at high_threshold to
+ * (max_multiple * (1/expected_throughput)) at 1, which is what the
+ * third formula below evaluates to at r = 1.
+ *
+ * Let the current throttle ratio (current/max) be r, low_threshold be l,
+ * high_threshold be h, high_delay (high_multiple / expected_throughput) be e,
+ * and max_delay (max_multiple / expected_throughput) be m.
+ *
+ * delay = 0, r \in [0, l)
+ * delay = (r - l) * (e / (h - l)), r \in [l, h)
+ * delay = e + (r - h)((m - e)/(1 - h))
+ */
+class BackoffThrottle {
+  CephContext *cct;
+  const std::string name;
+  /// perf counters; only set when enabled in the constructor
+  PerfCountersRef logger;
+
+  std::mutex lock;
+  using locker = std::unique_lock<std::mutex>;
+
+  /// index into conds of the next condition variable to hand out
+  unsigned next_cond = 0;
+
+  /// allocated once to avoid constantly allocating new ones
+  vector<std::condition_variable> conds;
+
+  const bool use_perf;
+
+  /// pointers into conds
+  list<std::condition_variable*> waiters;
+
+  /// Append a waiter using the next condition variable, round-robin.
+  /// NOTE(review): indexes conds unconditionally, so an
+  /// expected_concurrency of 0 would presumably be invalid -- confirm.
+  std::list<std::condition_variable*>::iterator _push_waiter() {
+    unsigned next = next_cond++;
+    if (next_cond == conds.size())
+      next_cond = 0;
+    return waiters.insert(waiters.end(), &(conds[next]));
+  }
+
+  /// Wake whoever waits on the head waiter's condition variable.
+  void _kick_waiters() {
+    if (!waiters.empty())
+      waiters.front()->notify_all();
+  }
+
+  /// see above, values are in [0, 1].
+  double low_threshold = 0;
+  double high_threshold = 1;
+
+  /// see above, values are in seconds
+  double high_delay_per_count = 0;
+  double max_delay_per_count = 0;
+
+  /// Filled in in set_params
+  double s0 = 0; ///< e / (h - l), l != h, 0 otherwise
+  double s1 = 0; ///< (m - e)/(1 - h), 1 != h, 0 otherwise
+
+  /// max: the limit (0 disables both the delay and the limit check);
+  /// current: the amount currently held
+  uint64_t max = 0;
+  uint64_t current = 0;
+
+  ceph::timespan _get_delay(uint64_t c) const;
+
+public:
+  /**
+   * set_params
+   *
+   * Sets params.  If the params are invalid, returns false
+   * and populates errstream (if non-null) with a user comprehensible
+   * explanation.
+   */
+  bool set_params(
+    double _low_threshold,
+    double _high_threshold,
+    double expected_throughput,
+    double high_multiple,
+    double max_multiple,
+    uint64_t throttle_max,
+    ostream *errstream);
+
+  /// acquire c units, possibly after a delay; returns a timespan
+  ceph::timespan get(uint64_t c = 1);
+  /// same as get(0): goes through the queue and delay without taking anything
+  ceph::timespan wait() {
+    return get(0);
+  }
+  uint64_t put(uint64_t c = 1);
+  uint64_t take(uint64_t c = 1);
+  uint64_t get_current();
+  uint64_t get_max();
+
+  BackoffThrottle(CephContext *cct, const std::string& n,
+    unsigned expected_concurrency, ///< [in] determines size of conds
+    bool _use_perf = true);
+  ~BackoffThrottle();
+};
+
+
+/**
+ * @class SimpleThrottle
+ * This is a simple way to bound the number of concurrent operations.
+ *
+ * It tracks the first error encountered, and makes it available
+ * when all requests are complete. wait_for_ret() should be called
+ * before the instance is destroyed.
+ *
+ * Re-using the same instance isn't safe if you want to check each set
+ * of operations for errors, since the return value is not reset.
+ */
+class SimpleThrottle {
+public:
+  SimpleThrottle(uint64_t max, bool ignore_enoent);
+  ~SimpleThrottle();
+  // blocks while m_current equals m_max, then counts one more running op
+  void start_op();
+  // counts one op as done and records the first error
+  void end_op(int r);
+  bool pending_error() const;
+  // blocks until no op is running and returns the recorded result
+  int wait_for_ret();
+private:
+  mutable std::mutex m_lock;
+  std::condition_variable m_cond;
+  uint64_t m_max;
+  uint64_t m_current = 0;   // ops started but not yet ended
+  int m_ret = 0;            // first negative result seen by end_op(), else 0
+  bool m_ignore_enoent;
+  uint32_t waiters = 0;     // callers currently blocked on m_cond
+};
+
+
+class OrderedThrottle;
+
+// Context handed out by OrderedThrottle::start_op(). Finishing it
+// reports the result for its tid back to the throttle.
+class C_OrderedThrottle : public Context {
+public:
+  C_OrderedThrottle(OrderedThrottle *ordered_throttle, uint64_t tid)
+    : m_ordered_throttle(ordered_throttle), m_tid(tid) {
+  }
+
+protected:
+  void finish(int r) override;
+
+private:
+  OrderedThrottle *m_ordered_throttle;  // not owned
+  uint64_t m_tid;                       // id assigned by start_op()
+};
+
+/**
+ * @class OrderedThrottle
+ * Throttles the maximum number of active requests and completes them in order
+ *
+ * Operations can complete out-of-order but their associated Context callback
+ * will completed in-order during invocation of start_op() and wait_for_ret()
+ */
+class OrderedThrottle {
+public:
+  OrderedThrottle(uint64_t max, bool ignore_enoent);
+  ~OrderedThrottle();
+
+  // registers on_finish under a new tid and blocks until a slot is free
+  C_OrderedThrottle *start_op(Context *on_finish);
+  // counts one op as done and records the first error
+  void end_op(int r);
+
+  bool pending_error() const;
+  // blocks until no op is running and returns the recorded result
+  int wait_for_ret();
+
+protected:
+  friend class C_OrderedThrottle;
+
+  // called through C_OrderedThrottle::finish()
+  void finish_op(uint64_t tid, int r);
+
+private:
+  // per-op bookkeeping, kept until the callback has been completed
+  struct Result {
+    bool finished;
+    int ret_val;
+    Context *on_finish;
+
+    Result(Context *_on_finish = NULL)
+      : finished(false), ret_val(0), on_finish(_on_finish) {
+    }
+  };
+
+  typedef std::map<uint64_t, Result> TidResult;
+
+  mutable std::mutex m_lock;
+  std::condition_variable m_cond;
+  uint64_t m_max;
+  uint64_t m_current = 0;   // ops started but not yet ended
+  int m_ret_val = 0;        // first negative result seen by end_op(), else 0
+  bool m_ignore_enoent;
+
+  uint64_t m_next_tid = 0;      // tid given to the next start_op()
+  uint64_t m_complete_tid = 0;  // tid whose callback is completed next
+
+  TidResult m_tid_result;
+
+  void complete_pending_ops(std::unique_lock<std::mutex>& l);
+  uint32_t waiters = 0;     // callers currently blocked on m_cond
+};
+
+
+class TokenBucketThrottle {
+  // Token container: `remain` tokens currently available out of `max`.
+  struct Bucket {
+    CephContext *cct;
+    const std::string name;
+
+    uint64_t remain;
+    uint64_t max;
+
+    // A new bucket starts full (remain == max == m).
+    Bucket(CephContext *cct, const std::string &name, uint64_t m)
+      : cct(cct), name(name), remain(m), max(m) {}
+
+    uint64_t get(uint64_t c);
+    uint64_t put(uint64_t c);
+    void set_max(uint64_t m);
+  };
+
+  // A queued request: the number of tokens it still needs and the context
+  // stored for it.
+  struct Blocker {
+    uint64_t tokens_requested;
+    Context *ctx;
+
+    Blocker(uint64_t _tokens_requested, Context* _ctx)
+      : tokens_requested(_tokens_requested), ctx(_ctx) {}
+  };
+
+  CephContext *m_cct;
+  const std::string m_name;
+  Bucket m_throttle;
+  uint64_t m_avg = 0;
+  uint64_t m_burst = 0;
+  SafeTimer *m_timer;
+  Mutex *m_timer_lock;
+  FunctionContext *m_token_ctx = nullptr;
+  list<Blocker> m_blockers;
+  Mutex m_lock;
+
+  // minimum of the filling period.
+  uint64_t m_tick_min = 50;
+  // tokens filling period, its unit is millisecond.
+  uint64_t m_tick = 0;
+  /**
+   * These variables are used to calculate how many tokens need to be put into
+   * the bucket within each tick.
+   *
+   * In actual use, the tokens to be put per tick(m_avg / m_ticks_per_second)
+   * may be a floating point number, but we need an 'uint64_t' to put into the
+   * bucket.
+   *
+   * For example, we set the value of rate to be 950, means 950 iops(or bps).
+   *
+   * In this case, the filling period(m_tick) should be 1000 / 950 = 1.052,
+   * which is too small for the SafeTimer. So we should set the period(m_tick)
+   * to be 50(m_tick_min), and 20 ticks in one second(m_ticks_per_second).
+   * The tokens filled in bucket per tick is 950 / 20 = 47.5, not an integer.
+   *
+   * To resolve this, we use a method called tokens_filled(m_current_tick) to
+   * calculate how many tokens will be put so far(until m_current_tick):
+   *
+   *   tokens_filled = m_current_tick / m_ticks_per_second * m_avg
+   *
+   * And the difference between two ticks will be the result we expect.
+   *   tokens in tick 0: (1 / 20 * 950) - (0 / 20 * 950) =  47 -   0 = 47
+   *   tokens in tick 1: (2 / 20 * 950) - (1 / 20 * 950) =  95 -  47 = 48
+   *   tokens in tick 2: (3 / 20 * 950) - (2 / 20 * 950) = 142 -  95 = 47
+   *
+   * As a result, the tokens filled in one second will be as follows:
+   *   tick    | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16|17|18|19|20|
+   *   tokens  |47|48|47|48|47|48|47|48|47|48|47|48|47|48|47|48|47|48|47|48|
+   */
+  uint64_t m_ticks_per_second = 0;
+  uint64_t m_current_tick = 0;
+
+  // period for the bucket filling tokens, its unit is seconds.
+  double m_schedule_tick = 1.0;
+
+public:
+  TokenBucketThrottle(CephContext *cct, const std::string &name,
+                      uint64_t capacity, uint64_t avg,
+                      SafeTimer *timer, Mutex *timer_lock);
+
+  ~TokenBucketThrottle();
+
+  const std::string &get_name() {
+    return m_name;
+  }
+
+  /**
+   * Queue a request that still needs @p c tokens.
+   *
+   * Wraps (handler->*MF)(r, item, flag) in a heap-allocated FunctionContext
+   * and appends it to m_blockers.  This function takes no lock itself; the
+   * only caller visible in this class, get(), holds m_lock.
+   */
+  template <typename T, typename I, void(T::*MF)(int, I*, uint64_t)>
+  void add_blocker(uint64_t c, T *handler, I *item, uint64_t flag) {
+    Context *ctx = new FunctionContext([handler, item, flag](int r) {
+      (handler->*MF)(r, item, flag);
+      });
+    m_blockers.emplace_back(c, ctx);
+  }
+
+  /**
+   * Try to take @p c tokens.
+   *
+   * @returns false when the caller may proceed immediately: either all
+   *          tokens were obtained, or the throttle is disabled (bucket max
+   *          or m_avg is 0).  Returns true when the request was queued as a
+   *          blocker; the queued context then invokes
+   *          (handler->*MF)(r, item, flag).
+   */
+  template <typename T, typename I, void(T::*MF)(int, I*, uint64_t)>
+  bool get(uint64_t c, T *handler, I *item, uint64_t flag) {
+    bool wait = false;
+    uint64_t got = 0;
+    std::lock_guard lock(m_lock);
+    if (!m_blockers.empty()) {
+      // Keep the order of requests, add item after previous blocked requests.
+      wait = true;
+    } else {
+      if (0 == m_throttle.max || 0 == m_avg)
+        return false;
+
+      got = m_throttle.get(c);
+      if (got < c) {
+        // Not enough tokens, add a blocker for it.
+        wait = true;
+      }
+    }
+
+    // Only the tokens that are still missing (c - got) are queued; got is 0
+    // when the request was queued behind existing blockers.
+    if (wait)
+      add_blocker<T, I, MF>(c - got, handler, item, flag);
+
+    return wait;
+  }
+
+  int set_limit(uint64_t average, uint64_t burst);
+  void set_schedule_tick_min(uint64_t tick);
+
+private:
+  uint64_t tokens_filled(double tick);
+  uint64_t tokens_this_tick();
+  void add_tokens();
+  void schedule_timer();
+  void cancel_timer();
+};
+
+#endif
diff --git a/src/common/ThrottleInterface.h b/src/common/ThrottleInterface.h
new file mode 100644
index 00000000..49182a11
--- /dev/null
+++ b/src/common/ThrottleInterface.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cstdint>
+
+// Abstract interface of a slot-based throttle: implementations provide
+// take() and put().
+class ThrottleInterface {
+public:
+  virtual ~ThrottleInterface() {}
+  /**
+   * take the specified number of slots from the stock, regardless of the
+   * throttling
+   * @param c number of slots to take
+   * @returns the total number of taken slots
+   */
+  virtual int64_t take(int64_t c = 1) = 0;
+  /**
+   * put slots back to the stock
+   * @param c number of slots to return
+   * @returns number of requests being held after this
+   */
+  virtual int64_t put(int64_t c = 1) = 0;
+};
diff --git a/src/common/Timer.cc b/src/common/Timer.cc
new file mode 100644
index 00000000..68a51f63
--- /dev/null
+++ b/src/common/Timer.cc
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "Cond.h"
+#include "Timer.h"
+
+
+#define dout_subsys ceph_subsys_timer
+#undef dout_prefix
+#define dout_prefix *_dout << "timer(" << this << ")."
+
+
+
+// Thread whose body runs the owning SafeTimer's timer_thread() loop.
+class SafeTimerThread : public Thread {
+  SafeTimer *parent;
+
+public:
+  explicit SafeTimerThread(SafeTimer *s)
+    : parent(s)
+  {
+  }
+
+  // Thread entry point; yields a null pointer once timer_thread() returns.
+  void *entry() override
+  {
+    parent->timer_thread();
+    return nullptr;
+  }
+};
+
+
+
+// Due time -> callback.  A multimap, so several callbacks may share the same
+// due time; begin() is always the earliest event.
+typedef std::multimap < utime_t, Context *> scheduled_map_t;
+// Callback -> its position in the schedule, used to cancel by pointer.
+typedef std::map < Context*, scheduled_map_t::iterator > event_lookup_map_t;
+
+// Records the context, the caller-owned lock and the callback mode.  The
+// thread pointer starts out null.
+SafeTimer::SafeTimer(CephContext *cct_, Mutex &l, bool safe_callbacks)
+  : cct(cct_),
+    lock(l),
+    safe_callbacks(safe_callbacks),
+    thread(nullptr),
+    stopping(false)
+{}
+
+// Asserts that no timer thread object is left, i.e. the thread pointer is
+// null at destruction time.
+SafeTimer::~SafeTimer()
+{
+  ceph_assert(nullptr == thread);
+}
+
+// Allocates the timer thread object and starts it under the name
+// "safe_timer".
+void SafeTimer::init()
+{
+  ldout(cct, 10) << "init" << dendl;
+
+  auto *worker = new SafeTimerThread(this);
+  thread = worker;
+  worker->create("safe_timer");
+}
+
+// Cancels all pending events and stops the timer thread.  Does nothing apart
+// from logging when no thread exists.  The lock must be held on entry (it is
+// asserted) and is held again on return.
+void SafeTimer::shutdown()
+{
+  ldout(cct, 10) << "shutdown" << dendl;
+  if (!thread) {
+    // no thread object to stop
+    return;
+  }
+
+  ceph_assert(lock.is_locked());
+  cancel_all_events();
+  stopping = true;
+  cond.Signal();
+
+  // timer_thread() runs with the lock held, so release it while waiting for
+  // the thread to exit.
+  lock.unlock();
+  thread->join();
+  lock.lock();
+
+  delete thread;
+  thread = nullptr;
+}
+
+// Body of the timer thread.  Holds the lock except while waiting on the
+// condition variable and, when safe_callbacks is false, while a callback
+// runs.  Loops until `stopping` is set.
+void SafeTimer::timer_thread()
+{
+  lock.lock();
+  ldout(cct, 10) << "timer_thread starting" << dendl;
+
+  while (!stopping) {
+    utime_t now = ceph_clock_now();
+
+    // Run every event whose due time is not later than `now`.
+    while (!schedule.empty()) {
+      auto due = schedule.begin();
+      if (due->first > now) {
+        // the earliest event is still in the future
+        break;
+      }
+
+      Context *cb = due->second;
+      events.erase(cb);
+      schedule.erase(due);
+      ldout(cct, 10) << "timer_thread executing " << cb << dendl;
+
+      if (safe_callbacks) {
+        cb->complete(0);
+      } else {
+        // unsafe mode: the callback runs without the lock
+        lock.unlock();
+        cb->complete(0);
+        lock.lock();
+      }
+    }
+
+    // recheck stopping if we dropped the lock
+    if (stopping && !safe_callbacks) {
+      break;
+    }
+
+    ldout(cct, 20) << "timer_thread going to sleep" << dendl;
+    if (!schedule.empty()) {
+      cond.WaitUntil(lock, schedule.begin()->first);
+    } else {
+      cond.Wait(lock);
+    }
+    ldout(cct, 20) << "timer_thread awake" << dendl;
+  }
+
+  ldout(cct, 10) << "timer_thread exiting" << dendl;
+  lock.unlock();
+}
+
+// Schedules `callback` to run `seconds` from the current clock reading.
+// Requires the lock to be held; returns whatever add_event_at() returns.
+Context* SafeTimer::add_event_after(double seconds, Context *callback)
+{
+  ceph_assert(lock.is_locked());
+
+  utime_t deadline = ceph_clock_now();
+  deadline += seconds;
+
+  return add_event_at(deadline, callback);
+}
+
+// Schedules `callback` to run at time `when`.  Requires the lock to be held.
+//
+// Returns `callback` on success.  If the timer is stopping, the callback is
+// deleted and nullptr is returned.  Inserting the same Context* twice trips
+// an assertion.
+Context* SafeTimer::add_event_at(utime_t when, Context *callback)
+{
+  ceph_assert(lock.is_locked());
+  ldout(cct, 10) << __func__ << " " << when << " -> " << callback << dendl;
+
+  if (stopping) {
+    ldout(cct, 5) << __func__ << " already shutdown, event not added" << dendl;
+    delete callback;
+    return nullptr;
+  }
+
+  auto slot = schedule.emplace(when, callback);
+
+  /* If you hit this, you tried to insert the same Context* twice. */
+  const bool inserted = events.emplace(callback, slot).second;
+  ceph_assert(inserted);
+
+  /* If the event we have just inserted comes before everything else, we need
+   * to adjust our timeout. */
+  if (schedule.begin() == slot) {
+    cond.Signal();
+  }
+  return callback;
+}
+
+// Removes `callback` from the schedule and deletes it.  Requires the lock to
+// be held.  Returns false when the callback is not currently registered.
+bool SafeTimer::cancel_event(Context *callback)
+{
+  ceph_assert(lock.is_locked());
+
+  auto found = events.find(callback);
+  if (events.end() == found) {
+    ldout(cct, 10) << "cancel_event " << callback << " not found" << dendl;
+    return false;
+  }
+
+  ldout(cct, 10) << "cancel_event " << found->second->first << " -> " << callback << dendl;
+
+  // the timer owns scheduled callbacks, so cancelling destroys the Context
+  delete found->first;
+  schedule.erase(found->second);
+  events.erase(found);
+  return true;
+}
+
+// Deletes every registered callback and empties both lookup structures.
+// Requires the lock to be held.
+void SafeTimer::cancel_all_events()
+{
+  ldout(cct, 10) << "cancel_all_events" << dendl;
+  ceph_assert(lock.is_locked());
+
+  // begin() is re-read after each removal instead of advancing an iterator
+  for (auto ev = events.begin(); ev != events.end(); ev = events.begin()) {
+    ldout(cct, 10) << " cancelled " << ev->second->first << " -> " << ev->first << dendl;
+    delete ev->first;
+    schedule.erase(ev->second);
+    events.erase(ev);
+  }
+}
+
+// Logs every scheduled (time, callback) pair at debug level 10.  A null
+// `caller` is treated as an empty string.
+void SafeTimer::dump(const char *caller) const
+{
+  const char *who = caller ? caller : "";
+  ldout(cct, 10) << "dump " << who << dendl;
+
+  for (const auto& entry : schedule) {
+    ldout(cct, 10) << " " << entry.first << "->" << entry.second << dendl;
+  }
+}
diff --git a/src/common/Timer.h b/src/common/Timer.h
new file mode 100644
index 00000000..f49ac0b0
--- /dev/null
+++ b/src/common/Timer.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_TIMER_H
+#define CEPH_TIMER_H
+
+#include "Cond.h"
+#include "Mutex.h"
+
+class CephContext;
+class Context;
+class SafeTimerThread;
+
+// Timer that runs Context callbacks at scheduled times on a dedicated thread,
+// synchronised through a caller-supplied Mutex (the "event_lock" mentioned
+// in the comments below).
+class SafeTimer
+{
+  CephContext *cct;
+  Mutex& lock;
+  Cond cond;
+  bool safe_callbacks;
+
+  friend class SafeTimerThread;
+  SafeTimerThread *thread;
+
+  void timer_thread();
+  // NOTE(review): no definition of _shutdown() appears next to the other
+  // members in Timer.cc -- confirm whether this declaration is still needed.
+  void _shutdown();
+
+  // schedule: due time -> callback; events: callback -> its schedule entry.
+  std::multimap<utime_t, Context*> schedule;
+  std::map<Context*, std::multimap<utime_t, Context*>::iterator> events;
+  bool stopping;
+
+  void dump(const char *caller = 0) const;
+
+public:
+  // This class isn't supposed to be copied
+  SafeTimer(const SafeTimer&) = delete;
+  SafeTimer& operator=(const SafeTimer&) = delete;
+
+  /* Safe callbacks determines whether callbacks are called with the lock
+   * held.
+   *
+   * safe_callbacks = true (default option) guarantees that a cancelled
+   * event's callback will never be called.
+   *
+   * Under some circumstances, holding the lock can cause lock cycles.
+   * If you are able to relax requirements on cancelled callbacks, then
+   * setting safe_callbacks = false eliminates the lock cycle issue.
+   * */
+  SafeTimer(CephContext *cct, Mutex &l, bool safe_callbacks=true);
+  virtual ~SafeTimer();
+
+  /* init(): create and start the timer thread.
+   *
+   * shutdown(): cancel all events and stop the timer thread.
+   * Call shutdown() with the event_lock LOCKED: the implementation asserts
+   * that the lock is held whenever a thread exists, releases it while the
+   * thread is joined and re-acquires it before returning. */
+  void init();
+  void shutdown();
+
+  /* Schedule an event in the future
+   * Call with the event_lock LOCKED */
+  Context* add_event_after(double seconds, Context *callback);
+  Context* add_event_at(utime_t when, Context *callback);
+
+  /* Cancel an event.
+   * Call with the event_lock LOCKED
+   *
+   * Returns true if the callback was cancelled.
+   * Returns false if you never added the callback in the first place.
+   */
+  bool cancel_event(Context *callback);
+
+  /* Cancel all events.
+   * Call with the event_lock LOCKED
+   *
+   * When this function returns, all events have been cancelled, and there are no
+   * more in progress.
+   */
+  void cancel_all_events();
+
+};
+
+#endif
diff --git a/src/common/TracepointProvider.cc b/src/common/TracepointProvider.cc
new file mode 100644
index 00000000..38529f3d
--- /dev/null
+++ b/src/common/TracepointProvider.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/TracepointProvider.h"
+#include "common/config.h"
+
+// Registers this object as a config observer for `config_key` and checks the
+// current configuration straight away via verify_config().
+TracepointProvider::TracepointProvider(CephContext *cct, const char *library,
+                                       const char *config_key)
+  : m_cct(cct),
+    m_library(library),
+    m_config_keys{config_key, nullptr}
+{
+  auto& conf = m_cct->_conf;
+  conf.add_observer(this);
+  verify_config(conf);
+}
+
+// Unregisters the config observer and closes the library handle if one was
+// opened.
+TracepointProvider::~TracepointProvider()
+{
+  m_cct->_conf.remove_observer(this);
+
+  if (m_handle != nullptr) {
+    dlclose(m_handle);
+  }
+}
+
+// Config-observer callback: re-evaluates the configuration only when the
+// tracked key is among the changed keys.
+void TracepointProvider::handle_conf_change(
+    const ConfigProxy& conf, const std::set<std::string> &changed)
+{
+  const bool key_changed = changed.find(m_config_keys[0]) != changed.end();
+  if (!key_changed) {
+    return;
+  }
+  verify_config(conf);
+}
+
+// Loads the tracepoint library once the tracked config key reads "true".
+// Does nothing if the library is already loaded, if the value cannot be read,
+// or if it is anything other than "true".  A failed dlopen() trips an
+// assertion.
+void TracepointProvider::verify_config(const ConfigProxy& conf)
+{
+  std::lock_guard locker(m_lock);
+  if (m_handle) {
+    // already loaded
+    return;
+  }
+
+  char buf[10];
+  char *pbuf = buf;
+  if (conf.get_val(m_config_keys[0], &pbuf, sizeof(buf)) != 0) {
+    return;
+  }
+  // compares five bytes, i.e. "true" including its terminating NUL
+  if (strncmp(buf, "true", 5) != 0) {
+    return;
+  }
+
+  m_handle = dlopen(m_library.c_str(), RTLD_NOW | RTLD_NODELETE);
+  ceph_assert(m_handle);
+}
+
diff --git a/src/common/TracepointProvider.h b/src/common/TracepointProvider.h
new file mode 100644
index 00000000..30e29060
--- /dev/null
+++ b/src/common/TracepointProvider.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_TRACEPOINT_PROVIDER_H
+#define CEPH_TRACEPOINT_PROVIDER_H
+
+#include "common/ceph_context.h"
+#include "common/config_obs.h"
+#include "common/ceph_mutex.h"
+#include <dlfcn.h>
+
+// Config observer that dlopen()s a tracepoint library once a given config
+// key is set to "true" (see verify_config() in TracepointProvider.cc).
+class TracepointProvider : public md_config_obs_t {
+public:
+  // Compile-time description of one provider: the library to load and the
+  // config key that enables it.
+  struct Traits {
+    const char *library;
+    const char *config_key;
+
+    Traits(const char *library, const char *config_key)
+      : library(library), config_key(config_key) {
+    }
+  };
+
+  // Owns one heap-allocated TracepointProvider for its whole lifetime.
+  // NOTE(review): the implicit copy operations would copy the raw owning
+  // pointer; confirm that instances are never copied.
+  class Singleton {
+  public:
+    Singleton(CephContext *cct, const char *library, const char *config_key)
+      : tracepoint_provider(new TracepointProvider(cct, library, config_key)) {
+    }
+    ~Singleton() {
+      delete tracepoint_provider;
+    }
+
+    // True once the library handle has been set.
+    inline bool is_enabled() const {
+      return tracepoint_provider->m_handle != nullptr;
+    }
+  private:
+    TracepointProvider *tracepoint_provider;
+  };
+
+  // Singleton bound to a specific Traits object, so each provider has its
+  // own distinct type.
+  template <const Traits &traits>
+  class TypedSingleton : public Singleton {
+  public:
+    explicit TypedSingleton(CephContext *cct)
+      : Singleton(cct, traits.library, traits.config_key) {
+    }
+  };
+
+  TracepointProvider(CephContext *cct, const char *library,
+                     const char *config_key);
+  ~TracepointProvider() override;
+
+  // Neither copyable nor movable.  The deleted assignment operators use the
+  // canonical reference return type.
+  TracepointProvider(const TracepointProvider&) = delete;
+  TracepointProvider& operator =(const TracepointProvider&) = delete;
+  TracepointProvider(TracepointProvider&&) = delete;
+  TracepointProvider& operator =(TracepointProvider&&) = delete;
+
+  // Creates (or looks up) the per-CephContext singleton for `traits`.
+  // Compiles to a no-op unless WITH_LTTNG is defined.
+  template <const Traits &traits>
+  static void initialize(CephContext *cct) {
+#ifdef WITH_LTTNG
+     cct->lookup_or_create_singleton_object<TypedSingleton<traits>>(
+       traits.library, false, cct);
+#endif
+  }
+
+protected:
+  // Returns the {config_key, NULL} array filled in by the constructor.
+  const char** get_tracked_conf_keys() const override {
+    return m_config_keys;
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override;
+
+private:
+  CephContext *m_cct;
+  std::string m_library;
+  // mutable so the const get_tracked_conf_keys() can hand out a non-const
+  // pointer to it
+  mutable const char* m_config_keys[2];
+
+  ceph::mutex m_lock = ceph::make_mutex("TracepointProvider::m_lock");
+  void* m_handle = nullptr;
+
+  void verify_config(const ConfigProxy& conf);
+};
+
+#endif // CEPH_TRACEPOINT_PROVIDER_H
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
new file mode 100644
index 00000000..49a7a988
--- /dev/null
+++ b/src/common/TrackedOp.cc
@@ -0,0 +1,479 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ * Copyright 2013 Inktank
+ */
+
+#include "TrackedOp.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_optracker
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+// Writes the fixed log prefix used by this file's dout output and returns the
+// stream for chaining.
+static ostream& _prefix(std::ostream* _dout)
+{
+  std::ostream& out = *_dout;
+  out << "-- op tracker -- ";
+  return out;
+}
+
+// Drops everything still queued and flags the service loop to exit; both
+// happen under the queue spinlock.
+void OpHistoryServiceThread::break_thread()
+{
+  std::lock_guard guard(queue_spinlock);
+  _external_queue.clear();
+  _break_thread = true;
+}
+
+// Service loop: repeatedly swaps the externally filled queue into a private
+// list under the spinlock and feeds each entry to
+// OpHistory::_insert_delayed().  When nothing was queued it sleeps, growing
+// the sleep time fourfold each idle round while it is still below 128000 us;
+// any work resets it to 1000 us.  Returns once break_thread() set the flag.
+void* OpHistoryServiceThread::entry()
+{
+  int backoff_us = 1000;
+  list<pair<utime_t, TrackedOpRef>> batch;
+
+  for (;;) {
+    queue_spinlock.lock();
+    if (_break_thread) {
+      queue_spinlock.unlock();
+      break;
+    }
+    batch.swap(_external_queue);
+    queue_spinlock.unlock();
+
+    if (!batch.empty()) {
+      backoff_us = 1000;
+    } else {
+      usleep(backoff_us);
+      if (backoff_us < 128000) {
+        backoff_us <<= 2;
+      }
+    }
+
+    // drain the private batch without holding the spinlock
+    while (!batch.empty()) {
+      pair<utime_t, TrackedOpRef> item = batch.front();
+      _ophistory->_insert_delayed(item.first, item.second);
+      batch.pop_front();
+    }
+  }
+
+  return nullptr;
+}
+
+
+// Stops and joins the service thread first, then, under the history lock,
+// empties all three history sets and marks the history as shut down.
+void OpHistory::on_shutdown()
+{
+  opsvc.break_thread();
+  opsvc.join();
+
+  std::lock_guard history_lock(ops_history_lock);
+  arrived.clear();
+  duration.clear();
+  slow_op.clear();
+
+  shutdown = true;
+}
+
+// Adds `op` to the history sets (by duration and by arrival; also to the
+// slow-op set when its duration reaches the slow-op threshold) and then
+// trims the history.  Ignored after shutdown.
+void OpHistory::_insert_delayed(const utime_t& now, TrackedOpRef op)
+{
+  std::lock_guard history_lock(ops_history_lock);
+  if (shutdown) {
+    return;
+  }
+
+  const double opduration = op->get_duration();
+  duration.emplace(opduration, op);
+  arrived.emplace(op->get_initiated(), op);
+
+  const bool is_slow = opduration >= history_slow_op_threshold.load();
+  if (is_slow) {
+    slow_op.emplace(op->get_initiated(), op);
+  }
+
+  cleanup(now);
+}
+
+// Trims the history sets.  Takes no lock itself; every caller visible in
+// this file holds ops_history_lock.
+//
+//  1. ops that arrived more than history_duration seconds before `now` are
+//     removed from both `arrived` and `duration`;
+//  2. while `duration` holds more than history_size ops, its first
+//     (shortest-duration) entry is removed from both sets;
+//  3. while `slow_op` holds more than history_slow_op_size ops, its first
+//     (earliest-initiated) entry is removed.
+void OpHistory::cleanup(utime_t now)
+{
+  while (arrived.size() &&
+         (now - arrived.begin()->first >
+          (double)(history_duration.load()))) {
+    duration.erase(make_pair(
+        arrived.begin()->second->get_duration(),
+        arrived.begin()->second));
+    arrived.erase(arrived.begin());
+  }
+
+  while (duration.size() > history_size.load()) {
+    arrived.erase(make_pair(
+        duration.begin()->second->get_initiated(),
+        duration.begin()->second));
+    duration.erase(duration.begin());
+  }
+
+  while (slow_op.size() > history_slow_op_size.load()) {
+    // Erase by iterator.  The original rebuilt a key equal to *begin() and
+    // erased by value, which removes the same element but costs a temporary
+    // pair (an extra reference on the op) and a redundant lookup.
+    slow_op.erase(slow_op.begin());
+  }
+}
+
+// Dumps the op history into `f` as an "op_history" object holding the
+// configured size and duration plus an "ops" array.  Ops are listed longest
+// duration first when by_duration is set, otherwise in arrival order.  Ops
+// for which filter_out(filters) returns false are skipped.
+void OpHistory::dump_ops(utime_t now, Formatter *f, set<string> filters, bool by_duration)
+{
+  std::lock_guard history_lock(ops_history_lock);
+  cleanup(now);
+
+  // writes every op in [first, last) that passes the filter
+  auto emit_range = [&f, &now, &filters](auto first, auto last) {
+    for (; first != last; ++first) {
+      if (first->second->filter_out(filters)) {
+        f->open_object_section("op");
+        first->second->dump(now, f);
+        f->close_section();
+      }
+    }
+  };
+
+  f->open_object_section("op_history");
+  f->dump_int("size", history_size.load());
+  f->dump_int("duration", history_duration.load());
+
+  f->open_array_section("ops");
+  if (!by_duration) {
+    emit_range(arrived.begin(), arrived.end());
+  } else {
+    emit_range(duration.rbegin(), duration.rend());
+  }
+  f->close_section();  // ops
+
+  f->close_section();  // op_history
+}
+
+// One shard of the in-flight op list together with the mutex guarding it.
+struct ShardedTrackingData {
+  ceph::mutex ops_in_flight_lock_sharded;
+  TrackedOp::tracked_op_list_t ops_in_flight_sharded;
+
+  // lock_name is the name given to this shard's mutex
+  explicit ShardedTrackingData(string lock_name)
+    : ops_in_flight_lock_sharded(ceph::make_mutex(lock_name))
+  {
+  }
+};
+
+// Creates `num_shards` in-flight shards, each with a mutex named
+// "OpTracker::ShardedLock:<index>".
+OpTracker::OpTracker(CephContext *cct_, bool tracking, uint32_t num_shards):
+  seq(0),
+  num_optracker_shards(num_shards),
+  complaint_time(0), log_threshold(0),
+  tracking_enabled(tracking),
+  lock("OpTracker::lock"), cct(cct_)
+{
+  for (uint32_t shard = 0; shard < num_optracker_shards; ++shard) {
+    char lock_name[32] = {0};
+    snprintf(lock_name, sizeof(lock_name), "%s:%" PRIu32, "OpTracker::ShardedLock", shard);
+    sharded_in_flight_list.push_back(new ShardedTrackingData(lock_name));
+  }
+}
+
+// Frees the shards from the back of the list; every shard must have an empty
+// in-flight list by now.
+OpTracker::~OpTracker()
+{
+  while (!sharded_in_flight_list.empty()) {
+    ShardedTrackingData *last = sharded_in_flight_list.back();
+    ceph_assert(last->ops_in_flight_sharded.empty());
+    delete last;
+    sharded_in_flight_list.pop_back();
+  }
+}
+
+// Dumps the op history into `f`.  Returns false without writing anything
+// when tracking is disabled, true otherwise.
+bool OpTracker::dump_historic_ops(Formatter *f, bool by_duration, set<string> filters)
+{
+  if (tracking_enabled) {
+    RWLock::RLocker l(lock);
+    utime_t now = ceph_clock_now();
+    history.dump_ops(now, f, filters, by_duration);
+    return true;
+  }
+  return false;
+}
+
+// Dumps the retained slow ops into `f` as an "OpHistory slow ops" object
+// holding the retention settings plus an "Ops" array.  Ops for which
+// filter_out(filters) returns false are skipped.
+void OpHistory::dump_slow_ops(utime_t now, Formatter *f, set<string> filters)
+{
+  std::lock_guard history_lock(ops_history_lock);
+  cleanup(now);
+
+  f->open_object_section("OpHistory slow ops");
+  f->dump_int("num to keep", history_slow_op_size.load());
+  f->dump_int("threshold to keep", history_slow_op_threshold.load());
+
+  f->open_array_section("Ops");
+  for (const auto& entry : slow_op) {
+    if (entry.second->filter_out(filters)) {
+      f->open_object_section("Op");
+      entry.second->dump(now, f);
+      f->close_section();
+    }
+  }
+  f->close_section();  // Ops
+
+  f->close_section();  // OpHistory slow ops
+}
+
+// Dumps the retained slow ops into `f`.  Returns false without writing
+// anything when tracking is disabled, true otherwise.
+bool OpTracker::dump_historic_slow_ops(Formatter *f, set<string> filters)
+{
+  if (tracking_enabled) {
+    RWLock::RLocker l(lock);
+    utime_t now = ceph_clock_now();
+    history.dump_slow_ops(now, f, filters);
+    return true;
+  }
+  return false;
+}
+
+// Dumps the ops currently in flight into `f` as an "ops_in_flight" object.
+// With print_only_blocked, each shard is scanned only up to the first op
+// that is not older than complaint_time, and the summary reports
+// "complaint_time" and "num_blocked_ops" instead of "num_ops".  Ops for
+// which filter_out(filters) returns false are skipped.  Returns false when
+// tracking is disabled.
+bool OpTracker::dump_ops_in_flight(Formatter *f, bool print_only_blocked, set<string> filters)
+{
+  if (!tracking_enabled) {
+    return false;
+  }
+
+  RWLock::RLocker l(lock);
+  f->open_object_section("ops_in_flight"); // overall dump
+  uint64_t dumped = 0;
+  f->open_array_section("ops"); // list of TrackedOps
+  utime_t now = ceph_clock_now();
+
+  for (uint32_t shard = 0; shard < num_optracker_shards; shard++) {
+    ShardedTrackingData* sdata = sharded_in_flight_list[shard];
+    ceph_assert(NULL != sdata);
+    std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+
+    for (auto& op : sdata->ops_in_flight_sharded) {
+      if (print_only_blocked && (now - op.get_initiated() <= complaint_time)) {
+        // stop scanning this shard at the first op that is not blocked
+        break;
+      }
+      if (op.filter_out(filters)) {
+        f->open_object_section("op");
+        op.dump(now, f);
+        f->close_section(); // this TrackedOp
+        dumped++;
+      }
+    }
+  }
+  f->close_section(); // list of TrackedOps
+
+  if (!print_only_blocked) {
+    f->dump_int("num_ops", dumped);
+  } else {
+    f->dump_float("complaint_time", complaint_time);
+    f->dump_int("num_blocked_ops", dumped);
+  }
+  f->close_section(); // overall dump
+  return true;
+}
+
+// Assigns the next sequence number to `i` and appends it to the shard
+// selected by that number modulo the shard count.  Returns false (and does
+// nothing) when tracking is disabled.
+bool OpTracker::register_inflight_op(TrackedOp *i)
+{
+  if (!tracking_enabled) {
+    return false;
+  }
+
+  RWLock::RLocker l(lock);
+  const uint64_t current_seq = ++seq;
+  ShardedTrackingData* sdata =
+    sharded_in_flight_list[current_seq % num_optracker_shards];
+  ceph_assert(NULL != sdata);
+
+  std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+  sdata->ops_in_flight_sharded.push_back(*i);
+  i->seq = current_seq;
+  return true;
+}
+
+// Removes `i` from the shard it was registered in; the shard is recomputed
+// from the op's sequence number.  The op's state must be non-zero (asserted).
+void OpTracker::unregister_inflight_op(TrackedOp* const i)
+{
+  // caller checks;
+  ceph_assert(i->state);
+
+  ShardedTrackingData* sdata =
+    sharded_in_flight_list[i->seq % num_optracker_shards];
+  ceph_assert(NULL != sdata);
+
+  std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+  sdata->ops_in_flight_sharded.erase(
+    sdata->ops_in_flight_sharded.iterator_to(*i));
+}
+
+// Hands the op over to the history, stamped with the current clock reading.
+void OpTracker::record_history_op(TrackedOpRef&& i)
+{
+  RWLock::RLocker l(lock);
+  const utime_t stamp = ceph_clock_now();
+  history.insert(stamp, std::move(i));
+}
+
+// Calls `visit` on the ops in flight, shard by shard, until it returns false.
+//
+// *oldest_secs receives the age of the oldest op found at the front of any
+// shard.  Returns false, without calling the visitor, when tracking is
+// disabled, when nothing is in flight, or when the oldest op is younger than
+// complaint_time.  In the first two cases *oldest_secs is left untouched.
+bool OpTracker::visit_ops_in_flight(utime_t* oldest_secs,
+                                    std::function<bool(TrackedOp&)>&& visit)
+{
+  if (!tracking_enabled) {
+    return false;
+  }
+
+  const utime_t now = ceph_clock_now();
+  utime_t oldest_op = now;
+  // Single representation of all in-flight operations, gathered from the
+  // shards.  Holding TrackedOpRefs keeps the ops alive outside the critical
+  // section, so the visitor can be called with no lock held.  That costs
+  // extra moves and atomic ref-counting, which is acceptable because
+  // `visit_ops_in_flight()` is not meant for any hot path.
+  std::vector<TrackedOpRef> ops_in_flight;
+
+  RWLock::RLocker l(lock);
+  for (const auto sdata : sharded_in_flight_list) {
+    ceph_assert(sdata);
+    std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+
+    auto& shard_ops = sdata->ops_in_flight_sharded;
+    if (!shard_ops.empty()) {
+      utime_t shard_oldest = shard_ops.front().get_initiated();
+      if (shard_oldest < oldest_op) {
+        oldest_op = shard_oldest;
+      }
+    }
+    for (TrackedOp& op : shard_ops) {
+      ops_in_flight.push_back(TrackedOpRef(&op));
+    }
+  }
+
+  if (ops_in_flight.empty()) {
+    return false;
+  }
+
+  *oldest_secs = now - oldest_op;
+  dout(10) << "ops_in_flight.size: " << ops_in_flight.size()
+           << "; oldest is " << *oldest_secs
+           << " seconds old" << dendl;
+
+  if (*oldest_secs < complaint_time) {
+    return false;
+  }
+
+  // Neither `lock` nor any `ops_in_flight_lock_sharded` may be held while
+  // calling the visitor.  Otherwise `OSD::get_health_metrics()` can
+  // dead-lock due to the `~TrackedOp()` calling `record_history_op()`
+  // or `unregister_inflight_op()`.
+  l.unlock();
+  for (auto& op : ops_in_flight) {
+    const bool keep_going = visit(*op);
+    if (!keep_going) {
+      break;
+    }
+  }
+  return true;
+}
+
+// Visits the in-flight ops that are older than complaint_time, counting the
+// slow ones and calling `on_warn` for those that are due for a warning.
+//
+// @param oldest_secs     passed on to visit_ops_in_flight(), which writes
+//                        the age of the oldest op
+// @param num_slow_ops    if non-null, receives the number of slow ops
+// @param num_warned_ops  if non-null, receives how many ops were passed to
+//                        on_warn
+// @param on_warn         called for each op to warn about; at most
+//                        log_threshold ops are reported
+// @returns the result of visit_ops_in_flight(); the counters are written
+//          only when it is true
+//
+// Ops whose warn_interval_multiplier is 0 are skipped entirely.
+bool OpTracker::with_slow_ops_in_flight(utime_t* oldest_secs,
+                                        int* num_slow_ops,
+                                        int* num_warned_ops,
+                                        std::function<void(TrackedOp&)>&& on_warn)
+{
+  const utime_t now = ceph_clock_now();
+  auto too_old = now;
+  too_old -= complaint_time;
+  int slow = 0;
+  int warned = 0;
+  auto check = [&](TrackedOp& op) {
+    if (op.get_initiated() >= too_old) {
+      // no more slow ops in flight
+      return false;
+    }
+    if (!op.warn_interval_multiplier)
+      return true;
+    slow++;
+    if (warned >= log_threshold) {
+      // enough samples of slow ops
+      return true;
+    }
+    auto time_to_complain = (op.get_initiated() +
+                             complaint_time * op.warn_interval_multiplier);
+    if (time_to_complain >= now) {
+      // complain later if the op is still in flight
+      return true;
+    }
+    // will warn, increase counter
+    warned++;
+    on_warn(op);
+    return true;
+  };
+  if (!visit_ops_in_flight(oldest_secs, check)) {
+    return false;
+  }
+  // Check each out-parameter on its own.  The original tested only
+  // num_slow_ops before writing through both pointers, so a null
+  // num_warned_ops was dereferenced.
+  if (num_slow_ops) {
+    *num_slow_ops = slow;
+  }
+  if (num_warned_ops) {
+    *num_warned_ops = warned;
+  }
+  return true;
+}
+
+// Collects warnings about slow requests.
+//
+// @param summary       receives a one-line summary when slow ops were found
+// @param warnings      one "slow request ..." line is appended per warned op
+// @param num_slow_ops  if non-null, receives the number of slow ops
+// @returns true when at least one slow op was counted, false otherwise
+//
+// Every op that is warned about has its warn_interval_multiplier doubled.
+bool OpTracker::check_ops_in_flight(std::string* summary,
+                                    std::vector<string> &warnings,
+                                    int *num_slow_ops)
+{
+  // The original also computed an unused `too_old` cutoff here;
+  // with_slow_ops_in_flight() derives its own, so the local was dropped.
+  const utime_t now = ceph_clock_now();
+  int warned = 0;
+  utime_t oldest_secs;
+  auto warn_on_slow_op = [&](TrackedOp& op) {
+    stringstream ss;
+    utime_t age = now - op.get_initiated();
+    ss << "slow request " << age << " seconds old, received at "
+       << op.get_initiated() << ": " << op.get_desc()
+       << " currently "
+       << op.state_string();
+    warnings.push_back(ss.str());
+    // only those that have been shown will backoff
+    op.warn_interval_multiplier *= 2;
+  };
+  int slow = 0;
+  if (with_slow_ops_in_flight(&oldest_secs, &slow, &warned, warn_on_slow_op) &&
+      slow > 0) {
+    stringstream ss;
+    ss << slow << " slow requests, "
+       << warned << " included below; oldest blocked for > "
+       << oldest_secs << " secs";
+    *summary = ss.str();
+    if (num_slow_ops) {
+      *num_slow_ops = slow;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Resets `h` and adds the age, in milliseconds, of every op currently in
+// flight.  Each shard is walked under its own lock.
+void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
+{
+  h->clear();
+  utime_t now = ceph_clock_now();
+
+  for (uint32_t shard = 0; shard < num_optracker_shards; shard++) {
+    ShardedTrackingData* sdata = sharded_in_flight_list[shard];
+    ceph_assert(NULL != sdata);
+    std::lock_guard locker(sdata->ops_in_flight_lock_sharded);
+
+    for (auto& op : sdata->ops_in_flight_sharded) {
+      utime_t age = now - op.get_initiated();
+      uint32_t age_ms = (long)(age * 1000.0);
+      h->add(age_ms);
+    }
+  }
+}
+
+
+#undef dout_context
+#define dout_context tracker->cct
+
+// Records `event` with timestamp `stamp` in the op's event list (under the
+// op's lock), logs it at level 6 and then calls the _event_marked() hook.
+// Does nothing while `state` is zero.
+void TrackedOp::mark_event(std::string_view event, utime_t stamp)
+{
+  if (state) {
+    {
+      std::lock_guard l(lock);
+      events.emplace_back(stamp, event);
+    }
+    dout(6) << " seq: " << seq
+            << ", time: " << stamp
+            << ", event: " << event
+            << ", op: " << get_desc()
+            << dendl;
+    _event_marked();
+  }
+}
+
+// Writes the op's description, initiation time, age relative to `now` and
+// duration into `f`, followed by a "type_data" section filled in by _dump().
+void TrackedOp::dump(utime_t now, Formatter *f) const
+{
+  // Ignore if still in the constructor
+  if (!state) {
+    return;
+  }
+
+  f->dump_string("description", get_desc());
+  f->dump_stream("initiated_at") << get_initiated();
+  f->dump_float("age", now - get_initiated());
+  f->dump_float("duration", get_duration());
+
+  f->open_object_section("type_data");
+  _dump(f);
+  f->close_section();
+}
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
new file mode 100644
index 00000000..a502f490
--- /dev/null
+++ b/src/common/TrackedOp.h
@@ -0,0 +1,396 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 New Dream Network/Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef TRACKEDREQUEST_H_
+#define TRACKEDREQUEST_H_
+
+#include <atomic>
+#include <mutex>
+#include <utility>
+#include "common/histogram.h"
+#include "common/RWLock.h"
+#include "common/Thread.h"
+#include "common/Clock.h"
+#include "common/ceph_mutex.h"
+#include "include/spinlock.h"
+#include "msg/Message.h"
+
+#define OPTRACKER_PREALLOC_EVENTS 20
+
+// Forward declarations; both classes are defined further down in this header.
+class OpHistory;
+class TrackedOp;
+
+/// Reference-counted handle to a TrackedOp.
+using TrackedOpRef = boost::intrusive_ptr<TrackedOp>;
+
+/**
+ * Service thread owned by an OpHistory.
+ *
+ * OpHistory::insert() hands ops to insert_op(), which appends them to
+ * _external_queue under queue_spinlock.  The thread body, entry(), and
+ * break_thread() are defined out of line.
+ */
+class OpHistoryServiceThread : public Thread
+{
+private:
+  list<pair<utime_t, TrackedOpRef>> _external_queue;  ///< queued (timestamp, op) pairs
+  OpHistory* _ophistory;                              ///< the history that created us
+  mutable ceph::spinlock queue_spinlock;              ///< taken around _external_queue updates
+  bool _break_thread;                                 ///< starts false; see break_thread()
+public:
+  explicit OpHistoryServiceThread(OpHistory* parent)
+    : _ophistory(parent),
+      _break_thread(false) { }
+
+  void break_thread();
+
+  /**
+   * Queue @p op together with the timestamp @p now.
+   *
+   * @param now timestamp stored alongside the op
+   * @param op  op reference; the queue takes over this reference
+   */
+  void insert_op(const utime_t& now, TrackedOpRef op) {
+    // RAII guard: the spinlock is released even if emplace_back() throws
+    // (e.g. std::bad_alloc); a manual lock()/unlock() pair would leave it
+    // held forever in that case.
+    std::lock_guard<ceph::spinlock> guard(queue_spinlock);
+    // op is already our private copy, so move it into the queue instead
+    // of paying for another atomic refcount increment/decrement.
+    _external_queue.emplace_back(now, std::move(op));
+  }
+
+  void *entry() override;
+};
+
+
+/**
+ * Container for ops handed over through insert().
+ *
+ * Ops are forwarded to the embedded OpHistoryServiceThread, which is
+ * started by the constructor.  All three sets must be empty by the time
+ * the object is destroyed.
+ */
+class OpHistory {
+  set<pair<utime_t, TrackedOpRef> > arrived;
+  set<pair<double, TrackedOpRef> > duration;
+  set<pair<utime_t, TrackedOpRef> > slow_op;
+  ceph::mutex ops_history_lock = ceph::make_mutex("OpHistory::ops_history_lock");
+  void cleanup(utime_t now);
+  std::atomic_size_t history_size{0};
+  std::atomic_uint32_t history_duration{0};
+  std::atomic_size_t history_slow_op_size{0};
+  std::atomic_uint32_t history_slow_op_threshold{0};
+  std::atomic_bool shutdown{false};
+  // declared last so every member above exists before the thread starts
+  OpHistoryServiceThread opsvc;
+  friend class OpHistoryServiceThread;
+
+public:
+  OpHistory() : opsvc(this) {
+    opsvc.create("OpHistorySvc");
+  }
+
+  ~OpHistory() {
+    ceph_assert(arrived.empty());
+    ceph_assert(duration.empty());
+    ceph_assert(slow_op.empty());
+  }
+
+  /// Pass @p op, stamped with @p now, to the service thread; ops are
+  /// dropped once the shutdown flag is set.
+  void insert(const utime_t& now, TrackedOpRef op)
+  {
+    if (!shutdown.load()) {
+      opsvc.insert_op(now, op);
+    }
+  }
+
+  void _insert_delayed(const utime_t& now, TrackedOpRef op);
+  void dump_ops(utime_t now, Formatter *f, set<string> filters = {""}, bool by_duration=false);
+  void dump_slow_ops(utime_t now, Formatter *f, set<string> filters = {""});
+  void on_shutdown();
+
+  /// Update the history size and duration settings.
+  void set_size_and_duration(size_t new_size, uint32_t new_duration) {
+    history_size.store(new_size);
+    history_duration.store(new_duration);
+  }
+
+  /// Update the slow-op history size and threshold settings.
+  void set_slow_op_size_and_threshold(size_t new_size, uint32_t new_threshold) {
+    history_slow_op_size.store(new_size);
+    history_slow_op_threshold.store(new_threshold);
+  }
+};
+
+struct ShardedTrackingData;
+/**
+ * Registry of in-flight TrackedOps, split over a number of shards, plus an
+ * OpHistory that finished ops are recorded into.
+ */
+class OpTracker {
+  friend class OpHistory;
+  std::atomic<int64_t> seq{0};
+  vector<ShardedTrackingData*> sharded_in_flight_list;
+  OpHistory history;
+  uint32_t num_optracker_shards;
+  float complaint_time;
+  int log_threshold;
+  std::atomic<bool> tracking_enabled;
+  RWLock       lock;
+
+public:
+  CephContext *cct;
+  OpTracker(CephContext *cct_, bool tracking, uint32_t num_shards);
+
+  /// Store the complaint time and log threshold.
+  void set_complaint_and_threshold(float time, int threshold) {
+    complaint_time = time;
+    log_threshold = threshold;
+  }
+  /// Forward the size/duration settings to the op history.
+  void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) {
+    history.set_size_and_duration(new_size, new_duration);
+  }
+  /// Forward the slow-op size/threshold settings to the op history.
+  void set_history_slow_op_size_and_threshold(uint32_t new_size, uint32_t new_threshold) {
+    history.set_slow_op_size_and_threshold(new_size, new_threshold);
+  }
+  /// @return the current value of the tracking flag
+  bool is_tracking() const {
+    return tracking_enabled.load();
+  }
+  /// Set the tracking flag to @p enable.
+  void set_tracking(bool enable) {
+    tracking_enabled.store(enable);
+  }
+  bool dump_ops_in_flight(Formatter *f, bool print_only_blocked = false, set<string> filters = {""});
+  bool dump_historic_ops(Formatter *f, bool by_duration = false, set<string> filters = {""});
+  bool dump_historic_slow_ops(Formatter *f, set<string> filters = {""});
+  bool register_inflight_op(TrackedOp *i);
+  void unregister_inflight_op(TrackedOp *i);
+  void record_history_op(TrackedOpRef&& i);
+
+  void get_age_ms_histogram(pow2_hist_t *h);
+
+  /**
+   * Walk through the ops in flight.
+   *
+   * @param[out] oldest_secs the amount of time since the oldest op was
+   *                         initiated
+   * @param visit a function consuming tracked ops; it returns false when
+   *              it does not want to be fed any more ops
+   * @return True if there are any Ops to warn on, false otherwise
+   */
+  bool visit_ops_in_flight(utime_t* oldest_secs,
+                           std::function<bool(TrackedOp&)>&& visit);
+  /**
+   * Walk through the slow ops in flight.
+   *
+   * @param[out] oldest_secs the amount of time since the oldest op was
+   *                         initiated
+   * @param[out] num_slow_ops total number of slow ops
+   * @param[out] num_warned_ops total number of warned ops
+   * @param on_warn a function invoked with each tracked op that is
+   *                warned about
+   * @return True if there are any Ops to warn on, false otherwise
+   */
+  bool with_slow_ops_in_flight(utime_t* oldest_secs,
+                               int* num_slow_ops,
+                               int* num_warned_ops,
+                               std::function<void(TrackedOp&)>&& on_warn);
+  /**
+   * Look for Ops which are too old, and insert a warning string for each
+   * Op that is too old.
+   *
+   * @param[out] summary a string summarizing the slow Ops
+   * @param[out] warning_strings a vector<string> reference which is filled
+   *             with a warning string for each old Op
+   * @param[out] slow total number of slow ops; may be nullptr
+   * @return True if there are any Ops to warn on, false otherwise
+   */
+  bool check_ops_in_flight(std::string* summary,
+                           std::vector<string> &warning_strings,
+                           int* slow = nullptr);
+
+  void on_shutdown() {
+    history.on_shutdown();
+  }
+  ~OpTracker();
+
+  /**
+   * Build a new request of type T from @p params, start tracking it and,
+   * when tracking is enabled, seed its event list with the receive,
+   * throttle, receive-complete and dispatch stamps taken from @p params.
+   *
+   * @return a T::Ref owning the new request
+   */
+  template <typename T, typename U>
+  typename T::Ref create_request(U params)
+  {
+    typename T::Ref request(new T(params, this));
+    request->tracking_start();
+
+    if (!is_tracking()) {
+      return request;
+    }
+
+    request->mark_event("header_read", params->get_recv_stamp());
+    request->mark_event("throttled", params->get_throttle_stamp());
+    request->mark_event("all_read", params->get_recv_complete_stamp());
+    request->mark_event("dispatched", params->get_dispatch_stamp());
+    return request;
+  }
+};
+
+
+/**
+ * Base class for an operation followed by an OpTracker.
+ *
+ * An op carries a timestamped event list, a lazily generated description
+ * and an intrusive reference count.  Dropping the last reference of a
+ * live op does not necessarily destroy it: see put().
+ */
+class TrackedOp : public boost::intrusive::list_base_hook<> {
+private:
+  friend class OpHistory;
+  friend class OpTracker;
+
+  boost::intrusive::list_member_hook<> tracker_item;
+
+public:
+  using tracked_op_list_t = boost::intrusive::list<
+    TrackedOp,
+    boost::intrusive::member_hook<
+      TrackedOp,
+      boost::intrusive::list_member_hook<>,
+      &TrackedOp::tracker_item> >;
+
+  // disposer for clearing lists, e.g.
+  //   ls.clear_and_dispose(TrackedOp::Putter());
+  struct Putter {
+    void operator()(TrackedOp *op) {
+      op->put();
+    }
+  };
+
+protected:
+  OpTracker *tracker;          ///< the tracker we are associated with
+  std::atomic_int nref = {0};  ///< ref count
+
+  utime_t initiated_at;
+
+  /// One timestamped entry of the event list.
+  struct Event {
+    utime_t stamp;
+    std::string str;
+
+    Event(utime_t t, std::string_view s) : stamp(t), str(s) {}
+
+    /// std::string::compare() of the event name against @p s
+    int compare(const char *s) const {
+      return str.compare(s);
+    }
+
+    const char *c_str() const {
+      return str.c_str();
+    }
+
+    void dump(Formatter *f) const {
+      f->dump_stream("time") << stamp;
+      f->dump_string("event", str);
+    }
+  };
+
+  vector<Event> events;    ///< list of events and their times
+  mutable ceph::mutex lock = ceph::make_mutex("TrackedOp::lock"); ///< to protect the events list
+  uint64_t seq = 0;        ///< a unique value set by the OpTracker
+
+  uint32_t warn_interval_multiplier = 1; ///< limits output of a given op warning
+
+  enum {
+    STATE_UNTRACKED = 0,
+    STATE_LIVE,
+    STATE_HISTORY
+  };
+  atomic<int> state = {STATE_UNTRACKED};
+
+  mutable string desc_str;   ///< protected by lock
+  mutable const char *desc = nullptr;  ///< readable without lock
+  mutable atomic<bool> want_new_desc = {false};
+
+  TrackedOp(OpTracker *_tracker, const utime_t& initiated) :
+    tracker(_tracker),
+    initiated_at(initiated)
+  {
+    events.reserve(OPTRACKER_PREALLOC_EVENTS);
+  }
+
+  /// output any type-specific data you want to get when dump() is called
+  virtual void _dump(Formatter *f) const {}
+  /// if you want something else to happen when events are marked, implement
+  virtual void _event_marked() {}
+  /// return a unique descriptor of the Op; eg the message it's attached to
+  virtual void _dump_op_descriptor_unlocked(ostream& stream) const = 0;
+  /// called when the last non-OpTracker reference is dropped
+  virtual void _unregistered() {}
+
+  virtual bool filter_out(const set<string>& filters) { return true; }
+
+public:
+  ZTracer::Trace osd_trace;
+  ZTracer::Trace pg_trace;
+  ZTracer::Trace store_trace;
+  ZTracer::Trace journal_trace;
+
+  virtual ~TrackedOp() {}
+
+  /// Take one more reference.
+  void get() {
+    nref.fetch_add(1);
+  }
+
+  /**
+   * Drop one reference.
+   *
+   * While more than one reference is left the count is simply decremented.
+   * The final put() depends on the state: an untracked or historic op is
+   * deleted; a live op is marked "done", unregistered, and then either
+   * deleted (tracking disabled) or handed to the tracker's history, which
+   * takes over the remaining reference.
+   */
+  void put() {
+    for (;;) {
+      auto observed = nref.load();
+      if (observed != 1) {
+        // not the last reference: try to drop ours, and start over with
+        // a fresh load if another thread changed the count meanwhile
+        if (nref.compare_exchange_weak(observed, observed - 1)) {
+          return;
+        }
+        continue;
+      }
+
+      switch (state.load()) {
+      case STATE_UNTRACKED:
+        _unregistered();
+        delete this;
+        break;
+
+      case STATE_LIVE:
+        mark_event("done");
+        tracker->unregister_inflight_op(this);
+        _unregistered();
+        if (!tracker->is_tracking()) {
+          delete this;
+        } else {
+          state = TrackedOp::STATE_HISTORY;
+          tracker->record_history_op(
+            TrackedOpRef(this, /* add_ref = */ false));
+        }
+        break;
+
+      case STATE_HISTORY:
+        delete this;
+        break;
+
+      default:
+        ceph_abort();
+      }
+      return;
+    }
+  }
+
+  /// Return the description, (re)generating it under the lock when none
+  /// exists yet or reset_desc() asked for a new one.
+  const char *get_desc() const {
+    if (!desc || want_new_desc.load()) {
+      std::lock_guard desc_guard(lock);
+      _gen_desc();
+    }
+    return desc;
+  }
+private:
+  /// Regenerate desc_str/desc from _dump_op_descriptor_unlocked().
+  void _gen_desc() const {
+    ostringstream out;
+    _dump_op_descriptor_unlocked(out);
+    desc_str = out.str();
+    desc = desc_str.c_str();
+    want_new_desc.store(false);
+  }
+public:
+  /// Request that the next get_desc() regenerates the description.
+  void reset_desc() {
+    want_new_desc.store(true);
+  }
+
+  const utime_t& get_initiated() const {
+    return initiated_at;
+  }
+
+  /// Time from initiation to the "done" event when that is the latest
+  /// event, otherwise time from initiation until now.
+  double get_duration() const {
+    std::lock_guard events_guard(lock);
+    const bool finished =
+      !events.empty() && events.back().compare("done") == 0;
+    if (finished) {
+      return events.back().stamp - get_initiated();
+    }
+    return ceph_clock_now() - get_initiated();
+  }
+
+  void mark_event(std::string_view event, utime_t stamp=ceph_clock_now());
+
+  /// Zero the warn interval multiplier.
+  void mark_nowarn() {
+    warn_interval_multiplier = 0;
+  }
+
+  /// Name of the most recent event, or an empty view without events.
+  virtual std::string_view state_string() const {
+    std::lock_guard events_guard(lock);
+    if (events.empty()) {
+      return std::string_view();
+    }
+    return std::string_view(events.back().str);
+  }
+
+  void dump(utime_t now, Formatter *f) const;
+
+  /// Register with the tracker; on success record the "initiated" event
+  /// and switch to STATE_LIVE.
+  void tracking_start() {
+    if (!tracker->register_inflight_op(this)) {
+      return;
+    }
+    events.emplace_back(initiated_at, "initiated");
+    state = STATE_LIVE;
+  }
+
+  // ref counting via intrusive_ptr, with special behavior on final
+  // put for historical op tracking
+  friend void intrusive_ptr_add_ref(TrackedOp *o) {
+    o->get();
+  }
+  friend void intrusive_ptr_release(TrackedOp *o) {
+    o->put();
+  }
+};
+
+
+#endif
diff --git a/src/common/Tub.h b/src/common/Tub.h
new file mode 100644
index 00000000..73cb1235
--- /dev/null
+++ b/src/common/Tub.h
@@ -0,0 +1,287 @@
+/* Copyright (c) 2010-2015 Stanford University
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef CEPH_COMMON_TUB_H
+#define CEPH_COMMON_TUB_H
+
+/**
+ * A Tub holds an object that may be uninitialized; it allows the allocation of
+ * memory for objects to be separated from its construction and destruction.
+ * When you initially create a Tub its object is uninitialized (and should not
+ * be used). You can call #construct and #destroy to invoke the constructor and
+ * destructor of the embedded object, and #get or #operator-> will return the
+ * embedded object. The embedded object is automatically destroyed when the Tub
+ * is destroyed (if it was ever constructed in the first place).
+ *
+ * Tubs are useful in situations like the following:
+ * - You want to create an array of objects, but the objects need complex
+ *   constructors with multiple arguments.
+ * - You want to create a collection of objects, only some of which will be
+ *   used, and you don't want to pay the cost of constructing objects that will
+ *   never be used.
+ * - You want automatic destruction of an object but don't want to
+ *   heap-allocate the object (as with std::unique_ptr).
+ * - You want a way to return failure from a method without using pointers,
+ *   exceptions, or special values (e.g. -1). The Tub gives you a 'maybe'
+ *   object; it may be empty if a failure occurred.
+ * - You want a singleton, but don't want to deal with heap-allocating an
+ *   object on first use and freeing it later. Instead, just declare your object
+ *   in a tub and do:
+ *      if (!tub) tub.construct();
+ *  - You want optional arguments to a function, but don't want to use pointers
+ *    (i.e. use the Tub's boolean to determine that an argument was passed,
+ *    rather than checking arg != NULL).
+ *
+ * Tub is CopyConstructible if and only if ElementType is CopyConstructible,
+ * and Tub is Assignable if and only if ElementType is Assignable.
+ *
+ * \tparam ElementType
+ *      The type of the object to be stored within the Tub.
+ */
+template<typename ElementType>
+class Tub {
+ public:
+  /// The type of the object to be stored within the Tub.
+  typedef ElementType element_type;
+
+  /**
+   * Default constructor: the object starts off uninitialized.
+   */
+  Tub(): occupied(false) {}
+
+  /**
+   * Construct an occupied Tub, whose contained object is initialized
+   * with a copy of the given object.
+   * \pre
+   *      ElementType is CopyConstructible.
+   * \param other
+   *      Source of the copy.
+   */
+  Tub(const ElementType& other) // NOLINT
+          : occupied(false) {
+    construct(other);
+  }
+
+  /**
+   * Construct an occupied Tub, whose contained object is initialized
+   * with a move of the given object.
+   * \pre
+   *      ElementType is MoveConstructible.
+   * \param other
+   *      Source of the move.
+   */
+  Tub(ElementType&& other) // NOLINT
+          : occupied(false) {
+    construct(std::move(other));
+  }
+
+  /**
+   * Copy constructor.
+   * The object will be initialized if and only if the source of the copy is
+   * initialized.
+   * \pre
+   *      ElementType is CopyConstructible.
+   * \param other
+   *      Source of the copy.
+   */
+  Tub(const Tub<ElementType>& other) // NOLINT
+          : occupied(false) {
+    if (other.occupied) {
+      construct(*other.object); // use ElementType's copy constructor
+    }
+  }
+
+  /**
+   * Move constructor.
+   * The object will be initialized if and only if the source of the move is
+   * initialized.  The source keeps its (moved-from) object; it is not
+   * destroyed here.
+   * \pre
+   *      ElementType is MoveConstructible.
+   * \param other
+   *      Source of the move.
+   */
+  Tub(Tub<ElementType>&& other) // NOLINT
+          : occupied(false) {
+    if (other.occupied)
+      construct(std::move(*other.object)); // use ElementType's move constructor
+  }
+
+  /**
+   * Destructor: destroy the object if it was initialized.
+   */
+  ~Tub() {
+    destroy();
+  }
+
+  /**
+   * Copy assignment: replace the current object with a copy of the
+   * source's.  Result will be uninitialized if source is uninitialized.
+   * \pre
+   *      ElementType is Assignable.
+   */
+  Tub<ElementType>& operator=(const Tub<ElementType>& other) {
+    if (this != &other) {
+      if (other.occupied) {
+        if (occupied) {
+          // GCC >= 4.7 only.  The previous test,
+          // "__GNUC__ >= 4 && __GNUC_MINOR__ >= 7", was false for any later
+          // major release with a minor number below 7 (e.g. GCC 9.3).
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+          *object = *other.object; // use ElementType's assignment
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
+#pragma GCC diagnostic pop
+#endif
+        } else {
+          construct(*other.object);
+        }
+      } else {
+        destroy();
+      }
+    }
+    return *this;
+  }
+
+  /**
+   * Move assignment: replace the current object by moving from the
+   * source's, then destroy the source's object.  Result will be
+   * uninitialized if source is uninitialized.
+   * \pre
+   *      ElementType is Assignable.
+   */
+  Tub<ElementType>& operator=(Tub<ElementType> &&other) {
+    if (this != &other) {
+      if (other.occupied) {
+        if (occupied)
+          *object = std::move(*other.object);
+        else
+          construct(std::move(*other.object));
+        other.destroy();
+      } else {
+        destroy();
+      }
+    }
+    return *this;
+  }
+
+  /**
+   * Assignment from an element: move-assign into the current object if
+   * there is one, otherwise move-construct a new one.  The Tub is always
+   * initialized afterwards.
+   * \pre
+   *      ElementType is Assignable.
+   */
+  Tub<ElementType>& operator=(ElementType &&elt) {
+    if (occupied) {
+      *object = std::move(elt);
+    } else {
+      construct(std::forward<ElementType>(elt));
+    }
+    return *this;
+  }
+
+  /**
+   * Initialize the object.
+   * If the object was already initialized, it will be destroyed first.
+   * \param args
+   *      Arguments to ElementType's constructor.
+   * \return
+   *      A pointer to the newly initialized object.
+   * \post
+   *      The object is initialized.
+   */
+  template<typename... Args>
+  ElementType* construct(Args&&... args) {
+    destroy();
+    // occupied is only set once the constructor has returned, so a
+    // throwing constructor leaves the Tub uninitialized
+    new(object) ElementType(std::forward<Args>(args)...);
+    occupied = true;
+    return object;
+  }
+
+  /**
+   * Destroy the object, leaving the Tub in the same state
+   * as after the no-argument constructor.
+   * If the object was not initialized, this will have no effect.
+   * \post
+   *      The object is uninitialized.
+   */
+  void destroy() {
+    if (occupied) {
+      object->~ElementType();
+      occupied = false;
+    }
+  }
+
+  /// Dereference #get(); the Tub must be initialized.
+  const ElementType& operator*() const {
+    return *get();
+  }
+
+  /// Dereference #get(); the Tub must be initialized.
+  ElementType& operator*() {
+    return *get();
+  }
+
+  /// See #get().
+  const ElementType* operator->() const {
+    return get();
+  }
+
+  /// See #get().
+  ElementType* operator->() {
+    return get();
+  }
+
+  /**
+   * Return a pointer to the object, or nullptr if the object is not
+   * initialized.
+   */
+  ElementType* get() {
+    if (!occupied)
+      return nullptr;
+    return object;
+  }
+
+  /// See #get().
+  const ElementType* get() const {
+    if (!occupied)
+      return nullptr;
+    return object;
+  }
+
+  /**
+   * Return whether the object is initialized.
+   */
+  operator bool() const {
+    return occupied;
+  }
+
+ private:
+  /**
+   * A pointer to where the object is, if it is initialized.
+   * This must directly precede #raw in the struct.
+   */
+  ElementType object[0];
+
+  /**
+   * A storage area to back the object while it is initialized.
+   */
+  char raw[sizeof(ElementType)];
+
+  /**
+   * Whether the object is initialized.
+   */
+  bool occupied;
+};
+
+#endif  // CEPH_COMMON_TUB_H
diff --git a/src/common/WeightedPriorityQueue.h b/src/common/WeightedPriorityQueue.h
new file mode 100644
index 00000000..a05174e8
--- /dev/null
+++ b/src/common/WeightedPriorityQueue.h
@@ -0,0 +1,349 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef WP_QUEUE_H
+#define WP_QUEUE_H
+
+#include "OpQueue.h"
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/rbtree.hpp>
+#include <boost/intrusive/avl_set.hpp>
+
+#include "include/ceph_assert.h"
+
+namespace bi = boost::intrusive;
+
+/// Comparator that orders a bare key of type S against the @c key member
+/// of a T, accepting the two in either argument order.
+template <typename T, typename S>
+class MapKey
+{
+  public:
+  bool operator()(const S lhs, const T &rhs) const
+  {
+    return lhs < rhs.key;
+  }
+  bool operator()(const T &lhs, const S rhs) const
+  {
+    return lhs.key < rhs;
+  }
+};
+
+/// Disposer functor: deletes the object it is called with.
+template <typename T>
+class DelItem
+{
+  public:
+  void operator()(T* doomed)
+  {
+    delete doomed;
+  }
+};
+
+/**
+ * OpQueue implementation built from two internal queues.
+ *
+ * Items in the "strict" queue are always dequeued first, highest priority
+ * first.  Items in the "normal" queue are dequeued by picking a priority
+ * at random, weighted by the priority value, and then accepting or
+ * rejecting that pick with a second random draw that favours cheap items.
+ * Within one priority, classes (K) are served in turn.
+ */
+template <typename T, typename K>
+class WeightedPriorityQueue :  public OpQueue <T, K>
+{
+  private:
+    /// One queued item together with its cost.
+    class ListPair : public bi::list_base_hook<>
+    {
+      public:
+        unsigned cost;
+        T item;
+        ListPair(unsigned c, T&& i) :
+          cost(c),
+          item(std::move(i))
+        {}
+    };
+    /// All queued items of one class, in dequeue order.
+    class Klass : public bi::set_base_hook<>
+    {
+      typedef bi::list<ListPair> ListPairs;
+      typedef typename ListPairs::iterator Lit;
+      public:
+        K key;          // klass
+        ListPairs lp;
+        Klass(K& k) :
+          key(k) {
+        }
+        ~Klass() {
+          lp.clear_and_dispose(DelItem<ListPair>());
+        }
+      friend bool operator< (const Klass &a, const Klass &b)
+        { return a.key < b.key; }
+      friend bool operator> (const Klass &a, const Klass &b)
+        { return a.key > b.key; }
+      friend bool operator== (const Klass &a, const Klass &b)
+        { return a.key == b.key; }
+      /// Queue an item at the front or at the back of this class.
+      void insert(unsigned cost, T&& item, bool front) {
+        if (front) {
+          lp.push_front(*new ListPair(cost, std::move(item)));
+        } else {
+          lp.push_back(*new ListPair(cost, std::move(item)));
+        }
+      }
+      //Get the cost of the next item to dequeue
+      unsigned get_cost() const {
+        ceph_assert(!empty());
+        return lp.begin()->cost;
+      }
+      /// Remove and return the first item; the class must not be empty.
+      T pop() {
+        ceph_assert(!lp.empty());
+        T ret = std::move(lp.begin()->item);
+        lp.erase_and_dispose(lp.begin(), DelItem<ListPair>());
+        return ret;
+      }
+      bool empty() const {
+        return lp.empty();
+      }
+      unsigned get_size() const {
+        return lp.size();
+      }
+      /// Remove every item, walking from the back; when @p out is given
+      /// the items are pushed to its front, so they end up in queue order.
+      /// The class must not be empty when this is called.
+      void filter_class(std::list<T>* out) {
+        for (Lit i = --lp.end();; --i) {
+          if (out) {
+            out->push_front(std::move(i->item));
+          }
+          i = lp.erase_and_dispose(i, DelItem<ListPair>());
+          if (i == lp.begin()) {
+            break;
+          }
+        }
+      }
+    };
+    /// All classes queued at one priority.
+    class SubQueue : public bi::set_base_hook<>
+    {
+      typedef bi::rbtree<Klass> Klasses;
+      typedef typename Klasses::iterator Kit;
+      /// Wrap @c next around to the first class once it runs off the end.
+      void check_end() {
+        if (next == klasses.end()) {
+          next = klasses.begin();
+        }
+      }
+      public:
+        unsigned key;   // priority
+        Klasses klasses;
+        Kit next;       // class that the next pop() takes from
+        SubQueue(unsigned& p) :
+          key(p),
+          next(klasses.begin()) {
+        }
+        ~SubQueue() {
+          klasses.clear_and_dispose(DelItem<Klass>());
+        }
+      friend bool operator< (const SubQueue &a, const SubQueue &b)
+        { return a.key < b.key; }
+      friend bool operator> (const SubQueue &a, const SubQueue &b)
+        { return a.key > b.key; }
+      friend bool operator== (const SubQueue &a, const SubQueue &b)
+        { return a.key == b.key; }
+      bool empty() const {
+        return klasses.empty();
+      }
+      /// Queue an item under class @p cl, creating the class on first use.
+      void insert(K cl, unsigned cost, T&& item, bool front = false) {
+        typename Klasses::insert_commit_data insert_data;
+        std::pair<Kit, bool> ret =
+          klasses.insert_unique_check(cl, MapKey<Klass, K>(), insert_data);
+        if (ret.second) {
+          ret.first = klasses.insert_unique_commit(*new Klass(cl), insert_data);
+          check_end();
+        }
+        ret.first->insert(cost, std::move(item), front);
+      }
+      unsigned get_cost() const {
+        ceph_assert(!empty());
+        return next->get_cost();
+      }
+      /// Take one item from the current class, then move on to the next
+      /// class (dropping the current one if it became empty).
+      T pop() {
+        T ret = next->pop();
+        if (next->empty()) {
+          next = klasses.erase_and_dispose(next, DelItem<Klass>());
+        } else {
+          ++next;
+        }
+        check_end();
+        return ret;
+      }
+      /// Remove class @p cl and all its items, if present.
+      void filter_class(K& cl, std::list<T>* out) {
+        Kit i = klasses.find(cl, MapKey<Klass, K>());
+        if (i != klasses.end()) {
+          i->filter_class(out);
+          // Decide this before erasing: afterwards i refers to a node
+          // that has been unlinked and deleted.
+          const bool was_next = (next == i);
+          Kit after = klasses.erase_and_dispose(i, DelItem<Klass>());
+          if (was_next) {
+            next = after;
+          }
+          check_end();
+        }
+      }
+      // this is intended for unit tests and should be never used on hot paths
+      unsigned get_size_slow() const {
+        unsigned count = 0;
+        for (const auto& klass : klasses) {
+          count += klass.get_size();
+        }
+        return count;
+      }
+      void dump(ceph::Formatter *f) const {
+        // With no classes, next is the end iterator and must not be
+        // dereferenced; report zero keys instead.
+        if (empty()) {
+          f->dump_int("num_keys", 0);
+          return;
+        }
+        f->dump_int("num_keys", next->get_size());
+        f->dump_int("first_item_cost", next->get_cost());
+      }
+    };
+    /// Set of SubQueues ordered by priority.
+    class Queue {
+      typedef bi::rbtree<SubQueue> SubQueues;
+      typedef typename SubQueues::iterator Sit;
+      SubQueues queues;
+      unsigned total_prio;  // sum of the keys of all sub-queues present
+      unsigned max_cost;    // largest cost ever inserted
+      public:
+        Queue() :
+          total_prio(0),
+          max_cost(0) {
+        }
+        ~Queue() {
+          queues.clear_and_dispose(DelItem<SubQueue>());
+        }
+        bool empty() const {
+          return queues.empty();
+        }
+        /// Queue an item at priority @p p under class @p cl.
+        void insert(unsigned p, K cl, unsigned cost, T&& item, bool front = false) {
+          typename SubQueues::insert_commit_data insert_data;
+          std::pair<typename SubQueues::iterator, bool> ret =
+            queues.insert_unique_check(p, MapKey<SubQueue, unsigned>(), insert_data);
+          if (ret.second) {
+            ret.first = queues.insert_unique_commit(*new SubQueue(p), insert_data);
+            total_prio += p;
+          }
+          ret.first->insert(cl, cost, std::move(item), front);
+          if (cost > max_cost) {
+            max_cost = cost;
+          }
+        }
+        /// Remove and return one item; the queue must not be empty.
+        /// With @p strict the highest priority is always served.
+        T pop(bool strict = false) {
+          Sit i = --queues.end();
+          if (strict) {
+            T ret = i->pop();
+            if (i->empty()) {
+              // insert() added this key to total_prio, so take it out
+              // again; otherwise the total only ever grows for a queue
+              // that is popped strictly and dump() reports a stale value.
+              total_prio -= i->key;
+              queues.erase_and_dispose(i, DelItem<SubQueue>());
+            }
+            return ret;
+          }
+          if (queues.size() > 1) {
+            while (true) {
+              // Pick a new priority out of the total priority.
+              unsigned prio = rand() % total_prio + 1;
+              unsigned tp = total_prio - i->key;
+              // Find the priority corresponding to the picked number.
+              // Subtract high priorities to low priorities until the picked number
+              // is more than the total and try to dequeue that priority.
+              // Reverse the direction from previous implementation because there is a higher
+              // chance of dequeuing a high priority op so spend less time spinning.
+              while (prio <= tp) {
+                --i;
+                tp -= i->key;
+              }
+              // Flip a coin to see if this priority gets to run based on cost.
+              // The next op's cost is multiplied by .9 and subtracted from the
+              // max cost seen. Ops with lower costs will have a larger value
+              // and allow them to be selected easier than ops with high costs.
+              if (max_cost == 0 || rand() % max_cost <=
+                  (max_cost - ((i->get_cost() * 9) / 10))) {
+                break;
+              }
+              i = --queues.end();
+            }
+          }
+          T ret = i->pop();
+          if (i->empty()) {
+            total_prio -= i->key;
+            queues.erase_and_dispose(i, DelItem<SubQueue>());
+          }
+          return ret;
+        }
+        /// Remove class @p cl from every priority, dropping priorities
+        /// that become empty.
+        void filter_class(K& cl, std::list<T>* out) {
+          for (Sit i = queues.begin(); i != queues.end();) {
+            i->filter_class(cl, out);
+            if (i->empty()) {
+              total_prio -= i->key;
+              i = queues.erase_and_dispose(i, DelItem<SubQueue>());
+            } else {
+              ++i;
+            }
+          }
+        }
+        // this is intended for unit tests and should be never used on hot paths
+        unsigned get_size_slow() const {
+          unsigned count = 0;
+          for (const auto& queue : queues) {
+            count += queue.get_size_slow();
+          }
+          return count;
+        }
+        void dump(ceph::Formatter *f) const {
+          for (typename SubQueues::const_iterator i = queues.begin();
+                i != queues.end(); ++i) {
+            f->dump_int("total_priority", total_prio);
+            f->dump_int("max_cost", max_cost);
+            f->open_object_section("subqueue");
+            f->dump_int("priority", i->key);
+            i->dump(f);
+            f->close_section();
+          }
+        }
+    };
+
+    Queue strict;
+    Queue normal;
+  public:
+    /// Both parameters are accepted but not used by this implementation.
+    WeightedPriorityQueue(unsigned max_per, unsigned min_c) :
+      strict(),
+      normal()
+      {
+        std::srand(time(0));
+      }
+    void remove_by_class(K cl, std::list<T>* removed = 0) final {
+      strict.filter_class(cl, removed);
+      normal.filter_class(cl, removed);
+    }
+    bool empty() const final {
+      return strict.empty() && normal.empty();
+    }
+    void enqueue_strict(K cl, unsigned p, T&& item) final {
+      strict.insert(p, cl, 0, std::move(item));
+    }
+    void enqueue_strict_front(K cl, unsigned p, T&& item) final {
+      strict.insert(p, cl, 0, std::move(item), true);
+    }
+    void enqueue(K cl, unsigned p, unsigned cost, T&& item) final {
+      normal.insert(p, cl, cost, std::move(item));
+    }
+    void enqueue_front(K cl, unsigned p, unsigned cost, T&& item) final {
+      normal.insert(p, cl, cost, std::move(item), true);
+    }
+    /// Return the next item, serving the strict queue before the normal
+    /// one; the queue must not be empty.
+    T dequeue() override {
+      ceph_assert(!empty());
+      if (!strict.empty()) {
+        return strict.pop(true);
+      }
+      return normal.pop();
+    }
+    unsigned get_size_slow() {
+      return strict.get_size_slow() + normal.get_size_slow();
+    }
+    void dump(ceph::Formatter *f) const override {
+      f->open_array_section("high_queues");
+      strict.dump(f);
+      f->close_section();
+      f->open_array_section("queues");
+      normal.dump(f);
+      f->close_section();
+    }
+};
+
+#endif
diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc
new file mode 100644
index 00000000..ba757900
--- /dev/null
+++ b/src/common/WorkQueue.cc
@@ -0,0 +1,411 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "WorkQueue.h"
+#include "include/compat.h"
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_tp
+#undef dout_prefix
+#define dout_prefix *_dout << name << " "
+
+
+/// Set up a pool named @p nm whose threads are named @p tn, with a target of
+/// @p n threads. No thread is started here. If @p option is given, it names
+/// the config option tracked for thread-count changes.
+ThreadPool::ThreadPool(CephContext *cct_, string nm, string tn, int n, const char *option)
+  : cct(cct_), name(std::move(nm)), thread_name(std::move(tn)),
+    lockname(name + "::lock"),
+    _lock(ceph::make_mutex(lockname)),  // this should be safe due to declaration order
+    _stop(false),
+    _pause(0),
+    _draining(0),
+    _num_threads(n),
+    processing(0)
+{
+  // _conf_keys is a NULL-terminated array: the option name (if any), then NULL.
+  if (option) {
+    _thread_num_option = option;
+  }
+  const char **keys = new const char*[option ? 2 : 1];
+  const char **tail = keys;
+  if (option) {
+    // points into _thread_num_option, which is not modified afterwards here
+    *tail++ = _thread_num_option.c_str();
+  }
+  *tail = nullptr;
+  _conf_keys = keys;
+}
+
+/// Clear the heartbeat timeout of the worker this handle refers to.
+void ThreadPool::TPHandle::suspend_tp_timeout()
+{
+  cct->get_heartbeat_map()->clear_timeout(hb);
+}
+
+/// Re-arm the heartbeat timeout of this handle's worker with the grace and
+/// suicide_grace values the handle was constructed with.
+void ThreadPool::TPHandle::reset_tp_timeout()
+{
+  cct->get_heartbeat_map()->reset_timeout(
+    hb, grace, suicide_grace);
+}
+
+/// Free the tracked-keys array. Asserts that no worker threads remain in
+/// _threads; stop() is the function in this file that empties that set.
+ThreadPool::~ThreadPool()
+{
+  ceph_assert(_threads.empty());
+  delete[] _conf_keys;
+}
+
+/// Config observer callback: when the tracked thread-count option is in
+/// @p changed, re-read it and, if it is non-negative, adopt it as the new
+/// target thread count.
+void ThreadPool::handle_conf_change(const ConfigProxy& conf,
+				    const std::set <std::string> &changed)
+{
+  if (changed.count(_thread_num_option) == 0) {
+    return;
+  }
+
+  char *raw;
+  const int rc = conf.get_val(_thread_num_option.c_str(), &raw, -1);
+  ceph_assert(rc >= 0);
+  const int wanted = atoi(raw);
+  free(raw);
+  if (wanted < 0) {
+    return;
+  }
+
+  std::lock_guard guard(_lock);
+  _num_threads = wanted;
+  // grow immediately; on shrink the woken workers retire themselves
+  start_threads();
+  _cond.notify_all();
+}
+
+/// Main loop of one pool thread (called from WorkThread::entry()).
+/// _lock is held throughout, except while a work item is being processed and
+/// while blocked in _cond.wait_for().
+void ThreadPool::worker(WorkThread *wt)
+{
+  std::unique_lock ul(_lock);
+  ldout(cct,10) << "worker start" << dendl;
+  
+  // register this thread with the heartbeat map under a per-thread name
+  std::stringstream ss;
+  ss << name << " thread " << (void *)pthread_self();
+  heartbeat_handle_d *hb = cct->get_heartbeat_map()->add_worker(ss.str(), pthread_self());
+
+  while (!_stop) {
+
+    // manage dynamic thread pool
+    join_old_threads();
+    if (_threads.size() > _num_threads) {
+      // pool was shrunk: retire this thread and leave it on _old_threads so
+      // that a later join_old_threads() call joins and deletes it
+      ldout(cct,1) << " worker shutting down; too many threads (" << _threads.size() << " > " << _num_threads << ")" << dendl;
+      _threads.erase(wt);
+      _old_threads.push_back(wt);
+      break;
+    }
+
+    if (!_pause && !work_queues.empty()) {
+      WorkQueue_* wq;
+      // round-robin over the queues, giving up after two visits per queue
+      int tries = 2 * work_queues.size();
+      bool did = false;
+      while (tries--) {
+	next_work_queue %= work_queues.size();
+	wq = work_queues[next_work_queue++];
+	
+	void *item = wq->_void_dequeue();
+	if (item) {
+	  processing++;
+	  ldout(cct,12) << "worker wq " << wq->name << " start processing " << item
+			<< " (" << processing << " active)" << dendl;
+	  TPHandle tp_handle(cct, hb, wq->timeout_interval, wq->suicide_interval);
+	  tp_handle.reset_tp_timeout();
+	  // process without the pool lock; the finish step runs with it re-taken
+	  ul.unlock();
+	  wq->_void_process(item, tp_handle);
+	  ul.lock();
+	  wq->_void_process_finish(item);
+	  processing--;
+	  ldout(cct,15) << "worker wq " << wq->name << " done processing " << item
+			<< " (" << processing << " active)" << dendl;
+	  // wake pause()/drain() callers blocked on _wait_cond
+	  if (_pause || _draining)
+	    _wait_cond.notify_all();
+	  did = true;
+	  break;
+	}
+      }
+      if (did)
+	continue;
+    }
+
+    // nothing was processed this round: re-arm the heartbeat with the default
+    // timeout (suicide grace 0) and sleep until notified or the wait expires
+    ldout(cct,20) << "worker waiting" << dendl;
+    cct->get_heartbeat_map()->reset_timeout(
+      hb,
+      cct->_conf->threadpool_default_timeout,
+      0);
+    auto wait = std::chrono::seconds(
+      cct->_conf->threadpool_empty_queue_max_wait);
+    _cond.wait_for(ul, wait);
+  }
+  ldout(cct,1) << "worker finish" << dendl;
+
+  cct->get_heartbeat_map()->remove_worker(hb);
+}
+
+/// Create and start worker threads until _threads holds _num_threads entries.
+/// The caller must hold _lock.
+void ThreadPool::start_threads()
+{
+  ceph_assert(ceph_mutex_is_locked(_lock));
+  while (_threads.size() < _num_threads) {
+    WorkThread *wt = new WorkThread(this);
+    ldout(cct, 10) << "start_threads creating and starting " << wt << dendl;
+    // recorded in _threads before the thread is actually created
+    _threads.insert(wt);
+
+    wt->create(thread_name.c_str());
+  }
+}
+
+/// Join and delete every retired thread queued on _old_threads.
+/// The caller must hold _lock.
+void ThreadPool::join_old_threads()
+{
+  ceph_assert(ceph_mutex_is_locked(_lock));
+  while (!_old_threads.empty()) {
+    WorkThread *retired = _old_threads.front();
+    ldout(cct, 10) << "join_old_threads joining and deleting " << retired << dendl;
+    retired->join();
+    delete retired;
+    _old_threads.pop_front();
+  }
+}
+
+/// Register as a config observer (when a thread-count option was given) and
+/// start the worker threads.
+void ThreadPool::start()
+{
+  ldout(cct,10) << "start" << dendl;
+
+  if (!_thread_num_option.empty()) {
+    ldout(cct, 10) << " registering config observer on " << _thread_num_option << dendl;
+    cct->_conf.add_observer(this);
+  }
+
+  {
+    std::lock_guard guard(_lock);
+    start_threads();
+  }
+  ldout(cct,15) << "started" << dendl;
+}
+
+/// Stop the pool: unregister the config observer, tell all workers to exit,
+/// join and delete them, then clear every registered work queue.
+/// NOTE(review): @p clear_after is never read; the queues are always cleared.
+void ThreadPool::stop(bool clear_after)
+{
+  ldout(cct,10) << "stop" << dendl;
+
+  if (_thread_num_option.length()) {
+    ldout(cct, 10) << " unregistering config observer on " << _thread_num_option << dendl;
+    cct->_conf.remove_observer(this);
+  }
+
+  _lock.lock();
+  _stop = true;
+  _cond.notify_all();
+  join_old_threads();
+  _lock.unlock();
+  // joined without _lock: workers re-take it when they wake from wait_for()
+  for (set<WorkThread*>::iterator p = _threads.begin();
+       p != _threads.end();
+       ++p) {
+    (*p)->join();
+    delete *p;
+  }
+  _threads.clear();
+  _lock.lock();
+  for (unsigned i=0; i<work_queues.size(); i++)
+    work_queues[i]->_clear();
+  // reset the flag so that a later start() gets running workers again
+  _stop = false;
+  _lock.unlock();
+  ldout(cct,15) << "stopped" << dendl;
+}
+
+/// Increment the pause count and block until no work item is being processed.
+/// Workers do not dequeue new items while the pause count is non-zero.
+void ThreadPool::pause()
+{
+  std::unique_lock ul(_lock);
+  ldout(cct,10) << "pause" << dendl;
+  _pause++;
+  // workers notify _wait_cond after finishing an item while _pause is set
+  while (processing) {
+    _wait_cond.wait(ul);
+  }
+  ldout(cct,15) << "paused" << dendl;
+}
+
+/// Increment the pause count without waiting for in-flight items to finish.
+void ThreadPool::pause_new()
+{
+  ldout(cct,10) << "pause_new" << dendl;
+  std::lock_guard guard(_lock);
+  ++_pause;
+}
+
+/// Decrement the pause count (which must be positive) and wake the workers.
+void ThreadPool::unpause()
+{
+  ldout(cct,10) << "unpause" << dendl;
+  std::lock_guard guard(_lock);
+  ceph_assert(_pause > 0);
+  --_pause;
+  _cond.notify_all();
+}
+
+/// Block until no worker of this pool is processing an item and, when @p wq
+/// is non-null, until @p wq reports itself empty.
+/// Note that `processing` counts items of every queue in the pool.
+void ThreadPool::drain(WorkQueue_* wq)
+{
+  std::unique_lock ul(_lock);
+  ldout(cct,10) << "drain" << dendl;
+  // while _draining is non-zero, workers notify _wait_cond after each item
+  _draining++;
+  while (processing || (wq != NULL && !wq->_empty())) {
+    _wait_cond.wait(ul);
+  }
+  _draining--;
+}
+
+/// Set up a sharded pool of @p pnum_threads threads. No thread is started and
+/// no work queue is attached yet.
+ShardedThreadPool::ShardedThreadPool(CephContext *pcct_, string nm, string tn,
+  uint32_t pnum_threads)
+  : cct(pcct_),
+    name(std::move(nm)),
+    thread_name(std::move(tn)),
+    lockname(name + "::lock"),
+    shardedpool_lock(ceph::make_mutex(lockname)),
+    num_threads(pnum_threads),
+    num_paused(0),
+    num_drained(0),
+    wq(nullptr)
+{
+}
+
+/// Main loop of one sharded worker thread. Until stop_threads is set it
+/// honours pause and drain requests and otherwise calls wq->_process() for
+/// its own @p thread_index.
+void ShardedThreadPool::shardedthreadpool_worker(uint32_t thread_index)
+{
+  ceph_assert(wq != NULL);
+  ldout(cct,10) << "worker start" << dendl;
+
+  // register this thread with the heartbeat map under a per-thread name
+  std::stringstream ss;
+  ss << name << " thread " << (void *)pthread_self();
+  heartbeat_handle_d *hb = cct->get_heartbeat_map()->add_worker(ss.str(), pthread_self());
+
+  while (!stop_threads) {
+    if (pause_threads) {
+      std::unique_lock ul(shardedpool_lock);
+      // report this thread as paused; pause() waits for all threads to do so
+      ++num_paused;
+      wait_cond.notify_all();
+      while (pause_threads) {
+       // keep the heartbeat fresh while parked
+       cct->get_heartbeat_map()->reset_timeout(
+	        hb,
+	        wq->timeout_interval, wq->suicide_interval);
+       shardedpool_cond.wait_for(
+	 ul,
+	 std::chrono::seconds(cct->_conf->threadpool_empty_queue_max_wait));
+      }
+      --num_paused;
+    }
+    if (drain_threads) {
+      std::unique_lock ul(shardedpool_lock);
+      // only count as drained once this thread's shard is empty; otherwise
+      // fall through and keep processing
+      if (wq->is_shard_empty(thread_index)) {
+        ++num_drained;
+        wait_cond.notify_all();
+        while (drain_threads) {
+	  cct->get_heartbeat_map()->reset_timeout(
+	    hb,
+	    wq->timeout_interval, wq->suicide_interval);
+          shardedpool_cond.wait_for(
+	    ul,
+	    std::chrono::seconds(cct->_conf->threadpool_empty_queue_max_wait));
+        }
+        --num_drained;
+      }
+    }
+
+    cct->get_heartbeat_map()->reset_timeout(
+      hb,
+      wq->timeout_interval, wq->suicide_interval);
+    wq->_process(thread_index, hb);
+
+  }
+
+  ldout(cct,10) << "sharded worker finish" << dendl;
+
+  cct->get_heartbeat_map()->remove_worker(hb);
+
+}
+
+/// Create and start worker threads until num_threads exist. Thread indexes
+/// are handed out from 0 on each call. The caller must hold shardedpool_lock.
+void ShardedThreadPool::start_threads()
+{
+  ceph_assert(ceph_mutex_is_locked(shardedpool_lock));
+  for (int32_t idx = 0; threads_shardedpool.size() < num_threads; ++idx) {
+    WorkThreadSharded *wt = new WorkThreadSharded(this, idx);
+    ldout(cct, 10) << "start_threads creating and starting " << wt << dendl;
+    threads_shardedpool.push_back(wt);
+    wt->create(thread_name.c_str());
+  }
+}
+
+/// Start the worker threads of this pool.
+void ShardedThreadPool::start()
+{
+  ldout(cct,10) << "start" << dendl;
+
+  {
+    std::lock_guard guard(shardedpool_lock);
+    start_threads();
+  }
+  ldout(cct,15) << "started" << dendl;
+}
+
+/// Set the stop flag, ask the work queue to return waiting threads, then
+/// join and delete every worker thread.
+void ShardedThreadPool::stop()
+{
+  ldout(cct,10) << "stop" << dendl;
+  stop_threads = true;
+  ceph_assert(wq != NULL);
+  wq->return_waiting_threads();
+  for (WorkThreadSharded *wt : threads_shardedpool) {
+    wt->join();
+    delete wt;
+  }
+  threads_shardedpool.clear();
+  ldout(cct,15) << "stopped" << dendl;
+}
+
+/// Request a pause and block until every worker thread has reported itself
+/// paused (num_paused == num_threads).
+void ShardedThreadPool::pause()
+{
+  std::unique_lock ul(shardedpool_lock);
+  ldout(cct,10) << "pause" << dendl;
+  pause_threads = true;
+  ceph_assert(wq != NULL);
+  wq->return_waiting_threads();
+  // each worker bumps num_paused and notifies wait_cond when it parks
+  while (num_threads != num_paused){
+    wait_cond.wait(ul);
+  }
+  ldout(cct,10) << "paused" << dendl; 
+}
+
+/// Request a pause without waiting for the workers to actually park.
+void ShardedThreadPool::pause_new()
+{
+  ldout(cct,10) << "pause_new" << dendl;
+  {
+    std::lock_guard guard(shardedpool_lock);
+    pause_threads = true;
+    ceph_assert(wq != NULL);
+    wq->return_waiting_threads();
+  }
+  ldout(cct,10) << "paused_new" << dendl;
+}
+
+/// Clear the pause request and wake the parked workers.
+void ShardedThreadPool::unpause()
+{
+  ldout(cct,10) << "unpause" << dendl;
+  {
+    std::lock_guard guard(shardedpool_lock);
+    pause_threads = false;
+    wq->stop_return_waiting_threads();
+    shardedpool_cond.notify_all();
+  }
+  ldout(cct,10) << "unpaused" << dendl;
+}
+
+/// Block until every worker thread has found its shard empty and reported
+/// itself drained, then release the workers again.
+void ShardedThreadPool::drain()
+{
+  std::unique_lock ul(shardedpool_lock);
+  ldout(cct,10) << "drain" << dendl;
+  drain_threads = true;
+  ceph_assert(wq != NULL);
+  wq->return_waiting_threads();
+  // each worker bumps num_drained and notifies wait_cond once its shard is empty
+  while (num_threads != num_drained) {
+    wait_cond.wait(ul);
+  }
+  drain_threads = false;
+  wq->stop_return_waiting_threads();
+  shardedpool_cond.notify_all();
+  ldout(cct,10) << "drained" << dendl;
+}
+
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
new file mode 100644
index 00000000..5259f92d
--- /dev/null
+++ b/src/common/WorkQueue.h
@@ -0,0 +1,751 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_WORKQUEUE_H
+#define CEPH_WORKQUEUE_H
+
+#ifdef WITH_SEASTAR
+// for ObjectStore.h
+// Empty stand-in used when WITH_SEASTAR is defined: it only provides the
+// ThreadPool::TPHandle name.
+struct ThreadPool {
+  struct TPHandle {
+  };
+};
+
+#else
+
+#include <atomic>
+#include <list>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "common/ceph_mutex.h"
+#include "include/unordered_map.h"
+#include "common/config_obs.h"
+#include "common/HeartbeatMap.h"
+#include "common/Thread.h"
+#include "include/Context.h"
+
+class CephContext;
+
+/// Pool of threads that share work submitted to multiple work queues.
+class ThreadPool : public md_config_obs_t {
+protected:
+  CephContext *cct;
+  std::string name;
+  std::string thread_name;
+  std::string lockname;
+  ceph::mutex _lock;
+  ceph::condition_variable _cond;
+  bool _stop;
+  int _pause;
+  int _draining;
+  ceph::condition_variable _wait_cond;
+
+public:
+  /// Handle passed to work-item processing code so that it can manage the
+  /// heartbeat timeout of the worker thread it runs on.
+  class TPHandle {
+    friend class ThreadPool;
+    CephContext *cct;
+    heartbeat_handle_d *hb;                       ///< heartbeat entry of the worker
+    ceph::coarse_mono_clock::rep grace;           ///< passed to reset_timeout()
+    ceph::coarse_mono_clock::rep suicide_grace;   ///< passed to reset_timeout()
+  public:
+    TPHandle(
+      CephContext *cct,
+      heartbeat_handle_d *hb,
+      time_t grace,
+      time_t suicide_grace)
+      : cct(cct), hb(hb), grace(grace), suicide_grace(suicide_grace) {}
+    /// Re-arm the heartbeat timeout with grace / suicide_grace.
+    void reset_tp_timeout();
+    /// Clear the heartbeat timeout.
+    void suspend_tp_timeout();
+  };
+protected:
+
+  /// Basic interface to a work queue used by the worker threads.
+  struct WorkQueue_ {
+    std::string name;
+    /// Heartbeat grace and suicide grace that the worker gives to the
+    /// TPHandle it builds for each item of this queue.
+    time_t timeout_interval, suicide_interval;
+    /// @param n   queue name (used in worker log messages)
+    /// @param ti  timeout_interval
+    /// @param sti suicide_interval
+    WorkQueue_(std::string n, time_t ti, time_t sti)
+      : name(std::move(n)), timeout_interval(ti), suicide_interval(sti)
+    { }
+    virtual ~WorkQueue_() {}
+    /// Remove all work items from the queue.
+    virtual void _clear() = 0;
+    /// Check whether there is anything to do.
+    virtual bool _empty() = 0;
+    /// Get the next work item to process.
+    /// The worker treats a null return as "nothing to do".
+    virtual void *_void_dequeue() = 0;
+    /** @brief Process the work item.
+     * This function will be called several times in parallel
+     * and must therefore be thread-safe. */
+    virtual void _void_process(void *item, TPHandle &handle) = 0;
+    /** @brief Synchronously finish processing a work item.
+     * This function is called after _void_process with the global thread pool lock held,
+     * so at most one copy will execute simultaneously for a given thread pool.
+     * It can be used for non-thread-safe finalization. */
+    virtual void _void_process_finish(void *) = 0;
+  };
+
+  // track thread pool size changes
+  unsigned _num_threads;
+  std::string _thread_num_option;
+  const char **_conf_keys;
+
+  /// Config observer hook: return the NULL-terminated key array that the
+  /// constructor built (the thread-count option, if one was given).
+  const char **get_tracked_conf_keys() const override {
+    return _conf_keys;
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+			  const std::set <std::string> &changed) override;
+
+public:
+  /** @brief Work queue that processes several submitted items at once.
+   * The queue will automatically add itself to the thread pool on construction
+   * and remove itself on destruction. */
+  template<class T>
+  class BatchWorkQueue : public WorkQueue_ {
+    ThreadPool *pool;
+
+    /// Add an item; the result is returned from queue().
+    virtual bool _enqueue(T *) = 0;
+    /// Remove one specific item (called from dequeue()).
+    virtual void _dequeue(T *) = 0;
+    /// Move the next batch of items into the given list.
+    virtual void _dequeue(std::list<T*> *) = 0;
+    /// Optional finish hook for a processed batch; does nothing by default.
+    virtual void _process_finish(const std::list<T*> &) {}
+
+    // virtual methods from WorkQueue_ below
+    /// Collect a batch into a heap-allocated list. Returns the list, or null
+    /// (after freeing it) when nothing was dequeued.
+    void *_void_dequeue() override {
+      std::list<T*> *out(new std::list<T*>);
+      _dequeue(out);
+      if (!out->empty()) {
+	return (void *)out;
+      } else {
+	delete out;
+	return 0;
+      }
+    }
+    void _void_process(void *p, TPHandle &handle) override {
+      _process(*((std::list<T*>*)p), handle);
+    }
+    /// Run the finish hook, then free the list allocated by _void_dequeue().
+    void _void_process_finish(void *p) override {
+      _process_finish(*(std::list<T*>*)p);
+      delete (std::list<T*> *)p;
+    }
+
+  protected:
+    /// Process one batch of items. Called from the worker threads.
+    virtual void _process(const std::list<T*> &items, TPHandle &handle) = 0;
+
+  public:
+    BatchWorkQueue(std::string n, time_t ti, time_t sti, ThreadPool* p)
+      : WorkQueue_(std::move(n), ti, sti), pool(p) {
+      pool->add_work_queue(this);
+    }
+    ~BatchWorkQueue() override {
+      pool->remove_work_queue(this);
+    }
+
+    /// Enqueue an item under the pool lock and wake one worker.
+    bool queue(T *item) {
+      pool->_lock.lock();
+      bool r = _enqueue(item);
+      pool->_cond.notify_one();
+      pool->_lock.unlock();
+      return r;
+    }
+    /// Remove a specific item under the pool lock.
+    void dequeue(T *item) {
+      pool->_lock.lock();
+      _dequeue(item);
+      pool->_lock.unlock();
+    }
+    /// Clear the queue under the pool lock.
+    void clear() {
+      pool->_lock.lock();
+      _clear();
+      pool->_lock.unlock();
+    }
+
+    // thin forwards to the owning pool
+    void lock() {
+      pool->lock();
+    }
+    void unlock() {
+      pool->unlock();
+    }
+    void wake() {
+      pool->wake();
+    }
+    void _wake() {
+      pool->_wake();
+    }
+    void drain() {
+      pool->drain(this);
+    }
+
+  };
+
+  /** @brief Templated by-value work queue.
+   * Skeleton implementation of a queue that processes items submitted by value.
+   * This is useful if the items are single primitive values or very small objects
+   * (a few bytes). The queue will automatically add itself to the thread pool on
+   * construction and remove itself on destruction. */
+  template<typename T, typename U = T>
+  class WorkQueueVal : public WorkQueue_ {
+    /// Protects to_process and to_finish (separate from the pool lock).
+    ceph::mutex _lock = ceph::make_mutex("WorkQueueVal::_lock");
+    ThreadPool *pool;
+    std::list<U> to_process;  ///< dequeued, waiting for _void_process()
+    std::list<U> to_finish;   ///< processed, waiting for _void_process_finish()
+    virtual void _enqueue(T) = 0;
+    virtual void _enqueue_front(T) = 0;
+    bool _empty() override = 0;
+    virtual U _dequeue() = 0;
+    /// Optional finish hook; does nothing by default.
+    virtual void _process_finish(U) {}
+
+    /// Move the next item onto to_process. The returned pointer is only a
+    /// non-null marker telling the worker that there is work to do.
+    void *_void_dequeue() override {
+      {
+	std::lock_guard l(_lock);
+	if (_empty())
+	  return 0;
+	U u = _dequeue();
+	to_process.push_back(u);
+      }
+      return ((void*)1); // Not used
+    }
+    /// Take the front of to_process, process it without holding _lock, then
+    /// append it to to_finish.
+    void _void_process(void *, TPHandle &handle) override {
+      _lock.lock();
+      ceph_assert(!to_process.empty());
+      U u = to_process.front();
+      to_process.pop_front();
+      _lock.unlock();
+
+      _process(u, handle);
+
+      _lock.lock();
+      to_finish.push_back(u);
+      _lock.unlock();
+    }
+
+    /// Take the front of to_finish and run the finish hook on it.
+    void _void_process_finish(void *) override {
+      _lock.lock();
+      ceph_assert(!to_finish.empty());
+      U u = to_finish.front();
+      to_finish.pop_front();
+      _lock.unlock();
+
+      _process_finish(u);
+    }
+
+    /// Intentionally empty: this class clears nothing itself.
+    void _clear() override {}
+
+  public:
+    WorkQueueVal(std::string n, time_t ti, time_t sti, ThreadPool *p)
+      : WorkQueue_(std::move(n), ti, sti), pool(p) {
+      pool->add_work_queue(this);
+    }
+    ~WorkQueueVal() override {
+      pool->remove_work_queue(this);
+    }
+    /// Enqueue an item under the pool lock and wake one worker.
+    void queue(T item) {
+      std::lock_guard l(pool->_lock);
+      _enqueue(item);
+      pool->_cond.notify_one();
+    }
+    /// Like queue(), but uses _enqueue_front().
+    void queue_front(T item) {
+      std::lock_guard l(pool->_lock);
+      _enqueue_front(item);
+      pool->_cond.notify_one();
+    }
+    void drain() {
+      pool->drain(this);
+    }
+  protected:
+    void lock() {
+      pool->lock();
+    }
+    void unlock() {
+      pool->unlock();
+    }
+    /// Process one item. Called from the worker threads.
+    virtual void _process(U u, TPHandle &) = 0;
+  };
+
+  /** @brief Template by-pointer work queue.
+   * Skeleton implementation of a queue that processes items of a given type submitted as pointers.
+   * This is useful when the work item are large or include dynamically allocated memory. The queue
+   * will automatically add itself to the thread pool on construction and remove itself on
+   * destruction. */
+  template<class T>
+  class WorkQueue : public WorkQueue_ {
+    ThreadPool *pool;
+    
+    /// Add a work item to the queue.
+    virtual bool _enqueue(T *) = 0;
+    /// Dequeue a previously submitted work item.
+    virtual void _dequeue(T *) = 0;
+    /// Dequeue a work item and return the original submitted pointer.
+    virtual T *_dequeue() = 0;
+    /// Optional finish hook for a processed item; does nothing by default.
+    virtual void _process_finish(T *) {}
+
+    // implementation of virtual methods from WorkQueue_
+    // (these only convert between void* and T* and forward)
+    void *_void_dequeue() override {
+      return (void *)_dequeue();
+    }
+    void _void_process(void *p, TPHandle &handle) override {
+      _process(static_cast<T *>(p), handle);
+    }
+    void _void_process_finish(void *p) override {
+      _process_finish(static_cast<T *>(p));
+    }
+
+  protected:
+    /// Process a work item. Called from the worker threads.
+    virtual void _process(T *t, TPHandle &) = 0;
+
+  public:
+    WorkQueue(std::string n, time_t ti, time_t sti, ThreadPool* p)
+      : WorkQueue_(std::move(n), ti, sti), pool(p) {
+      pool->add_work_queue(this);
+    }
+    ~WorkQueue() override {
+      pool->remove_work_queue(this);
+    }
+    
+    /// Enqueue an item under the pool lock and wake one worker.
+    bool queue(T *item) {
+      pool->_lock.lock();
+      bool r = _enqueue(item);
+      pool->_cond.notify_one();
+      pool->_lock.unlock();
+      return r;
+    }
+    /// Remove a specific item under the pool lock.
+    void dequeue(T *item) {
+      pool->_lock.lock();
+      _dequeue(item);
+      pool->_lock.unlock();
+    }
+    /// Clear the queue under the pool lock.
+    void clear() {
+      pool->_lock.lock();
+      _clear();
+      pool->_lock.unlock();
+    }
+
+    void lock() {
+      pool->lock();
+    }
+    void unlock() {
+      pool->unlock();
+    }
+    /// wake up the thread pool (without lock held)
+    void wake() {
+      pool->wake();
+    }
+    /// wake up the thread pool (with lock already held)
+    void _wake() {
+      pool->_wake();
+    }
+    /// wait on the pool's condition variable (forwards to ThreadPool::_wait())
+    void _wait() {
+      pool->_wait();
+    }
+    void drain() {
+      pool->drain(this);
+    }
+
+  };
+
+  /// Work queue of T* items kept in an internal list. Unlike the queues
+  /// above it does not add itself to the pool on construction; a subclass
+  /// calls register_work_queue() for that.
+  template<typename T>
+  class PointerWQ : public WorkQueue_ {
+  public:
+    /// NOTE(review): remove_work_queue() is called here whether or not
+    /// register_work_queue() was ever called.
+    ~PointerWQ() override {
+      m_pool->remove_work_queue(this);
+      ceph_assert(m_processing == 0);
+    }
+    void drain() {
+      {
+        // if this queue is empty and not processing, don't wait for other
+        // queues to finish processing
+        std::lock_guard l(m_pool->_lock);
+        if (m_processing == 0 && m_items.empty()) {
+          return;
+        }
+      }
+      m_pool->drain(this);
+    }
+    /// Append an item under the pool lock and wake one worker.
+    void queue(T *item) {
+      std::lock_guard l(m_pool->_lock);
+      m_items.push_back(item);
+      m_pool->_cond.notify_one();
+    }
+    /// Locked variant of _empty().
+    bool empty() {
+      std::lock_guard l(m_pool->_lock);
+      return _empty();
+    }
+  protected:
+    PointerWQ(std::string n, time_t ti, time_t sti, ThreadPool* p)
+      : WorkQueue_(std::move(n), ti, sti), m_pool(p), m_processing(0) {
+    }
+    /// Add this queue to the pool.
+    void register_work_queue() {
+      m_pool->add_work_queue(this);
+    }
+    // the underscore-prefixed overrides below require the pool lock to be held
+    void _clear() override {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      m_items.clear();
+    }
+    bool _empty() override {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      return m_items.empty();
+    }
+    /// Pop the front item and count it as in-flight; null when empty.
+    void *_void_dequeue() override {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      if (m_items.empty()) {
+        return NULL;
+      }
+
+      ++m_processing;
+      T *item = m_items.front();
+      m_items.pop_front();
+      return item;
+    }
+    /// Forward to process(); the TPHandle is not passed on.
+    void _void_process(void *item, ThreadPool::TPHandle &handle) override {
+      process(reinterpret_cast<T *>(item));
+    }
+    /// Drop one in-flight count; the item pointer itself is not used.
+    void _void_process_finish(void *item) override {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      ceph_assert(m_processing > 0);
+      --m_processing;
+    }
+
+    /// Process one item. Called from the worker threads.
+    virtual void process(T *item) = 0;
+    /// Take the pool lock and drop one in-flight count.
+    void process_finish() {
+      std::lock_guard locker(m_pool->_lock);
+      _void_process_finish(nullptr);
+    }
+
+    /// Peek at the front item without removing it; null when empty.
+    T *front() {
+      ceph_assert(ceph_mutex_is_locked(m_pool->_lock));
+      if (m_items.empty()) {
+        return NULL;
+      }
+      return m_items.front();
+    }
+    /// Drop one in-flight count and put the item back at the head.
+    void requeue_front(T *item) {
+      std::lock_guard pool_locker(m_pool->_lock);
+      requeue_front(pool_locker, item);
+    }
+    // The lock_guard argument of the overloads below is not used in the body;
+    // presumably it documents that the caller holds the pool lock.
+    void requeue_front(const std::lock_guard<ceph::mutex>&, T *item) {
+      _void_process_finish(nullptr);
+      m_items.push_front(item);
+    }
+    /// Drop one in-flight count and put the item back at the tail.
+    void requeue_back(T *item) {
+      std::lock_guard pool_locker(m_pool->_lock);
+      requeue_back(pool_locker, item);
+    }
+    void requeue_back(const std::lock_guard<ceph::mutex>&, T *item) {
+      _void_process_finish(nullptr);
+      m_items.push_back(item);
+    }
+    /// Wake one worker of the pool.
+    void signal() {
+      std::lock_guard pool_locker(m_pool->_lock);
+      signal(pool_locker);
+    }
+    void signal(const std::lock_guard<ceph::mutex>&) {
+      m_pool->_cond.notify_one();
+    }
+    ceph::mutex &get_pool_lock() {
+      return m_pool->_lock;
+    }
+  private:
+    ThreadPool *m_pool;
+    std::list<T *> m_items;   ///< queued, not yet dequeued by a worker
+    uint32_t m_processing;    ///< items dequeued but not yet finished
+  };
+protected:
+  std::vector<WorkQueue_*> work_queues;
+  int next_work_queue = 0;
+ 
+
+  // threads
+  /// One pool thread; its entry point runs ThreadPool::worker() on the
+  /// owning pool.
+  struct WorkThread : public Thread {
+    ThreadPool *pool;
+    // cppcheck-suppress noExplicitConstructor
+    WorkThread(ThreadPool *p) : pool(p) {}
+    void *entry() override {
+      pool->worker(this);
+      return 0;
+    }
+  };
+  
+  std::set<WorkThread*> _threads;
+  std::list<WorkThread*> _old_threads;  ///< need to be joined
+  int processing;
+
+  void start_threads();
+  void join_old_threads();
+  virtual void worker(WorkThread *wt);
+
+public:
+  ThreadPool(CephContext *cct_, std::string nm, std::string tn, int n, const char *option = NULL);
+  ~ThreadPool() override;
+
+  /// return the configured target number of threads (_num_threads), which
+  /// is not necessarily the number of threads currently in _threads
+  int get_num_threads() {
+    std::lock_guard l(_lock);
+    return _num_threads;
+  }
+  
+  /// assign a work queue to this thread pool
+  void add_work_queue(WorkQueue_* wq) {
+    std::lock_guard l(_lock);
+    work_queues.push_back(wq);
+  }
+  /// remove a work queue from this thread pool
+  ///
+  /// The remaining queues keep their relative order. Removing a queue that
+  /// was never added is a no-op: ~PointerWQ() calls this unconditionally,
+  /// even when register_work_queue() was never called, and the previous
+  /// unbounded search then read past the end of the vector.
+  void remove_work_queue(WorkQueue_* wq) {
+    std::lock_guard l(_lock);
+    for (auto it = work_queues.begin(); it != work_queues.end(); ++it) {
+      if (*it == wq) {
+        work_queues.erase(it);  // shifts the later queues down by one
+        return;
+      }
+    }
+  }
+
+  /// take thread pool lock
+  void lock() {
+    _lock.lock();
+  }
+  /// release thread pool lock
+  void unlock() {
+    _lock.unlock();
+  }
+
+  /// wait for a kick on this thread pool
+  ///
+  /// The caller must hold the pool lock (see lock()); it is released while
+  /// waiting on @p c and is held again when this returns.
+  void wait(ceph::condition_variable &c) {
+    std::unique_lock l(_lock, std::adopt_lock);
+    c.wait(l);
+    // Hand ownership back to the caller: without this the unique_lock
+    // destructor would unlock _lock and the caller's unlock() would then
+    // unlock a mutex it no longer holds.
+    l.release();
+  }
+
+  /// wake up a waiter (with lock already held)
+  void _wake() {
+    _cond.notify_all();
+  }
+  /// wake up a waiter (without lock held)
+  void wake() {
+    std::lock_guard l(_lock);
+    _cond.notify_all();
+  }
+  /// wait on the pool's own condition variable (with lock already held)
+  ///
+  /// The pool lock is released while waiting and is held again on return.
+  void _wait() {
+    std::unique_lock l(_lock, std::adopt_lock);
+    _cond.wait(l);
+    // Hand ownership back to the caller: without this the unique_lock
+    // destructor would unlock _lock behind the caller's back.
+    l.release();
+  }
+
+  /// start thread pool thread
+  void start();
+  /// stop thread pool thread
+  void stop(bool clear_after=true);
+  /// pause thread pool (if it not already paused)
+  void pause();
+  /// pause initiation of new work
+  void pause_new();
+  /// resume work in thread pool.  must match each pause() call 1:1 to resume.
+  void unpause();
+  /** @brief Wait until work completes.
+   * If the parameter is NULL, blocks until all threads are idle.
+   * If it is not NULL, blocks until the given work queue does not have
+   * any items left to process. */
+  void drain(WorkQueue_* wq = 0);
+};
+
+/// By-value work queue of GenContext<TPHandle&> pointers kept in a FIFO list.
+/// Processing an item calls complete() on it with the worker's TPHandle.
+class GenContextWQ :
+  public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
+  std::list<GenContext<ThreadPool::TPHandle&>*> _queue;
+public:
+  /// @param ti timeout interval; the suicide interval is set to ti*10
+  GenContextWQ(const std::string &name, time_t ti, ThreadPool *tp)
+    : ThreadPool::WorkQueueVal<
+      GenContext<ThreadPool::TPHandle&>*>(name, ti, ti*10, tp) {}
+  
+  void _enqueue(GenContext<ThreadPool::TPHandle&> *c) override {
+    _queue.push_back(c);
+  }
+  void _enqueue_front(GenContext<ThreadPool::TPHandle&> *c) override {
+    _queue.push_front(c);
+  }
+  bool _empty() override {
+    return _queue.empty();
+  }
+  /// Pop the front context; the queue must not be empty.
+  GenContext<ThreadPool::TPHandle&> *_dequeue() override {
+    ceph_assert(!_queue.empty());
+    GenContext<ThreadPool::TPHandle&> *c = _queue.front();
+    _queue.pop_front();
+    return c;
+  }
+  void _process(GenContext<ThreadPool::TPHandle&> *c,
+		ThreadPool::TPHandle &tp) override {
+    c->complete(tp);
+  }
+};
+
+/// Context whose finish() queues a stored GenContext on a GenContextWQ.
+/// The int passed to finish() is ignored.
+class C_QueueInWQ : public Context {
+  GenContextWQ *wq;
+  GenContext<ThreadPool::TPHandle&> *c;
+public:
+  C_QueueInWQ(GenContextWQ *wq, GenContext<ThreadPool::TPHandle &> *c)
+    : wq(wq), c(c) {}
+  void finish(int) override {
+    wq->queue(c);
+  }
+};
+
+/// Work queue that asynchronously completes contexts (executes callbacks).
+/// @see Finisher
+class ContextWQ : public ThreadPool::PointerWQ<Context> {
+  using Base = ThreadPool::PointerWQ<Context>;
+public:
+  /// Build the queue (suicide interval 0) and add it to @p tp.
+  ContextWQ(const std::string &name, time_t ti, ThreadPool *tp)
+    : Base(name, ti, 0, tp) {
+    this->register_work_queue();
+  }
+
+  /// Queue @p ctx for completion. A non-zero @p result is remembered and
+  /// handed to ctx->complete() later; otherwise the context completes with 0.
+  void queue(Context *ctx, int result = 0) {
+    if (result != 0) {
+      std::lock_guard locker(m_lock);
+      m_context_results[ctx] = result;
+    }
+    Base::queue(ctx);
+  }
+protected:
+  /// Drop all queued contexts and all remembered results.
+  void _clear() override {
+    Base::_clear();
+
+    std::lock_guard locker(m_lock);
+    m_context_results.clear();
+  }
+
+  /// Complete @p ctx with its remembered result, or with 0 if none is stored.
+  void process(Context *ctx) override {
+    int result = 0;
+    {
+      std::lock_guard locker(m_lock);
+      auto found = m_context_results.find(ctx);
+      if (found != m_context_results.end()) {
+        result = found->second;
+        m_context_results.erase(found);
+      }
+    }
+    // completed outside m_lock
+    ctx->complete(result);
+  }
+private:
+  ceph::mutex m_lock = ceph::make_mutex("ContextWQ::m_lock");
+  /// non-zero results of queued contexts, keyed by context pointer
+  ceph::unordered_map<Context*, int> m_context_results;
+};
+
+/// Thread pool in which each worker thread is given its own thread index and
+/// repeatedly calls the attached work queue's _process() with that index.
+class ShardedThreadPool {
+
+  CephContext *cct;
+  std::string name;
+  std::string thread_name;
+  std::string lockname;
+  ceph::mutex shardedpool_lock;
+  ceph::condition_variable shardedpool_cond;  ///< parked workers wait here
+  ceph::condition_variable wait_cond;         ///< pause()/drain() callers wait here
+  uint32_t num_threads;
+
+  // request flags polled by the workers
+  std::atomic<bool> stop_threads = { false };
+  std::atomic<bool> pause_threads = { false };
+  std::atomic<bool> drain_threads = { false };
+
+  uint32_t num_paused;   ///< workers currently parked for a pause
+  uint32_t num_drained;  ///< workers currently parked for a drain
+
+public:
+
+  /// Interface the pool needs from its work queue.
+  class BaseShardedWQ {
+  
+  public:
+    /// Passed to the heartbeat map by the workers as grace / suicide grace.
+    time_t timeout_interval, suicide_interval;
+    BaseShardedWQ(time_t ti, time_t sti):timeout_interval(ti), suicide_interval(sti) {}
+    virtual ~BaseShardedWQ() {}
+
+    /// Called in a loop by the worker with the given thread index.
+    virtual void _process(uint32_t thread_index, heartbeat_handle_d *hb ) = 0;
+    /// Called by the pool when it requests a stop, pause or drain.
+    virtual void return_waiting_threads() = 0;
+    /// Called by the pool when a pause or drain ends.
+    virtual void stop_return_waiting_threads() = 0;
+    /// Asked by a worker during drain to decide whether it may park.
+    virtual bool is_shard_empty(uint32_t thread_index) = 0;
+  };      
+
+  /// Typed work queue; it attaches itself to the pool on construction.
+  template <typename T>
+  class ShardedWQ: public BaseShardedWQ {
+  
+    ShardedThreadPool* sharded_pool;
+
+  protected:
+    virtual void _enqueue(T&&) = 0;
+    virtual void _enqueue_front(T&&) = 0;
+
+
+  public:
+    ShardedWQ(time_t ti, time_t sti, ShardedThreadPool* tp): BaseShardedWQ(ti, sti), 
+                                                                 sharded_pool(tp) {
+      tp->set_wq(this);
+    }
+    ~ShardedWQ() override {}
+
+    // queue()/queue_front() forward directly to the subclass; no pool lock
+    // is taken here
+    void queue(T&& item) {
+      _enqueue(std::move(item));
+    }
+    void queue_front(T&& item) {
+      _enqueue_front(std::move(item));
+    }
+    void drain() {
+      sharded_pool->drain();
+    }
+    
+  };
+
+private:
+
+  BaseShardedWQ* wq;  ///< the single attached work queue (see set_wq())
+  // threads
+  /// One worker thread; runs shardedthreadpool_worker() with its index.
+  struct WorkThreadSharded : public Thread {
+    ShardedThreadPool *pool;
+    uint32_t thread_index;
+    WorkThreadSharded(ShardedThreadPool *p, uint32_t pthread_index): pool(p),
+      thread_index(pthread_index) {}
+    void *entry() override {
+      pool->shardedthreadpool_worker(thread_index);
+      return 0;
+    }
+  };
+
+  std::vector<WorkThreadSharded*> threads_shardedpool;
+  void start_threads();
+  void shardedthreadpool_worker(uint32_t thread_index);
+  /// Attach the work queue; replaces any previously set one.
+  void set_wq(BaseShardedWQ* swq) {
+    wq = swq;
+  }
+
+
+
+public:
+
+  ShardedThreadPool(CephContext *cct_, std::string nm, std::string tn, uint32_t pnum_threads);
+
+  ~ShardedThreadPool(){};
+
+  /// start thread pool thread
+  void start();
+  /// stop thread pool thread
+  void stop();
+  /// pause thread pool (if it not already paused)
+  void pause();
+  /// pause initiation of new work
+  void pause_new();
+  /// resume work in thread pool.  must match each pause() call 1:1 to resume.
+  void unpause();
+  /// wait for all work to complete
+  void drain();
+
+};
+
+#endif
+
+#endif
diff --git a/src/common/addr_parsing.c b/src/common/addr_parsing.c
new file mode 100644
index 00000000..4159dff6
--- /dev/null
+++ b/src/common/addr_parsing.c
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(__FreeBSD__) || defined(_AIX)
+#include <sys/socket.h>
+#include <netinet/in.h>
+#endif
+#include <netdb.h>
+
+#define BUF_SIZE 128
+
+/*
+ * Write str2 into the heap buffer *pstr starting at offset pos, growing the
+ * buffer in BUF_SIZE steps until the text and its terminating NUL fit.
+ *
+ * *pstr and *plen are updated when the buffer is reallocated.  The process
+ * exits with status 1 if realloc() fails.
+ *
+ * Returns the offset just past the copied text (pos + strlen(str2)).
+ */
+int safe_cat(char **pstr, int *plen, int pos, const char *str2)
+{
+  int len2 = strlen(str2);
+  int needed = pos + len2 + 1;  /* text plus terminating NUL */
+
+  while (*plen < needed) {
+    void *grown;
+
+    *plen += BUF_SIZE;
+    grown = realloc(*pstr, (size_t)*plen);
+    if (!grown) {
+      printf("Out of memory\n");
+      exit(1);
+    }
+    *pstr = grown;
+  }
+
+  strncpy(*pstr + pos, str2, len2);
+  (*pstr)[pos + len2] = '\0';
+
+  return pos + len2;
+}
+
+/*
+ * Resolve every entry of a delimited address list to numeric form.
+ *
+ * orig_str is split on ',', ';' and ' '.  Each token may be "host",
+ * "host:port", "a.b.c.d:port", "[ipv6addr]" or "[ipv6addr]:port".  Every
+ * address getaddrinfo() returns for a token is appended to the result,
+ * comma separated; IPv6 results are always wrapped in brackets and the port
+ * is appended only when the token carried one.
+ *
+ * Returns a malloc()ed string the caller must free(), or NULL if memory
+ * cannot be allocated or a name cannot be resolved (a message is printed in
+ * the latter case).
+ */
+char *resolve_addrs(const char *orig_str)
+{
+  int len = BUF_SIZE;
+  char *new_str = (char *)malloc(len);
+
+  if (!new_str) {
+    return NULL;
+  }
+  /* an input without any token must yield "" rather than garbage */
+  new_str[0] = '\0';
+
+  char *saveptr = NULL;
+  char *buf = strdup(orig_str);
+  if (!buf) {
+    free(new_str);
+    return NULL;
+  }
+  const char *delim = ",; ";
+
+  char *tok = strtok_r(buf, delim, &saveptr);
+
+  int pos = 0;
+
+  while (tok) {
+    struct addrinfo hint;
+    struct addrinfo *res, *ores;
+    char *firstcolon, *lastcolon, *bracecolon;
+    int r;
+    int brackets = 0;
+
+    firstcolon = strchr(tok, ':');
+    lastcolon = strrchr(tok, ':');
+    bracecolon = strstr(tok, "]:");
+
+    char *port_str = 0;
+    if (firstcolon && firstcolon == lastcolon) {
+      /* host:port or a.b.c.d:port */
+      *firstcolon = 0;
+      port_str = firstcolon + 1;
+    } else if (bracecolon) {
+      /* [ipv6addr]:port */
+      port_str = bracecolon + 1;
+      *port_str = 0;
+      port_str++;
+    }
+    /* a trailing ':' with nothing after it means "no port" */
+    if (port_str && !*port_str)
+      port_str = NULL;
+
+    /* strip the brackets, but remember to put them back in the output */
+    if (*tok == '[' &&
+	tok[strlen(tok)-1] == ']') {
+      tok[strlen(tok)-1] = 0;
+      tok++;
+      brackets = 1;
+    }
+
+    //printf("name '%s' port '%s'\n", tok, port_str);
+
+    // FIPS zeroization audit 20191115: this memset is fine.
+    memset(&hint, 0, sizeof(hint));
+    hint.ai_family = AF_UNSPEC;
+    hint.ai_socktype = SOCK_STREAM;
+    hint.ai_protocol = IPPROTO_TCP;
+
+    /*
+     * getaddrinfo() returns 0 on success and a nonzero EAI_* code on
+     * failure.  The sign of those codes differs between platforms, so test
+     * for "not zero" rather than "negative"; otherwise res may be used
+     * uninitialised.
+     */
+    r = getaddrinfo(tok, port_str, &hint, &res);
+    if (r != 0) {
+      printf("server name not found: %s (%s)\n", tok,
+	     gai_strerror(r));
+      free(new_str);
+      free(buf);
+      return 0;
+    }
+
+    /* build resolved addr list */
+    ores = res;
+    while (res) {
+      /* 64 bytes: longest numeric IPv6 text (45) plus a "%scope" suffix */
+      char host[64], port[40];
+      r = getnameinfo(res->ai_addr, res->ai_addrlen,
+		      host, sizeof(host),
+		      port, sizeof(port),
+		      NI_NUMERICSERV | NI_NUMERICHOST);
+      if (r != 0) {
+	/* host/port are undefined on failure; do not append them */
+	printf("failed to format address for %s (%s)\n", tok,
+	       gai_strerror(r));
+	freeaddrinfo(ores);
+	free(new_str);
+	free(buf);
+	return 0;
+      }
+      /*printf(" host %s port %s flags %d family %d socktype %d proto %d sanonname %s\n",
+	host, port,
+	res->ai_flags, res->ai_family, res->ai_socktype, res->ai_protocol,
+	res->ai_canonname);*/
+      if (res->ai_family == AF_INET6)
+	brackets = 1;  /* always surround ipv6 addrs with brackets */
+      if (brackets)
+	pos = safe_cat(&new_str, &len, pos, "[");
+      pos = safe_cat(&new_str, &len, pos, host);
+      if (brackets)
+	pos = safe_cat(&new_str, &len, pos, "]");
+      if (port_str) {
+	pos = safe_cat(&new_str, &len, pos, ":");
+	pos = safe_cat(&new_str, &len, pos, port);
+      }
+      res = res->ai_next;
+      if (res)
+	pos = safe_cat(&new_str, &len, pos, ",");
+    }
+    freeaddrinfo(ores);
+
+    tok = strtok_r(NULL, delim, &saveptr);
+    if (tok)
+      pos = safe_cat(&new_str, &len, pos, ",");
+
+  }
+
+  //printf("new_str is '%s'\n", new_str);
+  free(buf);
+  return new_str;
+}
diff --git a/src/common/address_helper.cc b/src/common/address_helper.cc
new file mode 100644
index 00000000..cdb8591f
--- /dev/null
+++ b/src/common/address_helper.cc
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * address_helper.cc
+ *
+ *  Created on: Oct 27, 2013
+ *      Author: matt
+ */
+
+#include <netdb.h>
+#include <regex>
+
+#include "common/address_helper.h"
+
+// decode strings like "tcp://<host>:<port>"
+// Fill *addr from a URL of the form "tcp://<host>:<port>" or
+// "rdma://<host>:<port>".  The host is resolved with getaddrinfo() and the
+// first result is used; the port comes from the URL.
+// Returns 0 on success, 1 if the URL does not match or the lookup fails.
+int entity_addr_from_url(entity_addr_t *addr /* out */, const char *url)
+{
+	std::regex expr("(tcp|rdma)://([^:]*):([\\d]+)");
+	std::cmatch m;
+
+	if (!std::regex_match(url, m, expr)) {
+		return 1;
+	}
+
+	string host(m[2].first, m[2].second);
+	string port(m[3].first, m[3].second);
+
+	addrinfo hints;
+	// FIPS zeroization audit 20191115: this memset is fine.
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_family = PF_UNSPEC;
+
+	addrinfo *res;
+	if (getaddrinfo(host.c_str(), nullptr, &hints, &res) != 0) {
+		return 1;
+	}
+
+	addr->set_sockaddr((sockaddr*)res->ai_addr);
+	addr->set_port(std::atoi(port.c_str()));
+	freeaddrinfo(res);
+	return 0;
+}
+
diff --git a/src/common/address_helper.h b/src/common/address_helper.h
new file mode 100644
index 00000000..047bd0a0
--- /dev/null
+++ b/src/common/address_helper.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef ADDRESS_HELPER_H_
+#define ADDRESS_HELPER_H_
+
+#include "msg/msg_types.h"
+
+int entity_addr_from_url(entity_addr_t *addr /* out */, const char *url);
+
+#endif /* ADDRESS_HELPER_H_ */
diff --git a/src/common/admin_socket.cc b/src/common/admin_socket.cc
new file mode 100644
index 00000000..022f3560
--- /dev/null
+++ b/src/common/admin_socket.cc
@@ -0,0 +1,666 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include <poll.h>
+#include <sys/un.h>
+
+#include "common/admin_socket.h"
+#include "common/admin_socket_client.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/Thread.h"
+#include "common/version.h"
+
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_asok
+#undef dout_prefix
+#define dout_prefix *_dout << "asok(" << (void*)m_cct << ") "
+
+
+using std::ostringstream;
+
+/*
+ * UNIX domain sockets created by an application persist even after that
+ * application closes, unless they're explicitly unlinked. This is because the
+ * directory containing the socket keeps a reference to the socket.
+ *
+ * This code makes things a little nicer by unlinking those dead sockets when
+ * the application exits normally.
+ */
+
+// Invoke f(args...) repeatedly until it either succeeds (non-negative
+// result) or fails with an errno other than EINTR; return that last result.
+template<typename F, typename... Args>
+inline int retry_sys_call(F f, Args... args) {
+  for (;;) {
+    const int r = f(args...);
+    if (r >= 0 || errno != EINTR) {
+      return r;
+    }
+  }
+}
+
+
+static std::mutex cleanup_lock;
+static std::vector<std::string> cleanup_files;
+static bool cleanup_atexit = false;
+
+// Unlink `file` and drop it from the cleanup list, if it is registered there.
+static void remove_cleanup_file(std::string_view file) {
+  std::unique_lock l(cleanup_lock);
+
+  auto found = std::find(cleanup_files.cbegin(), cleanup_files.cend(), file);
+  if (found == cleanup_files.cend()) {
+    return;
+  }
+  retry_sys_call(::unlink, found->c_str());
+  cleanup_files.erase(found);
+}
+
+// Unlink every registered cleanup file and empty the list.  Installed as an
+// atexit() handler by add_cleanup_file().
+void remove_all_cleanup_files() {
+  std::unique_lock l(cleanup_lock);
+  for (auto it = cleanup_files.cbegin(); it != cleanup_files.cend(); ++it) {
+    retry_sys_call(::unlink, it->c_str());
+  }
+  cleanup_files.clear();
+}
+
+// Register `file` for removal at exit.  The atexit() handler is installed
+// the first time a file is registered.
+static void add_cleanup_file(std::string file) {
+  std::unique_lock l(cleanup_lock);
+  cleanup_files.push_back(std::move(file));
+  if (cleanup_atexit) {
+    return;
+  }
+  atexit(remove_all_cleanup_files);
+  cleanup_atexit = true;
+}
+
+// Only stores the context pointer; the socket, shutdown pipe and thread are
+// created later by init().
+AdminSocket::AdminSocket(CephContext *cct)
+  : m_cct(cct)
+{}
+
+// Tears the socket down via shutdown(), which returns immediately when the
+// shutdown pipe's write end is not open.
+AdminSocket::~AdminSocket()
+{
+  shutdown();
+}
+
+/*
+ * This thread listens on the UNIX domain socket for incoming connections.
+ * It only handles one connection at a time at the moment. All I/O is nonblocking,
+ * so that we can implement sensible timeouts. [TODO: make all I/O nonblocking]
+ *
+ * This thread also listens to m_shutdown_rd_fd. If there is any data sent to this
+ * pipe, the thread terminates itself gracefully, allowing the
+ * AdminSocketConfigObs class to join() it.
+ */
+
+// Create the close-on-exec pipe used to tell the listener thread to stop.
+// On success stores the read/write ends in *pipe_rd / *pipe_wr and returns
+// an empty string; on failure returns a description of the error.
+std::string AdminSocket::create_shutdown_pipe(int *pipe_rd, int *pipe_wr)
+{
+  int pipefd[2];
+  if (pipe_cloexec(pipefd) >= 0) {
+    *pipe_rd = pipefd[0];
+    *pipe_wr = pipefd[1];
+    return "";
+  }
+
+  int e = errno;
+  ostringstream oss;
+  oss << "AdminSocket::create_shutdown_pipe error: " << cpp_strerror(e);
+  return oss.str();
+}
+
+// Signal the listener thread to stop, join it and close both pipe ends.
+// Returns an empty string on success or an error description if the wake-up
+// byte could not be written.
+//
+// NOTE(review): on the write-failure path the function returns before
+// th.join() and before the read end is closed, so the thread stays joinable
+// and m_shutdown_rd_fd stays open.
+std::string AdminSocket::destroy_shutdown_pipe()
+{
+  // Send a byte to the shutdown pipe that the thread is listening to
+  char buf[1] = { 0x0 };
+  int ret = safe_write(m_shutdown_wr_fd, buf, sizeof(buf));
+
+  // Close write end
+  retry_sys_call(::close, m_shutdown_wr_fd);
+  m_shutdown_wr_fd = -1;
+
+  if (ret != 0) {
+    ostringstream oss;
+    // The two literals are concatenated, so the first one must end in a space.
+    oss << "AdminSocket::destroy_shutdown_pipe error: failed to write "
+      "to thread shutdown pipe: error " << ret;
+    return oss.str();
+  }
+
+  th.join();
+
+  // Close read end. Doing this before join() blocks the listener and prevents
+  // joining.
+  retry_sys_call(::close, m_shutdown_rd_fd);
+  m_shutdown_rd_fd = -1;
+
+  return "";
+}
+
+// Create a UNIX domain stream socket, bind it to sock_path and start
+// listening on it.
+//
+// If the path is already in use, the existing socket is pinged: when it
+// answers, the bind fails with EEXIST; otherwise the stale file is unlinked
+// and the bind is retried once.
+//
+// On success stores the listening descriptor in *fd and returns an empty
+// string; on failure returns a description of the error and leaves *fd
+// untouched.
+std::string AdminSocket::bind_and_listen(const std::string &sock_path, int *fd)
+{
+  ldout(m_cct, 5) << "bind_and_listen " << sock_path << dendl;
+
+  struct sockaddr_un address;
+  // Reject paths that would not fit in sun_path with a terminating NUL.
+  if (sock_path.size() > sizeof(address.sun_path) - 1) {
+    ostringstream oss;
+    oss << "AdminSocket::bind_and_listen: "
+	<< "The UNIX domain socket path " << sock_path << " is too long! The "
+	<< "maximum length on this system is "
+	<< (sizeof(address.sun_path) - 1);
+    return oss.str();
+  }
+  int sock_fd = socket_cloexec(PF_UNIX, SOCK_STREAM, 0);
+  if (sock_fd < 0) {
+    int err = errno;
+    ostringstream oss;
+    oss << "AdminSocket::bind_and_listen: "
+	<< "failed to create socket: " << cpp_strerror(err);
+    return oss.str();
+  }
+  // FIPS zeroization audit 20191115: this memset is fine.
+  memset(&address, 0, sizeof(struct sockaddr_un));
+  address.sun_family = AF_UNIX;
+  snprintf(address.sun_path, sizeof(address.sun_path),
+	   "%s", sock_path.c_str());
+  if (::bind(sock_fd, (struct sockaddr*)&address,
+	   sizeof(struct sockaddr_un)) != 0) {
+    // Capture errno before any other call can overwrite it.
+    int err = errno;
+    if (err == EADDRINUSE) {
+      // Something already exists at this path: find out whether it is live.
+      AdminSocketClient client(sock_path);
+      bool ok;
+      client.ping(&ok);
+      if (ok) {
+	ldout(m_cct, 20) << "socket " << sock_path << " is in use" << dendl;
+	err = EEXIST;
+      } else {
+	ldout(m_cct, 20) << "unlink stale file " << sock_path << dendl;
+	retry_sys_call(::unlink, sock_path.c_str());
+	if (::bind(sock_fd, (struct sockaddr*)&address,
+		 sizeof(struct sockaddr_un)) == 0) {
+	  err = 0;
+	} else {
+	  err = errno;
+	}
+      }
+    }
+    if (err != 0) {
+      ostringstream oss;
+      oss << "AdminSocket::bind_and_listen: "
+	  << "failed to bind the UNIX domain socket to '" << sock_path
+	  << "': " << cpp_strerror(err);
+      close(sock_fd);
+      return oss.str();
+    }
+  }
+  if (listen(sock_fd, 5) != 0) {
+    int err = errno;
+    ostringstream oss;
+    oss << "AdminSocket::bind_and_listen: "
+	  << "failed to listen to socket: " << cpp_strerror(err);
+    close(sock_fd);
+    // The bind succeeded, so remove the socket file it created.
+    retry_sys_call(::unlink, sock_path.c_str());
+    return oss.str();
+  }
+  *fd = sock_fd;
+  return "";
+}
+
+// Body of the admin socket thread.
+//
+// Polls the listening socket and the read end of the shutdown pipe.  A
+// readable listening socket is served by do_accept(); a readable shutdown
+// pipe ends the loop.  poll() is retried on EINTR; any other poll() error
+// is logged and terminates the thread.
+void AdminSocket::entry() noexcept
+{
+  ldout(m_cct, 5) << "entry start" << dendl;
+  while (true) {
+    struct pollfd fds[2];
+    // FIPS zeroization audit 20191115: this memset is fine.
+    memset(fds, 0, sizeof(fds));
+    fds[0].fd = m_sock_fd;
+    fds[0].events = POLLIN | POLLRDBAND;
+    fds[1].fd = m_shutdown_rd_fd;
+    fds[1].events = POLLIN | POLLRDBAND;
+
+    int ret = poll(fds, 2, -1);
+    if (ret < 0) {
+      int err = errno;
+      if (err == EINTR) {
+	continue;
+      }
+      lderr(m_cct) << "AdminSocket: poll(2) error: '"
+		   << cpp_strerror(err) << "'" << dendl;
+      return;
+    }
+
+    if (fds[0].revents & POLLIN) {
+      // A client is waiting on the listening socket; serve its request.
+      do_accept();
+    }
+    if (fds[1].revents & POLLIN) {
+      // Parent wants us to shut down.  Leave the loop (rather than return)
+      // so that the exit message below is actually logged.
+      break;
+    }
+  }
+  ldout(m_cct, 5) << "entry exit" << dendl;
+}
+
+// Change the owner of the socket file at m_path.  Does nothing when the
+// socket is not open; a failure is logged but not reported to the caller.
+void AdminSocket::chown(uid_t uid, gid_t gid)
+{
+  if (m_sock_fd < 0) {
+    return;
+  }
+  if (::chown(m_path.c_str(), uid, gid) < 0) {
+    const int r = -errno;
+    lderr(m_cct) << "AdminSocket: failed to chown socket: "
+		 << cpp_strerror(r) << dendl;
+  }
+}
+
+// Change the permission bits of the socket file at m_path.  Does nothing
+// when the socket is not open; a failure is logged but not reported to the
+// caller.
+void AdminSocket::chmod(mode_t mode)
+{
+  if (m_sock_fd < 0) {
+    return;
+  }
+  if (::chmod(m_path.c_str(), mode) < 0) {
+    const int r = -errno;
+    lderr(m_cct) << "AdminSocket: failed to chmod socket: "
+                 << cpp_strerror(r) << dendl;
+  }
+}
+
+// Accept one client connection, read a single request, execute it and write
+// the reply.
+//
+// Two request encodings are understood:
+//  - old protocol: four bytes whose first byte is NUL; the fourth byte
+//    selects a built-in command, which is wrapped as {"prefix": "..."}
+//  - new protocol: a string terminated by NUL or '\n'
+//
+// The reply is a 32-bit length in network byte order followed by the
+// payload.  Returns true when the command ran and the whole reply was
+// written, false otherwise.  The connection is always closed before
+// returning.
+bool AdminSocket::do_accept()
+{
+  struct sockaddr_un address;
+  socklen_t address_length = sizeof(address);
+  ldout(m_cct, 30) << "AdminSocket: calling accept" << dendl;
+  int connection_fd = accept_cloexec(m_sock_fd, (struct sockaddr*) &address,
+			     &address_length);
+  if (connection_fd < 0) {
+    int err = errno;
+    lderr(m_cct) << "AdminSocket: do_accept error: '"
+			   << cpp_strerror(err) << "'" << dendl;
+    return false;
+  }
+  ldout(m_cct, 30) << "AdminSocket: finished accept" << dendl;
+
+  char cmd[1024];
+  unsigned pos = 0;
+  string c;
+  // Read the request one byte at a time until a complete command is seen.
+  while (1) {
+    int ret = safe_read(connection_fd, &cmd[pos], 1);
+    if (ret <= 0) {
+      if (ret < 0) {
+        lderr(m_cct) << "AdminSocket: error reading request code: "
+		     << cpp_strerror(ret) << dendl;
+      }
+      retry_sys_call(::close, connection_fd);
+      return false;
+    }
+    if (cmd[0] == '\0') {
+      // old protocol: __be32
+      if (pos == 3) {
+	switch (cmd[3]) {
+	case 0:
+	  c = "0";
+	  break;
+	case 1:
+	  c = "perfcounters_dump";
+	  break;
+	case 2:
+	  c = "perfcounters_schema";
+	  break;
+	default:
+	  c = "foo";
+	  break;
+	}
+	//wrap command with new protocol
+	c = "{\"prefix\": \"" + c + "\"}";
+	break;
+      }
+    } else {
+      // new protocol: null or \n terminated string
+      if (cmd[pos] == '\n' || cmd[pos] == '\0') {
+	cmd[pos] = '\0';
+	c = cmd;
+	break;
+      }
+    }
+    // Refuse requests that do not fit in the buffer.
+    if (++pos >= sizeof(cmd)) {
+      lderr(m_cct) << "AdminSocket: error reading request too long" << dendl;
+      retry_sys_call(::close, connection_fd);
+      return false;
+    }
+  }
+
+  bool rval;
+  bufferlist out;
+  rval = execute_command(c, out);
+  if (rval) {
+    uint32_t len = htonl(out.length());
+    int ret = safe_write(connection_fd, &len, sizeof(len));
+    if (ret < 0) {
+      lderr(m_cct) << "AdminSocket: error writing response length "
+          << cpp_strerror(ret) << dendl;
+      rval = false;
+    } else {
+      // A failed payload write must be reported as a failure too.
+      ret = out.write_fd(connection_fd);
+      if (ret < 0) {
+        lderr(m_cct) << "AdminSocket: error writing response payload "
+            << cpp_strerror(ret) << dendl;
+        rval = false;
+      }
+    }
+  }
+
+  retry_sys_call(::close, connection_fd);
+  return rval;
+}
+
+// Parse a JSON command string, find the hook registered for the longest
+// matching prefix, validate the arguments and call the hook.
+//
+// Returns false if the command cannot be parsed, lacks a usable prefix or
+// format, or no hook matches.  Returns true once a hook has been selected,
+// even if validation or the hook itself fails; in that case "failed" is
+// appended to `out`.
+int AdminSocket::execute_command(const std::string& cmd, ceph::bufferlist& out)
+{
+  cmdmap_t cmdmap;
+  string format;
+  vector<string> cmdvec;
+  stringstream errss;
+  cmdvec.push_back(cmd);
+  if (!cmdmap_from_json(cmdvec, &cmdmap, errss)) {
+    ldout(m_cct, 0) << "AdminSocket: " << errss.str() << dendl;
+    return false;
+  }
+  string match;
+  try {
+    cmd_getval(m_cct, cmdmap, "format", format);
+    cmd_getval(m_cct, cmdmap, "prefix", match);
+  } catch (const bad_cmd_get& e) {
+    return false;
+  }
+  // Fall back to json-pretty for a missing or unknown output format.
+  if (format != "json" && format != "json-pretty" &&
+      format != "xml" && format != "xml-pretty")
+    format = "json-pretty";
+
+  std::unique_lock l(lock);
+  // Start at end(): when the prefix is empty the loop below never runs and
+  // the iterator must still be safe to compare.
+  auto p = hooks.end();
+  while (match.size()) {
+    p = hooks.find(match);
+    if (p != hooks.cend())
+      break;
+
+    // drop right-most word
+    size_t pos = match.rfind(' ');
+    if (pos == std::string::npos) {
+      match.clear();  // we fail
+      break;
+    } else {
+      match.resize(pos);
+    }
+  }
+
+  if (p == hooks.cend()) {
+    lderr(m_cct) << "AdminSocket: request '" << cmd << "' not defined" << dendl;
+    return false;
+  }
+  string args;
+  if (match != cmd) {
+    args = cmd.substr(match.length() + 1);
+  }
+
+  // Drop lock to avoid cycles in cases where the hook takes
+  // the same lock that was held during calls to register/unregister,
+  // and set in_hook to allow unregister to wait for us before
+  // removing this hook.
+  in_hook = true;
+  auto match_hook = p->second.hook;
+  l.unlock();
+  bool success = (validate(match, cmdmap, out) &&
+      match_hook->call(match, cmdmap, format, out));
+  l.lock();
+  in_hook = false;
+  in_hook_cond.notify_all();
+  if (!success) {
+    ldout(m_cct, 0) << "AdminSocket: request '" << match << "' args '" << args
+        << "' to " << match_hook << " failed" << dendl;
+    out.append("failed");
+  } else {
+    ldout(m_cct, 5) << "AdminSocket: request '" << match << "' '" << args
+        << "' to " << match_hook
+        << " returned " << out.length() << " bytes" << dendl;
+  }
+  return true;
+}
+
+
+
+// Check `cmdmap` against the descriptor registered for `command`.
+// Returns true when validate_cmd() accepts it; otherwise appends whatever
+// validate_cmd() wrote to its stream to `out` and returns false.
+// hooks.at() throws if `command` is not registered.
+bool AdminSocket::validate(const std::string& command,
+			   const cmdmap_t& cmdmap,
+			   bufferlist& out) const
+{
+  stringstream os;
+  const bool ok = validate_cmd(m_cct, hooks.at(command).desc, cmdmap, os);
+  if (!ok) {
+    out.append(os);
+  }
+  return ok;
+}
+
+// Register `hook` under `command` together with its syntax descriptor and
+// help text.  Returns 0 on success or -EEXIST if the command is already
+// registered (the existing registration is left untouched).
+int AdminSocket::register_command(std::string_view command,
+				  std::string_view cmddesc,
+				  AdminSocketHook *hook,
+				  std::string_view help)
+{
+  std::unique_lock l(lock);
+  auto i = hooks.find(command);
+  if (i != hooks.cend()) {
+    ldout(m_cct, 5) << "register_command " << command << " hook " << hook
+		    << " EEXIST" << dendl;
+    return -EEXIST;
+  }
+
+  ldout(m_cct, 5) << "register_command " << command << " hook " << hook
+		  << dendl;
+  hooks.emplace_hint(i,
+		     std::piecewise_construct,
+		     std::forward_as_tuple(command),
+		     std::forward_as_tuple(hook, cmddesc, help));
+  return 0;
+}
+
+// Remove the registration for `command`.  Returns 0 on success or -ENOENT
+// if the command is not registered.
+int AdminSocket::unregister_command(std::string_view command)
+{
+  std::unique_lock l(lock);
+  auto i = hooks.find(command);
+  if (i == hooks.cend()) {
+    ldout(m_cct, 5) << "unregister_command " << command << " ENOENT" << dendl;
+    return -ENOENT;
+  }
+
+  ldout(m_cct, 5) << "unregister_command " << command << dendl;
+
+  // If we are currently processing a command, wait for it to
+  // complete in case it referenced the hook that we are
+  // unregistering.
+  in_hook_cond.wait(l, [this]() { return !in_hook; });
+
+  hooks.erase(i);
+  return 0;
+}
+
+// Remove every command whose registered hook is `hook`.
+//
+// NOTE(review): in_hook_cond.wait() releases `lock` while it blocks, so
+// `hooks` can presumably be modified by another thread while the iterator
+// `i` is held -- confirm whether callers rule that out.
+void AdminSocket::unregister_commands(const AdminSocketHook *hook)
+{
+  std::unique_lock l(lock);
+  auto i = hooks.begin();
+  while (i != hooks.end()) {
+    if (i->second.hook == hook) {
+      ldout(m_cct, 5) << __func__ << " " << i->first << dendl;
+
+      // If we are currently processing a command, wait for it to
+      // complete in case it referenced the hook that we are
+      // unregistering.
+      in_hook_cond.wait(l, [this]() { return !in_hook; });
+      // Post-increment: advance before the erased element is invalidated.
+      hooks.erase(i++);
+    } else {
+      i++;
+    }
+  }
+}
+
+// Hook behind the "0", "version" and "git_version" commands.
+//
+// "0" replies with the raw admin socket protocol version; the other two
+// reply with a JSON object named "version" holding the requested fields.
+class VersionHook : public AdminSocketHook {
+public:
+  bool call(std::string_view command, const cmdmap_t& cmdmap,
+	    std::string_view format, bufferlist& out) override {
+    if (command == "0"sv) {
+      out.append(CEPH_ADMIN_SOCK_VERSION);
+      return true;
+    }
+
+    JSONFormatter jf;
+    jf.open_object_section("version");
+    if (command == "version") {
+      jf.dump_string("version", ceph_version_to_str());
+      jf.dump_string("release", ceph_release_name(ceph_release()));
+      jf.dump_string("release_type", ceph_release_type());
+    } else if (command == "git_version") {
+      jf.dump_string("git_version", git_version_to_str());
+    }
+    ostringstream ss;
+    jf.close_section();
+    jf.enable_line_break();
+    jf.flush(ss);
+    out.append(ss.str());
+    return true;
+  }
+};
+
+// Hook behind the "help" command: emits an object named "help" that maps
+// each registered command with non-empty help text to that text.
+class HelpHook : public AdminSocketHook {
+  AdminSocket *m_as;
+public:
+  explicit HelpHook(AdminSocket *as) : m_as(as) {}
+  bool call(std::string_view command, const cmdmap_t& cmdmap,
+	    std::string_view format,
+	    bufferlist& out) override {
+    std::unique_ptr<Formatter> f(Formatter::create(format, "json-pretty"sv,
+						   "json-pretty"sv));
+    f->open_object_section("help");
+    // Note: the structured binding `command` shadows the parameter above.
+    for (const auto& [command, info] : m_as->hooks) {
+      // Commands registered with empty help text are left out.
+      if (info.help.length())
+	f->dump_string(command.c_str(), info.help);
+    }
+    f->close_section();
+    ostringstream ss;
+    f->flush(ss);
+    out.append(ss.str());
+    return true;
+  }
+};
+
+// Hook behind "get_command_descriptions": dumps the descriptor and help
+// text of every registered command as JSON, one section per command named
+// "cmd" followed by a zero-padded three-digit counter.
+class GetdescsHook : public AdminSocketHook {
+  AdminSocket *m_as;
+public:
+  explicit GetdescsHook(AdminSocket *as) : m_as(as) {}
+  bool call(std::string_view command, const cmdmap_t& cmdmap,
+	    std::string_view format, bufferlist& out) override {
+    int cmdnum = 0;
+    JSONFormatter jf;
+    jf.open_object_section("command_descriptions");
+    for (const auto& [command, info] : m_as->hooks) {
+      // GCC 8 actually has [[maybe_unused]] on a structured binding
+      // do what you'd expect. GCC 7 does not.
+      (void)command;
+      ostringstream secname;
+      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
+      dump_cmd_and_help_to_json(&jf,
+                                CEPH_FEATURES_ALL,
+				secname.str().c_str(),
+				info.desc,
+				info.help);
+      cmdnum++;
+    }
+    jf.close_section(); // command_descriptions
+    jf.enable_line_break();
+    ostringstream ss;
+    jf.flush(ss);
+    out.append(ss.str());
+    return true;
+  }
+};
+
+// Bring the admin socket up at `path`: create the shutdown pipe, bind and
+// listen on the socket, register the built-in commands and start the
+// listener thread.  The socket path is also registered for removal at exit.
+// Returns false (after logging) if the pipe or the socket cannot be set up.
+bool AdminSocket::init(const std::string& path)
+{
+  ldout(m_cct, 5) << "init " << path << dendl;
+
+  /* Set up things for the new thread */
+  std::string err;
+  int pipe_rd = -1, pipe_wr = -1;
+  err = create_shutdown_pipe(&pipe_rd, &pipe_wr);
+  if (!err.empty()) {
+    lderr(m_cct) << "AdminSocketConfigObs::init: error: " << err << dendl;
+    return false;
+  }
+  int sock_fd;
+  err = bind_and_listen(path, &sock_fd);
+  if (!err.empty()) {
+    lderr(m_cct) << "AdminSocketConfigObs::init: failed: " << err << dendl;
+    // Do not leak the pipe created above.
+    close(pipe_rd);
+    close(pipe_wr);
+    return false;
+  }
+
+  /* Create new thread */
+  m_sock_fd = sock_fd;
+  m_shutdown_rd_fd = pipe_rd;
+  m_shutdown_wr_fd = pipe_wr;
+  m_path = path;
+
+  // Built-in commands; one VersionHook instance serves three of them.
+  version_hook = std::make_unique<VersionHook>();
+  register_command("0", "0", version_hook.get(), "");
+  register_command("version", "version", version_hook.get(), "get ceph version");
+  register_command("git_version", "git_version", version_hook.get(),
+		   "get git sha1");
+  help_hook = std::make_unique<HelpHook>(this);
+  register_command("help", "help", help_hook.get(),
+		   "list available commands");
+  getdescs_hook = std::make_unique<GetdescsHook>(this);
+  register_command("get_command_descriptions", "get_command_descriptions",
+		   getdescs_hook.get(), "list available commands");
+
+  th = make_named_thread("admin_socket", &AdminSocket::entry, this);
+  add_cleanup_file(m_path.c_str());
+  return true;
+}
+
+// Stop the listener thread, close the socket, unregister the built-in
+// commands and unlink the socket file.  Returns immediately when the
+// shutdown pipe's write end is not open.
+void AdminSocket::shutdown()
+{
+  // Under normal operation this is unlikely to occur.  However for some unit
+  // tests, some object members are not initialized and so cannot be deleted
+  // without fault.
+  if (m_shutdown_wr_fd < 0)
+    return;
+
+  ldout(m_cct, 5) << "shutdown" << dendl;
+
+  auto err = destroy_shutdown_pipe();
+  if (!err.empty()) {
+    lderr(m_cct) << "AdminSocket::shutdown: error: " << err << dendl;
+  }
+
+  // NOTE(review): m_sock_fd is closed here but not reset to -1.
+  retry_sys_call(::close, m_sock_fd);
+
+  // version_hook backs "0", "version" and "git_version", so drop them all.
+  unregister_commands(version_hook.get());
+  version_hook.reset();
+
+  unregister_command("help");
+  help_hook.reset();
+
+  unregister_command("get_command_descriptions");
+  getdescs_hook.reset();
+
+  remove_cleanup_file(m_path);
+  m_path.clear();
+}
diff --git a/src/common/admin_socket.h b/src/common/admin_socket.h
new file mode 100644
index 00000000..3603fde3
--- /dev/null
+++ b/src/common/admin_socket.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ADMIN_SOCKET_H
+#define CEPH_COMMON_ADMIN_SOCKET_H
+
+#include <condition_variable>
+#include <mutex>
+#include <string>
+#include <string_view>
+#include <thread>
+
+#include "include/buffer.h"
+#include "common/cmdparse.h"
+
+class AdminSocket;
+class CephContext;
+
+using namespace std::literals;
+
+inline constexpr auto CEPH_ADMIN_SOCK_VERSION = "2"sv;
+
+// Interface implemented by admin socket command handlers.
+class AdminSocketHook {
+public:
+  // Handle one command.  `command` is the matched command string, `cmdmap`
+  // the parsed request, `format` the requested output format; the reply is
+  // appended to `out`.  A false return makes AdminSocket::execute_command()
+  // log the failure and append "failed" to the reply.
+  virtual bool call(std::string_view command, const cmdmap_t& cmdmap,
+		    std::string_view format, bufferlist& out) = 0;
+  virtual ~AdminSocketHook() {}
+};
+
+// Serves administrative commands over a UNIX domain socket.  Commands are
+// dispatched to hooks registered with register_command().
+class AdminSocket
+{
+public:
+  AdminSocket(CephContext *cct);
+  ~AdminSocket();
+
+  // Neither copyable nor movable.
+  AdminSocket(const AdminSocket&) = delete;
+  AdminSocket& operator =(const AdminSocket&) = delete;
+  AdminSocket(AdminSocket&&) = delete;
+  AdminSocket& operator =(AdminSocket&&) = delete;
+
+  /**
+   * register an admin socket command
+   *
+   * The command is registered under a command string.  Incoming
+   * commands are split by space and matched against the longest
+   * registered command.  For example, if 'foo' and 'foo bar' are
+   * registered, and an incoming command is 'foo bar baz', it is
+   * matched with 'foo bar', while 'foo fud' will match 'foo'.
+   *
+   * The entire incoming command string is passed to the registered
+   * hook.
+   *
+   * @param command command string
+   * @param cmddesc command syntax descriptor
+   * @param hook implementation
+   * @param help help text.  if empty, command will not be included in 'help' output.
+   *
+   * @return 0 for success, -EEXIST if command already registered.
+   */
+  int register_command(std::string_view command,
+		       std::string_view cmddesc,
+		       AdminSocketHook *hook,
+		       std::string_view help);
+
+  /**
+   * unregister an admin socket command.
+   *
+   * If a command is currently in progress, this will block until it
+   * is done.  For that reason, you must not hold any locks required
+   * by your hook while you call this.
+   *
+   * @param command command string
+   * @return 0 on success, -ENOENT if the command does not exist.
+   */
+  int unregister_command(std::string_view command);
+
+  /*
+   * unregister all commands belonging to hook.
+   */
+  void unregister_commands(const AdminSocketHook *hook);
+
+  // Create the socket at `path` and start serving; false on failure.
+  bool init(const std::string& path);
+
+  // Change ownership / mode of the socket file; no-ops if not initialized.
+  void chown(uid_t uid, gid_t gid);
+  void chmod(mode_t mode);
+  // Run a JSON-encoded command and append its reply to `out`.
+  int execute_command(const std::string& cmd, ceph::bufferlist& out);
+
+private:
+
+  void shutdown();
+
+  std::string create_shutdown_pipe(int *pipe_rd, int *pipe_wr);
+  std::string destroy_shutdown_pipe();
+  std::string bind_and_listen(const std::string &sock_path, int *fd);
+
+  // Listener thread and its body.
+  std::thread th;
+  void entry() noexcept;
+  bool do_accept();
+  bool validate(const std::string& command,
+		const cmdmap_t& cmdmap,
+		bufferlist& out) const;
+
+  CephContext *m_cct;
+  std::string m_path;
+  // Descriptors are -1 while closed / not yet created.
+  int m_sock_fd = -1;
+  int m_shutdown_rd_fd = -1;
+  int m_shutdown_wr_fd = -1;
+
+  // True while a hook is being called with `lock` dropped; unregistration
+  // waits on in_hook_cond until it is false again.
+  bool in_hook = false;
+  std::condition_variable in_hook_cond;
+  std::mutex lock;  // protects `hooks`
+  std::unique_ptr<AdminSocketHook> version_hook;
+  std::unique_ptr<AdminSocketHook> help_hook;
+  std::unique_ptr<AdminSocketHook> getdescs_hook;
+
+  // What is stored for each registered command.
+  struct hook_info {
+    AdminSocketHook* hook;
+    std::string desc;
+    std::string help;
+
+    hook_info(AdminSocketHook* hook, std::string_view desc,
+	      std::string_view help)
+      : hook(hook), desc(desc), help(help) {}
+  };
+
+  // Keyed by command string; std::less<> allows lookup by string_view.
+  std::map<std::string, hook_info, std::less<>> hooks;
+
+  friend class AdminSocketTest;
+  friend class HelpHook;
+  friend class GetdescsHook;
+};
+
+#endif
diff --git a/src/common/admin_socket_client.cc b/src/common/admin_socket_client.cc
new file mode 100644
index 00000000..eeb2d114
--- /dev/null
+++ b/src/common/admin_socket_client.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "common/admin_socket.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/admin_socket_client.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+using std::ostringstream;
+
+// Return a per-process socket path of the form
+// "<TMPDIR or /tmp>/perfcounters_test_socket.<pid>.<time>".
+//
+// The path is built on the first call, truncated to fit sockaddr_un's
+// sun_path, and cached for the lifetime of the process; later calls return
+// the same pointer.  The caller must not free it.
+const char* get_rand_socket_path()
+{
+  static char *g_socket_path = NULL;
+
+  if (g_socket_path == NULL) {
+    char buf[512];
+    const char *tdir = getenv("TMPDIR");
+    if (tdir == NULL) {
+      tdir = "/tmp";
+    }
+    // Limit the output to sun_path's size (smaller than buf) so the result
+    // is always usable as a UNIX socket address.  time_t is cast explicitly
+    // because it is not guaranteed to be `long`, which "%ld" requires.
+    snprintf(buf, sizeof(((struct sockaddr_un*)0)->sun_path),
+	     "%s/perfcounters_test_socket.%ld.%ld",
+	     tdir, (long int)getpid(), (long int)time(NULL));
+    g_socket_path = (char*)strdup(buf);
+  }
+  return g_socket_path;
+}
+
+// Connect to the UNIX domain socket at `path` and set 10-second receive and
+// send timeouts on the connection.
+//
+// On success stores the connected descriptor in *fd and returns an empty
+// string; on failure returns a description of the error, closes any socket
+// it opened and leaves *fd untouched.
+static std::string asok_connect(const std::string &path, int *fd)
+{
+  int socket_fd = socket_cloexec(PF_UNIX, SOCK_STREAM, 0);
+  if(socket_fd < 0) {
+    int err = errno;
+    ostringstream oss;
+    oss << "socket(PF_UNIX, SOCK_STREAM, 0) failed: " << cpp_strerror(err);
+    return oss.str();
+  }
+
+  struct sockaddr_un address;
+  // FIPS zeroization audit 20191115: this memset is fine.
+  memset(&address, 0, sizeof(struct sockaddr_un));
+  address.sun_family = AF_UNIX;
+  // snprintf truncates a path longer than sun_path.
+  snprintf(address.sun_path, sizeof(address.sun_path), "%s", path.c_str());
+
+  if (::connect(socket_fd, (struct sockaddr *) &address, 
+	sizeof(struct sockaddr_un)) != 0) {
+    int err = errno;
+    ostringstream oss;
+    oss << "connect(" << socket_fd << ") failed: " << cpp_strerror(err);
+    close(socket_fd);
+    return oss.str();
+  }
+
+  // Receive timeout.
+  struct timeval timer;
+  timer.tv_sec = 10;
+  timer.tv_usec = 0;
+  if (::setsockopt(socket_fd, SOL_SOCKET, SO_RCVTIMEO, &timer, sizeof(timer))) {
+    int err = errno;
+    ostringstream oss;
+    oss << "setsockopt(" << socket_fd << ", SO_RCVTIMEO) failed: "
+	<< cpp_strerror(err);
+    close(socket_fd);
+    return oss.str();
+  }
+  // Send timeout.
+  timer.tv_sec = 10;
+  timer.tv_usec = 0;
+  if (::setsockopt(socket_fd, SOL_SOCKET, SO_SNDTIMEO, &timer, sizeof(timer))) {
+    int err = errno;
+    ostringstream oss;
+    oss << "setsockopt(" << socket_fd << ", SO_SNDTIMEO) failed: "
+	<< cpp_strerror(err);
+    close(socket_fd);
+    return oss.str();
+  }
+
+  *fd = socket_fd;
+  return "";
+}
+
+// Send `request`, including its terminating NUL, over socket_fd.
+// Returns an empty string on success or a description of the write error.
+static std::string asok_request(int socket_fd, std::string request)
+{
+  const ssize_t res = safe_write(socket_fd, request.c_str(),
+				 request.length() + 1);
+  if (res >= 0) {
+    return "";
+  }
+
+  const int err = res;
+  ostringstream oss;
+  oss << "safe_write(" << socket_fd << ") failed to write request code: "
+      << cpp_strerror(err);
+  return oss.str();
+}
+
+// Remember the socket path; no connection is made until a request is sent.
+AdminSocketClient::
+AdminSocketClient(const std::string &path)
+  : m_path(path)
+{
+}
+
+// Probe the server with the "0" (version) command.  *ok is set to true only
+// when the request succeeded and the reply is exactly one character long.
+// Returns the error string from do_request() (empty on success).
+std::string AdminSocketClient::ping(bool *ok)
+{
+  std::string version;
+  std::string result = do_request("{\"prefix\":\"0\"}", &version);
+  const bool no_error = (result == "");
+  *ok = no_error && version.length() == 1;
+  return result;
+}
+
+// Send `request` to the admin socket at m_path and read the reply.
+//
+// The reply is a 32-bit length in network byte order followed by that many
+// bytes of payload.  On success the payload is stored in *result and an
+// empty string is returned; on failure a description of the error is
+// returned and *result is left untouched.
+std::string AdminSocketClient::do_request(std::string request, std::string *result)
+{
+  int socket_fd = 0, res;
+  std::string buffer;
+  uint32_t message_size_raw, message_size;
+
+  std::string err = asok_connect(m_path, &socket_fd);
+  if (!err.empty()) {
+    // No socket was opened, so skip the close() at `done`.
+    goto out;
+  }
+  err = asok_request(socket_fd, request);
+  if (!err.empty()) {
+    goto done;
+  }
+  res = safe_read_exact(socket_fd, &message_size_raw,
+				sizeof(message_size_raw));
+  if (res < 0) {
+    int e = res;
+    ostringstream oss;
+    oss << "safe_read(" << socket_fd << ") failed to read message size: "
+	<< cpp_strerror(e);
+    err = oss.str();
+    goto done;
+  }
+  message_size = ntohl(message_size_raw);
+  // NOTE(review): the length comes straight from the peer and is not bounded
+  // before the buffer is resized.
+  buffer.resize(message_size, 0);
+  res = safe_read_exact(socket_fd, &buffer[0], message_size);
+  if (res < 0) {
+    int e = res;
+    ostringstream oss;
+    oss << "safe_read(" << socket_fd << ") failed: " << cpp_strerror(e);
+    err = oss.str();
+    goto done;
+  }
+  //printf("MESSAGE FROM SERVER: %s\n", buffer.c_str());
+  std::swap(*result, buffer);
+done:
+  close(socket_fd);
+ out:
+  return err;
+}
diff --git a/src/common/admin_socket_client.h b/src/common/admin_socket_client.h
new file mode 100644
index 00000000..dcfab2b8
--- /dev/null
+++ b/src/common/admin_socket_client.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ADMIN_SOCKET_CLIENT_H
+#define CEPH_COMMON_ADMIN_SOCKET_CLIENT_H
+
+#include <string>
+
+/* This is a simple client that talks to an AdminSocket using blocking I/O.
+ * We put a 5-second timeout on send and recv operations.
+ */
+class AdminSocketClient
+{
+public:
+  // path: filesystem path of the admin socket to talk to
+  AdminSocketClient(const std::string &path);
+  // Send 'request' and store the reply in *result.  Returns an empty string
+  // on success, otherwise a description of what went wrong.
+  std::string do_request(std::string request, std::string *result);
+  // Send the {"prefix":"0"} request; *ok reports whether it succeeded and the
+  // reply was one byte long.  Returns do_request()'s error string.
+  std::string ping(bool *ok);
+private:
+  std::string m_path;
+};
+
+const char* get_rand_socket_path();
+
+#endif
diff --git a/src/common/aix_errno.cc b/src/common/aix_errno.cc
new file mode 100644
index 00000000..07f6a145
--- /dev/null
+++ b/src/common/aix_errno.cc
@@ -0,0 +1,231 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "include/types.h"
+
+
+// converts from linux errno values to host values
+//
+// r is a negated errno number.  Values >= -34 are returned unchanged, as is
+// any value below -34 that has no case in the switch.  Every other value is
+// mapped to the negated host constant of the same name; entries marked TODO
+// have no mapping here and collapse to -EPERM.
+__s32 ceph_to_hostos_errno(__s32 r) 
+{
+  if (r < -34) {
+    switch (r) {
+      case -35:
+        return -EDEADLK;
+      case -36:
+        return -ENAMETOOLONG;
+      case -37:
+        return -ENOLCK;
+      case -38:
+        return -ENOSYS;
+      case -39:
+        return -ENOTEMPTY;
+      case -40:
+        return -ELOOP;
+      case -42:
+        return -ENOMSG;
+      case -43:
+        return -EIDRM;
+      case -44:
+        return -ECHRNG;
+      case -45:
+        return -EL2NSYNC;
+      case -46:
+        return -EL3HLT;
+      case -47:
+        return -EL3RST;
+      case -48:
+        return -ELNRNG;
+      case -49:
+        return -EUNATCH;
+      case -51:
+        return -EL2HLT;
+      case -52:
+        return -EPERM; //TODO EBADE
+      case -53:
+        return -EPERM; //TODO EBADR
+      case -54:
+        return -EPERM; //TODO EXFULL
+      case -55:
+        return -EPERM; //TODO ENOANO
+      case -56:
+        return -EPERM; //TODO EBADRQC
+      case -57:
+        return -EPERM; //TODO EBADSLT
+      case -59:
+        return -EPERM; //TODO EBFONT
+      case -60:
+        return -ENOSTR;
+      case -61:
+        return -ENODATA;
+      case -62:
+        return -ETIME;
+      case -63:
+        return -ENOSR;
+      case -64:
+        return -EPERM; //TODO ENONET
+      case -65:
+        return -EPERM; //TODO ENOPKG
+      case -66:
+        return -EREMOTE;
+      case -67:
+        return -ENOLINK;
+      case -68:
+        return -EPERM; //TODO EADV 
+      case -69:
+        return -EPERM; //TODO ESRMNT 
+      case -70:
+        return -EPERM; //TODO ECOMM
+      case -71:
+        return -EPROTO;
+      case -72:
+        return -EMULTIHOP;
+      case -73:
+        return -EPERM; //TODO EDOTDOT 
+      case -74:
+        return -EBADMSG;
+      case -75:
+        return -EOVERFLOW;
+      case -76:
+        return -EPERM; //TODO ENOTUNIQ
+      case -77:
+        return -EPERM; //TODO EBADFD
+      case -78:
+        return -EPERM; //TODO EREMCHG
+      case -79:
+        return -EPERM; //TODO ELIBACC
+      case -80:
+        return -EPERM; //TODO ELIBBAD 
+      case -81:
+        return -EPERM; //TODO ELIBSCN
+      case -82:
+        return -EPERM; //TODO ELIBMAX
+      case -83:
+	return -EPERM; // TODO ELIBEXEC
+      case -84:
+        return -EILSEQ;
+      case -85:
+        return -ERESTART;
+      case -86:
+        return -EPERM; //ESTRPIPE; 
+      case -87:
+        return -EUSERS;
+      case -88:
+        return -ENOTSOCK;
+      case -89:
+        return -EDESTADDRREQ;
+      case -90:
+        return -EMSGSIZE;
+      case -91:
+        return -EPROTOTYPE;
+      case -92:
+        return -ENOPROTOOPT;
+      case -93:
+        return -EPROTONOSUPPORT;
+      case -94:
+        return -ESOCKTNOSUPPORT;
+      case -95:
+        return -EOPNOTSUPP;
+      case -96:
+        return -EPFNOSUPPORT;
+      case -97:
+        return -EAFNOSUPPORT;
+      case -98:
+        return -EADDRINUSE;
+      case -99:
+        return -EADDRNOTAVAIL;
+      case -100:
+        return -ENETDOWN;
+      case -101:
+        return -ENETUNREACH;
+      case -102:
+        return -ENETRESET;
+      case -103:
+        return -ECONNABORTED;
+      case -104:
+        return -ECONNRESET;
+      case -105:
+        return -ENOBUFS;
+      case -106:
+        return -EISCONN;
+      case -107:
+        return -ENOTCONN;
+      case -108:
+        return -ESHUTDOWN;
+      case -109:
+        return -ETOOMANYREFS;
+      case -110:
+        return -ETIMEDOUT;
+      case -111:
+        return -ECONNREFUSED;
+      case -112:
+        return -EHOSTDOWN;
+      case -113:
+        return -EHOSTUNREACH;
+      case -114:
+        return -EALREADY;
+      case -115:
+        return -EINPROGRESS;
+      case -116:
+        return -ESTALE;
+      case -117:
+        return -EPERM; //TODO EUCLEAN 
+      case -118:
+        return -EPERM; //TODO ENOTNAM
+      case -119:
+        return -EPERM; //TODO ENAVAIL
+      case -120:
+        return -EPERM; //TODO EISNAM
+      case -121:
+        return -EPERM; //TODO EREMOTEIO
+      case -122:
+        return -EDQUOT;
+      case -123:
+        return -EPERM; //TODO ENOMEDIUM
+      case -124:
+        return -EPERM; //TODO EMEDIUMTYPE - not used
+      case -125:
+        return -ECANCELED;
+      case -126:
+        return -EPERM; //TODO ENOKEY
+      case -127:
+        return -EPERM; //TODO EKEYEXPIRED
+      case -128:
+        return -EPERM; //TODO EKEYREVOKED
+      case -129:
+        return -EPERM; //TODO EKEYREJECTED
+      case -130:
+        return -EOWNERDEAD;
+      case -131:
+        return -ENOTRECOVERABLE;
+      case -132:
+        return -EPERM; //TODO ERFKILL
+      case -133:
+        return -EPERM; //TODO EHWPOISON
+
+      default: { 
+        break;
+      }
+    }
+  } 
+  return r; // otherwise return original value
+}
+
+// converts Host OS errno values to linux/Ceph values
+// XXX Currently not worked out: this is a pass-through that returns r
+// unchanged, so no host-to-Linux translation takes place yet.
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  return r;
+}
+
diff --git a/src/common/align.h b/src/common/align.h
new file mode 100644
index 00000000..b5c25b99
--- /dev/null
+++ b/src/common/align.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ALIGN_H
+#define CEPH_COMMON_ALIGN_H
+
+/// Round v up to the next multiple of align (v itself if already aligned).
+/// The bit-mask arithmetic only yields a multiple when align is a power of
+/// two; the value is not checked.
+template <typename T>
+inline constexpr T align_up(T v, T align) {
+  const T mask = align - 1;
+  return (v + mask) & ~mask;
+}
+
+/// Round v down to the previous multiple of align (v itself if already
+/// aligned).  The bit-mask arithmetic only yields a multiple when align is a
+/// power of two; the value is not checked.
+template <typename T>
+inline constexpr T align_down(T v, T align) {
+  const T mask = align - 1;
+  return v & ~mask;
+}
+
+#endif /* CEPH_COMMON_ALIGN_H */
diff --git a/src/common/arch.h b/src/common/arch.h
new file mode 100644
index 00000000..09dbe7c9
--- /dev/null
+++ b/src/common/arch.h
@@ -0,0 +1,15 @@
+#ifndef CEPH_ARCH_H
+#define CEPH_ARCH_H
+
+// Name of the CPU architecture this translation unit was compiled for:
+// "i386", "x86-64", or "unknown" for anything else.
+//
+// Marked inline because this is defined in a header: a plain static function
+// would trigger an unused-function warning in every includer that does not
+// call it.
+static inline const char *get_arch()
+{
+#if defined(__i386__)
+  return "i386";
+#elif defined(__x86_64__)
+  return "x86-64";
+#else
+  return "unknown";
+#endif
+}
+
+#endif
diff --git a/src/common/armor.c b/src/common/armor.c
new file mode 100644
index 00000000..706e01e7
--- /dev/null
+++ b/src/common/armor.c
@@ -0,0 +1,131 @@
+
+#if defined(__linux__)
+#include <linux/errno.h>
+#else
+#include <sys/errno.h>
+#endif
+
+/*
+ * base64 encode/decode.
+ */
+
+/* base64 alphabet: the character at index i encodes the 6-bit value i */
+const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/* Look up the base64 character for c; c is used as an index unchecked. */
+static int encode_bits(int c)
+{
+	return pem_key[c];
+}
+
+/*
+ * Translate one base64 character into its 6-bit value.  Both the standard
+ * ('+', '/') and the URL-safe ('-', '_') characters are accepted for the
+ * values 62 and 63.  The padding character '=' yields 0; any other
+ * character yields -EINVAL.
+ */
+static int decode_bits(char c)
+{
+	switch (c) {
+	case '+':
+	case '-':
+		return 62;
+	case '/':
+	case '_':
+		return 63;
+	case '=':
+		return 0; /* padding: any non-negative value will do */
+	default:
+		break;
+	}
+
+	if ('A' <= c && c <= 'Z')
+		return c - 'A';
+	if ('a' <= c && c <= 'z')
+		return 26 + (c - 'a');
+	if ('0' <= c && c <= '9')
+		return 52 + (c - '0');
+	return -EINVAL;
+}
+
+/*
+ * Store c at *pdst and advance *pdst by one, provided *pdst is still below
+ * end.  Returns 0 on success, or -ERANGE (leaving *pdst and the buffer
+ * untouched) when there is no room left.
+ */
+static int set_str_val(char **pdst, const char *end, char c)
+{
+	if (*pdst >= end)
+		return -ERANGE;
+
+	**pdst = c;
+	++*pdst;
+	return 0;
+}
+
+/*
+ * SET_DST(c): append one character to dst, returning the error from
+ * set_str_val() out of the enclosing function when dst_end has been reached.
+ * Expects 'dst' and 'dst_end' to be in scope.
+ *
+ * The macro deliberately has no trailing semicolon so that "SET_DST(x);"
+ * behaves as a single statement everywhere, including un-braced if/else
+ * bodies.  It stays defined past this function because ceph_unarmor() uses
+ * it too.
+ */
+#define SET_DST(c) do { \
+	int set_dst_err = set_str_val(&dst, dst_end, c); \
+	if (set_dst_err < 0) \
+		return set_dst_err; \
+} while (0)
+
+/*
+ * base64-encode the bytes in [src, end) into [dst, dst_end).
+ *
+ * When line_width is non-zero a '\n' is emitted each time the number of
+ * characters written on the current line equals line_width.  The line length
+ * grows in steps of four, so a width that is not a multiple of four never
+ * matches.  No terminating NUL is written.
+ *
+ * Returns the number of characters written (newlines included), or -ERANGE
+ * when the destination is too small.
+ */
+int ceph_armor_line_break(char *dst, const char *dst_end, const char *src, const char *end, int line_width)
+{
+	int olen = 0;
+	int line = 0;
+
+	while (src < end) {
+		unsigned char a;
+
+		a = *src++;
+		SET_DST(encode_bits(a >> 2));
+		if (src < end) {
+			unsigned char b;
+			b = *src++;
+			SET_DST(encode_bits(((a & 3) << 4) | (b >> 4)));
+			if (src < end) {
+				unsigned char c;
+				c = *src++;
+				SET_DST(encode_bits(((b & 15) << 2) |
+								(c >> 6)));
+				SET_DST(encode_bits(c & 63));
+			} else {
+				/* two input bytes left: one pad character */
+				SET_DST(encode_bits((b & 15) << 2));
+				SET_DST('=');
+			}
+		} else {
+			/* one input byte left: two pad characters */
+			SET_DST(encode_bits(((a & 3) << 4)));
+			SET_DST('=');
+			SET_DST('=');
+		}
+		olen += 4;
+		line += 4;
+		if (line_width && line == line_width) {
+			line = 0;
+			SET_DST('\n');
+			olen++;
+		}
+	}
+	return olen;
+}
+
+/*
+ * base64-encode [src, end) into [dst, dst_end) without inserting any line
+ * breaks; simply calls ceph_armor_line_break() with a line_width of 0 and
+ * returns its result.
+ */
+int ceph_armor(char *dst, const char *dst_end, const char *src, const char *end)
+{
+	return ceph_armor_line_break(dst, dst_end, src, end, 0);
+}
+
+/*
+ * Decode base64 text in [src, end) into [dst, dst_end).
+ *
+ * Newlines in front of a four-character group are skipped.  Decoding stops
+ * at the first group whose third or fourth character is '='.
+ *
+ * Returns the number of bytes produced, -EINVAL for a truncated group or an
+ * invalid character, or the error from SET_DST when dst is too small.
+ */
+int ceph_unarmor(char *dst, const char *dst_end, const char *src, const char *end)
+{
+	int olen = 0;
+
+	while (src < end) {
+		int v0, v1, v2, v3;
+
+		/* line breaks between groups are ignored */
+		if (*src == '\n') {
+			++src;
+			continue;
+		}
+
+		/* a group must consist of four characters */
+		if (src + 4 > end)
+			return -EINVAL;
+
+		v0 = decode_bits(src[0]);
+		v1 = decode_bits(src[1]);
+		v2 = decode_bits(src[2]);
+		v3 = decode_bits(src[3]);
+		if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0)
+			return -EINVAL;
+
+		SET_DST((v0 << 2) | (v1 >> 4));
+		if (src[2] == '=')
+			return olen + 1;	/* "xx==": one byte in this group */
+		SET_DST(((v1 & 15) << 4) | (v2 >> 2));
+		if (src[3] == '=')
+			return olen + 2;	/* "xxx=": two bytes in this group */
+		SET_DST(((v2 & 3) << 6) | v3);
+
+		src += 4;
+		olen += 3;
+	}
+	return olen;
+}
diff --git a/src/common/armor.h b/src/common/armor.h
new file mode 100644
index 00000000..340b33aa
--- /dev/null
+++ b/src/common/armor.h
@@ -0,0 +1,20 @@
+#ifndef CEPH_ARMOR_H
+#define CEPH_ARMOR_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * base64-encode [src, end) into [dst, dst_end) with no line breaks.
+ * Returns the number of characters written or a negative error code.
+ */
+int ceph_armor(char *dst, const char *dst_end,
+	       const char *src, const char *end);
+
+/*
+ * As ceph_armor(), with line_width controlling where '\n' is inserted
+ * (0 disables line breaks).  This is the spelling armor.c defines.
+ */
+int ceph_armor_line_break(char *dst, const char *dst_end,
+	       const char *src, const char *end,
+	       int line_width);
+
+/*
+ * Older spelling of the prototype above, kept so existing sources still
+ * compile.  NOTE(review): armor.c defines ceph_armor_line_break, not this
+ * name -- confirm whether anything still refers to it.
+ */
+int ceph_armor_linebreak(char *dst, const char *dst_end,
+	       const char *src, const char *end,
+	       int line_width);
+
+/*
+ * Decode base64 text in [src, end) into [dst, dst_end).
+ * Returns the number of bytes produced or a negative error code.
+ */
+int ceph_unarmor(char *dst, const char *dst_end,
+		 const char *src, const char *end);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/assert.cc b/src/common/assert.cc
new file mode 100644
index 00000000..6fb50014
--- /dev/null
+++ b/src/common/assert.cc
@@ -0,0 +1,256 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/debug.h"
+
+namespace ceph {
+  static CephContext *g_assert_context = NULL;
+
+  /* If you register an assert context, ceph_assert() will try to lock the dout
+   * stream of that context before starting an assert. This is nice because the
+   * output looks better. Your assert will not be interleaved with other dout
+   * statements.
+   *
+   * However, this is strictly optional and library code currently does not
+   * register an assert context. The extra complexity of supporting this
+   * wouldn't really be worth it.
+   */
+  void register_assert_context(CephContext *cct)
+  {
+    // asserts that no context has been registered before this call
+    ceph_assert(!g_assert_context);
+    g_assert_context = cct;
+  }
+
+  // Record the failed assertion in the g_assert_* globals, emit the failure
+  // message and a backtrace through dout_emergency() (and through the log of
+  // the registered assert context, if there is one), then abort().
+  [[gnu::cold]] void __ceph_assert_fail(const char *assertion,
+                                        const char *file, int line,
+                                        const char *func)
+  {
+    const unsigned long long self = (unsigned long long)pthread_self();
+
+    g_assert_condition = assertion;
+    g_assert_file = file;
+    g_assert_line = line;
+    g_assert_func = func;
+    g_assert_thread = self;
+    ceph_pthread_getname(pthread_self(), g_assert_thread_name,
+                         sizeof(g_assert_thread_name));
+
+    ostringstream now;
+    now << ceph_clock_now();
+
+    snprintf(g_assert_msg, sizeof(g_assert_msg),
+             "%s: In function '%s' thread %llx time %s\n"
+             "%s: %d: FAILED ceph_assert(%s)\n",
+             file, func, self, now.str().c_str(),
+             file, line, assertion);
+    dout_emergency(g_assert_msg);
+
+    // TODO: get rid of this memory allocation.
+    ostringstream trace;
+    trace << BackTrace(1);
+    dout_emergency(trace.str());
+
+    if (g_assert_context) {
+      lderr(g_assert_context) << g_assert_msg << std::endl;
+      *_dout << trace.str() << dendl;
+
+      // dump recent only if the abort signal handler won't do it for us
+      if (!g_assert_context->_conf->fatal_signal_handlers) {
+        g_assert_context->_log->dump_recent();
+      }
+    }
+
+    abort();
+  }
+
+  // Overload taking the failure description bundled in an assert_data;
+  // unpacks its fields and forwards to the four-argument overload.
+  [[gnu::cold]] void __ceph_assert_fail(const assert_data &ctx)
+  {
+    __ceph_assert_fail(ctx.assertion, ctx.file, ctx.line, ctx.function);
+  }
+
+  // Appends printf-style formatted text to a fixed-size, caller-owned buffer.
+  // Output that does not fit is truncated, after which further appends add
+  // nothing.
+  class BufAppender {
+  public:
+    BufAppender(char* buf, int size) : cursor(buf), space_left(size) {}
+
+    void printf(const char * format, ...) {
+      va_list ap;
+      va_start(ap, format);
+      this->vprintf(format, ap);
+      va_end(ap);
+    }
+
+    void vprintf(const char * format, va_list args) {
+      const int written = vsnprintf(cursor, space_left, format, args);
+      if (written < 0) {
+        return;  // formatting failed: leave the bookkeeping untouched
+      }
+      if (written >= space_left) {
+        space_left = 0;  // truncated; nothing more fits
+        return;
+      }
+      cursor += written;
+      space_left -= written;
+    }
+
+  private:
+    char* cursor;
+    int space_left;
+  };
+
+
+  // printf-style variant of __ceph_assert_fail(): records the failure in the
+  // g_assert_* globals, builds the message (followed by "Assertion details: "
+  // and the formatted msg) in g_assert_msg, emits it together with a
+  // backtrace, then abort()s.
+  [[gnu::cold]] void __ceph_assertf_fail(const char *assertion,
+                                         const char *file, int line,
+                                         const char *func, const char* msg,
+                                         ...)
+  {
+    ostringstream tss;
+    tss << ceph_clock_now();
+
+    g_assert_condition = assertion;
+    g_assert_file = file;
+    g_assert_line = line;
+    g_assert_func = func;
+    g_assert_thread = (unsigned long long)pthread_self();
+    ceph_pthread_getname(pthread_self(), g_assert_thread_name,
+                         sizeof(g_assert_thread_name));
+
+    BufAppender ba(g_assert_msg, sizeof(g_assert_msg));
+    // capture the backtrace in an automatic object: no raw new on the crash
+    // path and nothing left unowned
+    BackTrace bt(1);
+    ba.printf("%s: In function '%s' thread %llx time %s\n"
+              "%s: %d: FAILED ceph_assert(%s)\n",
+              file, func, (unsigned long long)pthread_self(), tss.str().c_str(),
+              file, line, assertion);
+    ba.printf("Assertion details: ");
+    va_list args;
+    va_start(args, msg);
+    ba.vprintf(msg, args);
+    va_end(args);
+    ba.printf("\n");
+    dout_emergency(g_assert_msg);
+
+    // TODO: get rid of this memory allocation.
+    ostringstream oss;
+    oss << bt;
+    dout_emergency(oss.str());
+
+    if (g_assert_context) {
+      lderr(g_assert_context) << g_assert_msg << std::endl;
+      *_dout << oss.str() << dendl;
+
+      // dump recent only if the abort signal handler won't do it for us
+      if (!g_assert_context->_conf->fatal_signal_handlers) {
+        g_assert_context->_log->dump_recent();
+      }
+    }
+
+    abort();
+  }
+
+  // Unconditional abort with a message: records "abort" as the failed
+  // condition in the g_assert_* globals, emits the message and a backtrace,
+  // then abort()s.
+  [[gnu::cold]] void __ceph_abort(const char *file, int line,
+                                  const char *func, const string& msg)
+  {
+    ostringstream tss;
+    tss << ceph_clock_now();
+
+    g_assert_condition = "abort";
+    g_assert_file = file;
+    g_assert_line = line;
+    g_assert_func = func;
+    g_assert_thread = (unsigned long long)pthread_self();
+    ceph_pthread_getname(pthread_self(), g_assert_thread_name,
+                         sizeof(g_assert_thread_name));
+
+    // capture the backtrace in an automatic object: no raw new on the crash
+    // path and nothing left unowned
+    BackTrace bt(1);
+    snprintf(g_assert_msg, sizeof(g_assert_msg),
+             "%s: In function '%s' thread %llx time %s\n"
+             "%s: %d: ceph_abort_msg(\"%s\")\n", file, func,
+             (unsigned long long)pthread_self(),
+             tss.str().c_str(), file, line,
+             msg.c_str());
+    dout_emergency(g_assert_msg);
+
+    // TODO: get rid of this memory allocation.
+    ostringstream oss;
+    oss << bt;
+    dout_emergency(oss.str());
+
+    if (g_assert_context) {
+      lderr(g_assert_context) << g_assert_msg << std::endl;
+      *_dout << oss.str() << dendl;
+
+      // dump recent only if the abort signal handler won't do it for us
+      if (!g_assert_context->_conf->fatal_signal_handlers) {
+        g_assert_context->_log->dump_recent();
+      }
+    }
+
+    abort();
+  }
+
+  // printf-style variant of __ceph_abort(): records "abort" as the failed
+  // condition, builds the message (followed by "Abort details: " and the
+  // formatted msg) in g_assert_msg, emits it together with a backtrace, then
+  // abort()s.
+  [[gnu::cold]] void __ceph_abortf(const char *file, int line,
+                                   const char *func, const char* msg,
+                                   ...)
+  {
+    ostringstream tss;
+    tss << ceph_clock_now();
+
+    g_assert_condition = "abort";
+    g_assert_file = file;
+    g_assert_line = line;
+    g_assert_func = func;
+    g_assert_thread = (unsigned long long)pthread_self();
+    ceph_pthread_getname(pthread_self(), g_assert_thread_name,
+                         sizeof(g_assert_thread_name));
+
+    BufAppender ba(g_assert_msg, sizeof(g_assert_msg));
+    // capture the backtrace in an automatic object: no raw new on the crash
+    // path and nothing left unowned
+    BackTrace bt(1);
+    ba.printf("%s: In function '%s' thread %llx time %s\n"
+              "%s: %d: abort()\n",
+              file, func, (unsigned long long)pthread_self(), tss.str().c_str(),
+              file, line);
+    ba.printf("Abort details: ");
+    va_list args;
+    va_start(args, msg);
+    ba.vprintf(msg, args);
+    va_end(args);
+    ba.printf("\n");
+    dout_emergency(g_assert_msg);
+
+    // TODO: get rid of this memory allocation.
+    ostringstream oss;
+    oss << bt;
+    dout_emergency(oss.str());
+
+    if (g_assert_context) {
+      lderr(g_assert_context) << g_assert_msg << std::endl;
+      *_dout << oss.str() << dendl;
+
+      // dump recent only if the abort signal handler won't do it for us
+      if (!g_assert_context->_conf->fatal_signal_handlers) {
+        g_assert_context->_log->dump_recent();
+      }
+    }
+
+    abort();
+  }
+
+  // Format a warning about a failed assertion and emit it through
+  // dout_emergency().  Unlike the __ceph_assert_fail() family this returns to
+  // the caller instead of aborting.
+  [[gnu::cold]] void __ceph_assert_warn(const char *assertion,
+                                        const char *file,
+                                        int line, const char *func)
+  {
+    char warning[8096];
+    snprintf(warning, sizeof(warning),
+             "WARNING: ceph_assert(%s) at: %s: %d: %s()\n",
+             assertion, file, line, func);
+    dout_emergency(warning);
+  }
+}
diff --git a/src/common/async/bind_handler.h b/src/common/async/bind_handler.h
new file mode 100644
index 00000000..516d8a5e
--- /dev/null
+++ b/src/common/async/bind_handler.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNC_BIND_HANDLER_H
+#define CEPH_ASYNC_BIND_HANDLER_H
+
+#include <tuple>
+#include <boost/asio.hpp>
+
+namespace ceph::async {
+
+/**
+ * A bound completion handler for use with boost::asio.
+ *
+ * A completion handler wrapper that allows a tuple of arguments to be forwarded
+ * to the original Handler. This is intended for use with boost::asio functions
+ * like defer(), dispatch() and post() which expect handlers which are callable
+ * with no arguments.
+ *
+ * The original Handler's associated allocator and executor are maintained.
+ *
+ * @see bind_handler
+ */
+template <typename Handler, typename Tuple>
+struct CompletionHandler {
+  Handler handler;  // the wrapped callable
+  Tuple args;       // arguments to pass to 'handler' on invocation
+
+  // both parameters are taken by rvalue reference and moved into the members
+  CompletionHandler(Handler&& handler, Tuple&& args)
+    : handler(std::move(handler)),
+      args(std::move(args))
+  {}
+
+  // lvalue invocations pass handler and args as lvalues, leaving both intact
+  void operator()() & {
+    std::apply(handler, args);
+  }
+  void operator()() const & {
+    std::apply(handler, args);
+  }
+  // rvalue invocation, i.e. std::move(h)(): moves both handler and args into
+  // the call, so this is the overload that supports move-only arguments
+  void operator()() && {
+    std::apply(std::move(handler), std::move(args));
+  }
+
+  // expose the wrapped Handler's associated allocator as our own
+  using allocator_type = boost::asio::associated_allocator_t<Handler>;
+  allocator_type get_allocator() const noexcept {
+    return boost::asio::get_associated_allocator(handler);
+  }
+};
+
+} // namespace ceph::async
+
+namespace boost::asio {
+
+// specialize boost::asio::associated_executor<> for CompletionHandler so
+// that the wrapper reports the executor associated with the Handler it
+// wraps, falling back to 'ex' as get_associated_executor() does
+template <typename Handler, typename Tuple, typename Executor>
+struct associated_executor<ceph::async::CompletionHandler<Handler, Tuple>, Executor> {
+  using type = boost::asio::associated_executor_t<Handler, Executor>;
+
+  static type get(const ceph::async::CompletionHandler<Handler, Tuple>& handler,
+                  const Executor& ex = Executor()) noexcept {
+    return boost::asio::get_associated_executor(handler.handler, ex);
+  }
+};
+
+} // namespace boost::asio
+
+namespace ceph::async {
+
+/**
+ * Returns a wrapped completion handler with bound arguments.
+ *
+ * Binds the given arguments to a handler, and returns a CompletionHandler that
+ * is callable with no arguments. This is similar to std::bind(), except that
+ * all arguments must be provided. Move-only argument types are supported as
+ * long as the CompletionHandler's 'operator() &&' overload is used, i.e.
+ * std::move(handler)().
+ *
+ * Example use:
+ *
+ *   // bind the arguments (5, "hello") to a callback lambda:
+ *   auto callback = [] (int a, std::string b) {};
+ *   auto handler = bind_handler(callback, 5, "hello");
+ *
+ *   // execute the bound handler on an io_context:
+ *   boost::asio::io_context context;
+ *   boost::asio::post(context, std::move(handler));
+ *   context.run();
+ *
+ * @see CompletionHandler
+ */
+// The arguments are packed with std::make_tuple(), so they are stored as
+// decayed copies inside the returned CompletionHandler.
+// NOTE(review): the CompletionHandler type is deduced from its
+// (Handler&&, Tuple&&) constructor; confirm that passing an lvalue handler
+// deduces as intended.
+template <typename Handler, typename ...Args>
+auto bind_handler(Handler&& h, Args&& ...args)
+{
+  return CompletionHandler{std::forward<Handler>(h),
+                           std::make_tuple(std::forward<Args>(args)...)};
+}
+
+} // namespace ceph::async
+
+#endif // CEPH_ASYNC_BIND_HANDLER_H
diff --git a/src/common/async/completion.h b/src/common/async/completion.h
new file mode 100644
index 00000000..6af9109d
--- /dev/null
+++ b/src/common/async/completion.h
@@ -0,0 +1,320 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNC_COMPLETION_H
+#define CEPH_ASYNC_COMPLETION_H
+
+#include <memory>
+
+#include "bind_handler.h"
+#include "forward_handler.h"
+
+namespace ceph::async {
+
+/**
+ * Abstract completion handler interface for use with boost::asio.
+ *
+ * Memory management is performed using the Handler's 'associated allocator',
+ * which carries the additional requirement that its memory be released before
+ * the Handler is invoked. This allows memory allocated for one asynchronous
+ * operation to be reused in its continuation. Because of this requirement, any
+ * calls to invoke the completion must first release ownership of it. To enforce
+ * this, the static functions defer()/dispatch()/post() take the completion by
+ * rvalue-reference to std::unique_ptr<Completion>, i.e. std::move(completion).
+ *
+ * Handlers may also have an 'associated executor', so the calls to defer(),
+ * dispatch(), and post() are forwarded to that executor. If there is no
+ * associated executor (which is generally the case unless one was bound with
+ * boost::asio::bind_executor()), the executor passed to Completion::create()
+ * is used as a default.
+ *
+ * Example use:
+ *
+ *   // declare a Completion type with Signature = void(int, string)
+ *   using MyCompletion = ceph::async::Completion<void(int, string)>;
+ *
+ *   // create a completion with the given callback:
+ *   std::unique_ptr<MyCompletion> c;
+ *   c = MyCompletion::create(ex, [] (int a, const string& b) {});
+ *
+ *   // bind arguments to the callback and post to its associated executor:
+ *   MyCompletion::post(std::move(c), 5, "hello");
+ *
+ *
+ * Additional user data may be stored along with the Completion to take
+ * advantage of the handler allocator optimization. This is accomplished by
+ * specifying its type in the template parameter T. For example, the type
+ * Completion<void(), int> contains a public member variable 'int user_data'.
+ * Any additional arguments to Completion::create() will be forwarded to type
+ * T's constructor.
+ *
+ * If the AsBase<T> type tag is used, as in Completion<void(), AsBase<T>>,
+ * the Completion will inherit from T instead of declaring it as a member
+ * variable.
+ *
+ * When invoking the completion handler via defer(), dispatch(), or post(),
+ * care must be taken when passing arguments that refer to user data, because
+ * its memory is destroyed prior to invocation. In such cases, the user data
+ * should be moved/copied out of the Completion first.
+ */
+template <typename Signature, typename T = void>
+class Completion;
+
+
+/// type tag for UserData: Completion<Sig, AsBase<T>> derives from T instead
+/// of holding a T member
+template <typename T> struct AsBase {};
+
+namespace detail {
+
+/// optional user data to be stored with the Completion. the primary template
+/// holds it in the public member 'user_data', constructed from whatever
+/// arguments are forwarded to the constructor
+template <typename T>
+struct UserData {
+  T user_data;
+  template <typename ...Args>
+  UserData(Args&& ...args)
+    : user_data(std::forward<Args>(args)...)
+  {}
+};
+// AsBase specialization inherits from T, forwarding the constructor
+// arguments to T's constructor
+template <typename T>
+struct UserData<AsBase<T>> : public T {
+  template <typename ...Args>
+  UserData(Args&& ...args)
+    : T(std::forward<Args>(args)...)
+  {}
+};
+// void specialization: no user data at all. declared with the same class-key
+// as the primary template to avoid mismatched-tags warnings
+template <>
+struct UserData<void> {};
+
+} // namespace detail
+
+
+// template specialization to pull the Signature's args apart
+template <typename T, typename ...Args>
+class Completion<void(Args...), T> : public detail::UserData<T> {
+ protected:
+  // internal interfaces for type-erasure on the Handler/Executor. uses
+  // tuple<Args...> to provide perfect forwarding because you can't make
+  // virtual function templates
+  //
+  // the destroy_*() overrides are responsible for destroying and freeing the
+  // completion itself before handing the bound handler to the executor
+  virtual void destroy_defer(std::tuple<Args...>&& args) = 0;
+  virtual void destroy_dispatch(std::tuple<Args...>&& args) = 0;
+  virtual void destroy_post(std::tuple<Args...>&& args) = 0;
+  virtual void destroy() = 0;
+
+  // constructor is protected, use create(). any constructor arguments are
+  // forwarded to UserData
+  template <typename ...TArgs>
+  Completion(TArgs&& ...args)
+    : detail::UserData<T>(std::forward<TArgs>(args)...)
+  {}
+ public:
+  virtual ~Completion() = default;
+
+  // use the virtual destroy() interface on delete. this allows the derived
+  // class to manage its memory using Handler allocators, without having to use
+  // a custom Deleter for std::unique_ptr<>
+  static void operator delete(void *p) {
+    static_cast<Completion*>(p)->destroy();
+  }
+
+  /// completion factory function that uses the handler's associated allocator.
+  /// any additional arguments are forwarded to T's constructor
+  template <typename Executor1, typename Handler, typename ...TArgs>
+  static std::unique_ptr<Completion>
+  create(const Executor1& ex1, Handler&& handler, TArgs&& ...args);
+
+  /// take ownership of the completion, bind any arguments to the completion
+  /// handler, then defer() it on its associated executor
+  template <typename ...Args2>
+  static void defer(std::unique_ptr<Completion>&& c, Args2&&...args);
+
+  /// take ownership of the completion, bind any arguments to the completion
+  /// handler, then dispatch() it on its associated executor
+  template <typename ...Args2>
+  static void dispatch(std::unique_ptr<Completion>&& c, Args2&&...args);
+
+  /// take ownership of the completion, bind any arguments to the completion
+  /// handler, then post() it to its associated executor
+  template <typename ...Args2>
+  static void post(std::unique_ptr<Completion>&& c, Args2&&...args);
+};
+
+namespace detail {
+
+// concrete Completion that knows how to invoke the completion handler. this
+// observes all of the 'Requirements on asynchronous operations' specified by
+// the C++ Networking TS
+//
+// each destroy_*() function follows the same sequence: take the work guards,
+// fetch the handler's allocator, move the handler and arguments into a bound
+// function object, destroy and deallocate *this, and only then submit the
+// function object to the handler's executor
+template <typename Executor1, typename Handler, typename T, typename ...Args>
+class CompletionImpl final : public Completion<void(Args...), T> {
+  // use Handler's associated executor (or Executor1 by default) for callbacks
+  using Executor2 = boost::asio::associated_executor_t<Handler, Executor1>;
+  // maintain work on both executors
+  using Work1 = boost::asio::executor_work_guard<Executor1>;
+  using Work2 = boost::asio::executor_work_guard<Executor2>;
+  std::pair<Work1, Work2> work;
+  Handler handler;
+
+  // use Handler's associated allocator
+  using Alloc2 = boost::asio::associated_allocator_t<Handler>;
+  using Traits2 = std::allocator_traits<Alloc2>;
+  using RebindAlloc2 = typename Traits2::template rebind_alloc<CompletionImpl>;
+  using RebindTraits2 = std::allocator_traits<RebindAlloc2>;
+
+  // placement new for the handler allocator
+  static void* operator new(size_t, RebindAlloc2 alloc2) {
+    return RebindTraits2::allocate(alloc2, 1);
+  }
+  // placement delete for when the constructor throws during placement new
+  static void operator delete(void *p, RebindAlloc2 alloc2) {
+    RebindTraits2::deallocate(alloc2, static_cast<CompletionImpl*>(p), 1);
+  }
+
+  static auto bind_and_forward(Handler&& h, std::tuple<Args...>&& args) {
+    return forward_handler(CompletionHandler{std::move(h), std::move(args)});
+  }
+
+  void destroy_defer(std::tuple<Args...>&& args) override {
+    auto w = std::move(work);
+    // fetch the allocator while 'handler' is still intact; it is moved from
+    // on the next line and must not be inspected afterwards
+    RebindAlloc2 alloc2 = boost::asio::get_associated_allocator(handler);
+    auto f = bind_and_forward(std::move(handler), std::move(args));
+    RebindTraits2::destroy(alloc2, this);
+    RebindTraits2::deallocate(alloc2, this, 1);
+    w.second.get_executor().defer(std::move(f), alloc2);
+  }
+  void destroy_dispatch(std::tuple<Args...>&& args) override {
+    auto w = std::move(work);
+    // allocator first, then move the handler (see destroy_defer)
+    RebindAlloc2 alloc2 = boost::asio::get_associated_allocator(handler);
+    auto f = bind_and_forward(std::move(handler), std::move(args));
+    RebindTraits2::destroy(alloc2, this);
+    RebindTraits2::deallocate(alloc2, this, 1);
+    w.second.get_executor().dispatch(std::move(f), alloc2);
+  }
+  void destroy_post(std::tuple<Args...>&& args) override {
+    auto w = std::move(work);
+    // allocator first, then move the handler (see destroy_defer)
+    RebindAlloc2 alloc2 = boost::asio::get_associated_allocator(handler);
+    auto f = bind_and_forward(std::move(handler), std::move(args));
+    RebindTraits2::destroy(alloc2, this);
+    RebindTraits2::deallocate(alloc2, this, 1);
+    w.second.get_executor().post(std::move(f), alloc2);
+  }
+  // destroy without invoking the handler
+  void destroy() override {
+    RebindAlloc2 alloc2 = boost::asio::get_associated_allocator(handler);
+    RebindTraits2::destroy(alloc2, this);
+    RebindTraits2::deallocate(alloc2, this, 1);
+  }
+
+  // constructor is private, use create(). extra constructor arguments are
+  // forwarded to UserData
+  //
+  // 'work' is declared before 'handler', so the work guard is built from the
+  // handler parameter before that parameter is moved into the member
+  template <typename ...TArgs>
+  CompletionImpl(const Executor1& ex1, Handler&& handler, TArgs&& ...args)
+    : Completion<void(Args...), T>(std::forward<TArgs>(args)...),
+      work(ex1, boost::asio::make_work_guard(handler, ex1)),
+      handler(std::move(handler))
+  {}
+
+ public:
+  // allocate with the handler's associated allocator and construct in place
+  template <typename ...TArgs>
+  static auto create(const Executor1& ex, Handler&& handler, TArgs&& ...args) {
+    auto alloc2 = boost::asio::get_associated_allocator(handler);
+    using Ptr = std::unique_ptr<CompletionImpl>;
+    return Ptr{new (alloc2) CompletionImpl(ex, std::move(handler),
+                                           std::forward<TArgs>(args)...)};
+  }
+
+  static void operator delete(void *p) {
+    static_cast<CompletionImpl*>(p)->destroy();
+  }
+};
+
+} // namespace detail
+
+
+// instantiate the concrete detail::CompletionImpl for this Executor1/Handler
+// pair and hand it back through the type-erased Completion interface
+template <typename T, typename ...Args>
+template <typename Executor1, typename Handler, typename ...TArgs>
+std::unique_ptr<Completion<void(Args...), T>>
+Completion<void(Args...), T>::create(const Executor1& ex,
+                                     Handler&& handler, TArgs&& ...args)
+{
+  using Impl = detail::CompletionImpl<Executor1, Handler, T, Args...>;
+  return Impl::create(ex, std::forward<Handler>(handler),
+                      std::forward<TArgs>(args)...);
+}
+
+template <typename T, typename ...Args>
+template <typename ...Args2>
+void Completion<void(Args...), T>::defer(std::unique_ptr<Completion>&& ptr,
+                                         Args2&& ...args)
+{
+  // give up ownership first: destroy_defer() disposes of the completion
+  // itself, so the unique_ptr must no longer own it
+  Completion* const self = ptr.release();
+  self->destroy_defer(std::make_tuple(std::forward<Args2>(args)...));
+}
+
+template <typename T, typename ...Args>
+template <typename ...Args2>
+void Completion<void(Args...), T>::dispatch(std::unique_ptr<Completion>&& ptr,
+                                            Args2&& ...args)
+{
+  // give up ownership first: destroy_dispatch() disposes of the completion
+  // itself, so the unique_ptr must no longer own it
+  Completion* const self = ptr.release();
+  self->destroy_dispatch(std::make_tuple(std::forward<Args2>(args)...));
+}
+
+template <typename T, typename ...Args>
+template <typename ...Args2>
+void Completion<void(Args...), T>::post(std::unique_ptr<Completion>&& ptr,
+                                        Args2&& ...args)
+{
+  // give up ownership first: destroy_post() disposes of the completion
+  // itself, so the unique_ptr must no longer own it
+  Completion* const self = ptr.release();
+  self->destroy_post(std::make_tuple(std::forward<Args2>(args)...));
+}
+
+
+/// completion factory function that uses the handler's associated allocator.
+/// any additional arguments are forwarded to T's constructor. free-function
+/// spelling of Completion<Signature, T>::create(), to which it forwards
+template <typename Signature, typename T, typename Executor1,
+          typename Handler, typename ...TArgs>
+std::unique_ptr<Completion<Signature, T>>
+create_completion(const Executor1& ex, Handler&& handler, TArgs&& ...args)
+{
+  return Completion<Signature, T>::create(ex, std::forward<Handler>(handler),
+                                          std::forward<TArgs>(args)...);
+}
+
+/// take ownership of the completion, bind any arguments to the completion
+/// handler, then defer() it on its associated executor. free-function
+/// spelling of Completion<Signature, T>::defer(), to which it forwards
+template <typename Signature, typename T, typename ...Args>
+void defer(std::unique_ptr<Completion<Signature, T>>&& ptr, Args&& ...args)
+{
+  Completion<Signature, T>::defer(std::move(ptr), std::forward<Args>(args)...);
+}
+
+/// take ownership of the completion, bind any arguments to the completion
+/// handler, then dispatch() it on its associated executor. free-function
+/// spelling of Completion<Signature, T>::dispatch(), to which it forwards
+template <typename Signature, typename T, typename ...Args>
+void dispatch(std::unique_ptr<Completion<Signature, T>>&& ptr, Args&& ...args)
+{
+  Completion<Signature, T>::dispatch(std::move(ptr), std::forward<Args>(args)...);
+}
+
+/// take ownership of the completion, bind any arguments to the completion
+/// handler, then post() it to its associated executor. free-function
+/// spelling of Completion<Signature, T>::post(), to which it forwards
+template <typename Signature, typename T, typename ...Args>
+void post(std::unique_ptr<Completion<Signature, T>>&& ptr, Args&& ...args)
+{
+  Completion<Signature, T>::post(std::move(ptr), std::forward<Args>(args)...);
+}
+
+} // namespace ceph::async
+
+#endif // CEPH_ASYNC_COMPLETION_H
diff --git a/src/common/async/detail/shared_lock.h b/src/common/async/detail/shared_lock.h
new file mode 100644
index 00000000..12e6a922
--- /dev/null
+++ b/src/common/async/detail/shared_lock.h
@@ -0,0 +1,185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+namespace std {
+
+// specialize unique_lock and shared_lock for SharedMutex to operate on
+// SharedMutexImpl instead, because the locks may outlive the SharedMutex itself
+
+/// std::unique_lock specialization for ceph::async::SharedMutex. The guard
+/// shares ownership of the mutex's SharedMutexImpl (through an intrusive_ptr)
+/// instead of pointing at the SharedMutex, because the guard may outlive it.
+template <typename Executor>
+class unique_lock<ceph::async::SharedMutex<Executor>> {
+ public:
+  using mutex_type = boost::intrusive_ptr<ceph::async::detail::SharedMutexImpl>;
+
+  /// construct a guard that is not associated with any mutex
+  unique_lock() = default;
+  /// associate with 'm' and acquire its exclusive lock. any exception thrown
+  /// by lock() propagates to the caller
+  explicit unique_lock(ceph::async::SharedMutex<Executor>& m)
+    : impl(m.impl), locked(true)
+  {
+    impl->lock();
+  }
+  /// associate with 'm' without locking it
+  unique_lock(ceph::async::SharedMutex<Executor>& m, defer_lock_t) noexcept
+    : impl(m.impl)
+  {}
+  /// associate with 'm' and try to lock it; owns_lock() reports the outcome
+  unique_lock(ceph::async::SharedMutex<Executor>& m, try_to_lock_t)
+    : impl(m.impl), locked(impl->try_lock())
+  {}
+  /// associate with 'm', assuming the exclusive lock is already held
+  unique_lock(ceph::async::SharedMutex<Executor>& m, adopt_lock_t) noexcept
+    : impl(m.impl), locked(true)
+  {}
+  /// release the exclusive lock if this guard owns it
+  ~unique_lock() {
+    if (impl && locked)
+      impl->unlock();
+  }
+
+  /// take over the other guard's mutex and ownership state
+  unique_lock(unique_lock&& other) noexcept
+    : impl(std::move(other.impl)),
+      locked(other.locked) {
+    other.locked = false;
+  }
+  /// release any lock currently owned, then take over the other guard's state
+  unique_lock& operator=(unique_lock&& other) noexcept {
+    if (impl && locked) {
+      impl->unlock();
+    }
+    impl = std::move(other.impl);
+    locked = other.locked;
+    other.locked = false;
+    return *this;
+  }
+  /// exchange mutex and ownership state with another guard
+  void swap(unique_lock& other) noexcept {
+    using std::swap;
+    swap(impl, other.impl);
+    swap(locked, other.locked);
+  }
+
+  /// return the associated implementation pointer (may be null)
+  mutex_type mutex() const noexcept { return impl; }
+  /// true if a mutex is associated and this guard holds its exclusive lock
+  bool owns_lock() const noexcept { return impl && locked; }
+  explicit operator bool() const noexcept { return impl && locked; }
+
+  /// dissociate from the mutex without unlocking it, and return it. marked
+  /// noexcept to match the std::unique_lock interface
+  mutex_type release() noexcept {
+    auto result = std::move(impl);
+    locked = false;
+    return result;
+  }
+
+  /// acquire the exclusive lock.
+  /// throws operation_not_permitted if there is no associated mutex, or
+  /// resource_deadlock_would_occur if the lock is already owned
+  void lock() {
+    if (!impl)
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    if (locked)
+      throw system_error(make_error_code(errc::resource_deadlock_would_occur));
+    impl->lock();
+    locked = true;
+  }
+  /// try to acquire the exclusive lock and return whether it succeeded.
+  /// throws under the same conditions as lock()
+  bool try_lock() {
+    if (!impl)
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    if (locked)
+      throw system_error(make_error_code(errc::resource_deadlock_would_occur));
+    return locked = impl->try_lock();
+  }
+  /// release the exclusive lock.
+  /// throws operation_not_permitted if this guard does not own the lock
+  void unlock() {
+    if (!impl || !locked)
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    impl->unlock();
+    locked = false;
+  }
+ private:
+  mutex_type impl;    // declared before 'locked': try_to_lock init relies on it
+  bool locked{false}; // true while this guard owns the exclusive lock
+};
+
+/// std::shared_lock specialization for ceph::async::SharedMutex. The guard
+/// shares ownership of the mutex's SharedMutexImpl (through an intrusive_ptr)
+/// instead of pointing at the SharedMutex, because the guard may outlive it.
+template <typename Executor>
+class shared_lock<ceph::async::SharedMutex<Executor>> {
+ public:
+  using mutex_type = boost::intrusive_ptr<ceph::async::detail::SharedMutexImpl>;
+
+  /// construct a guard that is not associated with any mutex
+  shared_lock() = default;
+  /// associate with 'm' and acquire a shared lock. any exception thrown by
+  /// lock_shared() propagates to the caller
+  explicit shared_lock(ceph::async::SharedMutex<Executor>& m)
+    : impl(m.impl), locked(true)
+  {
+    impl->lock_shared();
+  }
+  /// associate with 'm' without locking it
+  shared_lock(ceph::async::SharedMutex<Executor>& m, defer_lock_t) noexcept
+    : impl(m.impl)
+  {}
+  /// associate with 'm' and try to lock it; owns_lock() reports the outcome
+  shared_lock(ceph::async::SharedMutex<Executor>& m, try_to_lock_t)
+    : impl(m.impl), locked(impl->try_lock_shared())
+  {}
+  /// associate with 'm', assuming a shared lock is already held
+  shared_lock(ceph::async::SharedMutex<Executor>& m, adopt_lock_t) noexcept
+    : impl(m.impl), locked(true)
+  {}
+
+  /// release the shared lock if this guard owns it
+  ~shared_lock() {
+    if (impl && locked)
+      impl->unlock_shared();
+  }
+
+  /// take over the other guard's mutex and ownership state
+  shared_lock(shared_lock&& other) noexcept
+    : impl(std::move(other.impl)),
+      locked(other.locked) {
+    other.locked = false;
+  }
+  /// release any lock currently owned, then take over the other guard's state
+  shared_lock& operator=(shared_lock&& other) noexcept {
+    if (impl && locked) {
+      impl->unlock_shared();
+    }
+    impl = std::move(other.impl);
+    locked = other.locked;
+    other.locked = false;
+    return *this;
+  }
+  /// exchange mutex and ownership state with another guard
+  void swap(shared_lock& other) noexcept {
+    using std::swap;
+    swap(impl, other.impl);
+    swap(locked, other.locked);
+  }
+
+  /// return the associated implementation pointer (may be null)
+  mutex_type mutex() const noexcept { return impl; }
+  /// true if a mutex is associated and this guard holds a shared lock on it
+  bool owns_lock() const noexcept { return impl && locked; }
+  explicit operator bool() const noexcept { return impl && locked; }
+
+  /// dissociate from the mutex without unlocking it, and return it. marked
+  /// noexcept to match the std::shared_lock interface
+  mutex_type release() noexcept {
+    auto result = std::move(impl);
+    locked = false;
+    return result;
+  }
+
+  /// acquire a shared lock.
+  /// throws operation_not_permitted if there is no associated mutex, or
+  /// resource_deadlock_would_occur if the lock is already owned
+  void lock() {
+    if (!impl)
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    if (locked)
+      throw system_error(make_error_code(errc::resource_deadlock_would_occur));
+    impl->lock_shared();
+    locked = true;
+  }
+  /// try to acquire a shared lock and return whether it succeeded.
+  /// throws under the same conditions as lock()
+  bool try_lock() {
+    if (!impl)
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    if (locked)
+      throw system_error(make_error_code(errc::resource_deadlock_would_occur));
+    return locked = impl->try_lock_shared();
+  }
+  /// release the shared lock.
+  /// throws operation_not_permitted if this guard does not own the lock
+  void unlock() {
+    if (!impl || !locked)
+      throw system_error(make_error_code(errc::operation_not_permitted));
+    impl->unlock_shared();
+    locked = false;
+  }
+ private:
+  mutex_type impl;    // declared before 'locked': try_to_lock init relies on it
+  bool locked{false}; // true while this guard owns a shared lock
+};
+
+} // namespace std
diff --git a/src/common/async/detail/shared_mutex.h b/src/common/async/detail/shared_mutex.h
new file mode 100644
index 00000000..8e543635
--- /dev/null
+++ b/src/common/async/detail/shared_mutex.h
@@ -0,0 +1,326 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <condition_variable>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <mutex>
+#include <optional>
+#include <shared_mutex> // for std::shared_lock
+
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/intrusive/list.hpp>
+
+#include "include/ceph_assert.h"
+
+#include "common/async/completion.h"
+
+namespace ceph::async::detail {
+
+/// Abstract base for a pending lock request. The list hook lets requests be
+/// linked into a boost::intrusive::list without extra allocation.
+struct LockRequest : public boost::intrusive::list_base_hook<> {
+  virtual ~LockRequest() = default;
+  /// deliver the outcome of the request (success or an error code)
+  virtual void complete(boost::system::error_code ec) = 0;
+  /// dispose of the request without completing it
+  virtual void destroy() = 0;
+};
+
+/// Shared state behind a SharedMutex: the current lock state and the queues
+/// of pending requests. It is reference-counted so that whoever holds an
+/// intrusive_ptr to it keeps it alive.
+class SharedMutexImpl : public boost::intrusive_ref_counter<SharedMutexImpl> {
+ public:
+  /// asserts that the mutex is unlocked and no requests are pending
+  ~SharedMutexImpl();
+
+  // exclusive lock operations
+  template <typename Mutex, typename CompletionToken>
+  auto async_lock(Mutex& mtx, CompletionToken&& token);
+  void lock();
+  void lock(boost::system::error_code& ec);
+  bool try_lock();
+  void unlock();
+
+  // shared lock operations
+  template <typename Mutex, typename CompletionToken>
+  auto async_lock_shared(Mutex& mtx, CompletionToken&& token);
+  void lock_shared();
+  void lock_shared(boost::system::error_code& ec);
+  bool try_lock_shared();
+  void unlock_shared();
+
+  /// fail all pending requests with operation_aborted
+  void cancel();
+
+ private:
+  using RequestList = boost::intrusive::list<LockRequest>;
+
+  RequestList shared_queue; ///< requests waiting on a shared lock
+  RequestList exclusive_queue; ///< requests waiting on an exclusive lock
+
+  /// lock state encodes the number of shared lockers, or 'max' for exclusive
+  using LockState = std::uint16_t;
+  static constexpr LockState Unlocked = 0;
+  static constexpr LockState Exclusive = std::numeric_limits<LockState>::max();
+  static constexpr LockState MaxShared = Exclusive - 1;
+  LockState state = Unlocked; ///< current lock state
+
+  std::mutex mutex; ///< protects lock state and wait queues
+
+  /// complete each request in the list with the given error code
+  void complete(RequestList&& requests, boost::system::error_code ec);
+};
+
+// A synchronous request: it lives on the waiting caller's stack, and the
+// caller blocks on a condition variable until complete() stores a result.
+//
+// NOTE(review): complete() stores the result and notifies without taking any
+// lock itself, while wait() reads it under the caller's mutex. Confirm that
+// every call site of complete() holds that mutex, otherwise the waiter can
+// race with the store or return while notify_one() is still running.
+class SyncRequest : public LockRequest {
+  std::condition_variable cond;
+  std::optional<boost::system::error_code> ec;
+ public:
+  /// block on 'lock' until a result has been stored, then return it
+  boost::system::error_code wait(std::unique_lock<std::mutex>& lock) {
+    while (!ec.has_value()) {
+      cond.wait(lock);
+    }
+    return *ec;
+  }
+  /// store the result and wake the waiter
+  void complete(boost::system::error_code result) override {
+    ec = result;
+    cond.notify_one();
+  }
+  /// no-op: the request is owned by the waiter's stack frame
+  void destroy() override {
+  }
+};
+
+// async requests use async::Completion to invoke a handler on its executor.
+// 'Lock' is the lock guard template (instantiated in this file with
+// std::unique_lock and std::shared_lock) handed to the completion handler.
+// The request is the user-data base (AsBase) of its own LockCompletion, which
+// is why 'this' can be cast to LockCompletion* below.
+template <typename Mutex, template <typename> typename Lock>
+class AsyncRequest : public LockRequest {
+  Mutex& mutex; //< mutex argument for lock guard
+ public:
+  explicit AsyncRequest(Mutex& mutex) : mutex(mutex) {}
+
+  // handlers receive the result and a lock guard for the mutex
+  using Signature = void(boost::system::error_code, Lock<Mutex>);
+  using LockCompletion = Completion<Signature, AsBase<AsyncRequest>>;
+
+  // deliver the result to the handler via post()
+  void complete(boost::system::error_code ec) override {
+    auto r = static_cast<LockCompletion*>(this);
+    // pass ownership of ourselves to post(). on error, pass a guard that
+    // refers to the mutex but does not own it (defer_lock); on success, pass
+    // a guard that adopts the lock already granted to this request
+    post(std::unique_ptr<LockCompletion>{r}, ec,
+         ec ? Lock{mutex, std::defer_lock} : Lock{mutex, std::adopt_lock});
+  }
+  // free the completion without invoking its handler
+  void destroy() override {
+    delete static_cast<LockCompletion*>(this);
+  }
+};
+
+// Destruction requires a quiescent mutex: no lock may be held and both wait
+// queues must already be empty, otherwise the assertions below fire.
+inline SharedMutexImpl::~SharedMutexImpl()
+{
+  ceph_assert(state == Unlocked);
+  ceph_assert(shared_queue.empty());
+  ceph_assert(exclusive_queue.empty());
+}
+
+// Initiate an asynchronous request for the exclusive lock on behalf of 'mtx'.
+// If the mutex is unlocked, the lock is taken immediately and a successful
+// completion is posted; otherwise a request is appended to exclusive_queue.
+// Returns whatever the completion token's async_result produces.
+template <typename Mutex, typename CompletionToken>
+auto SharedMutexImpl::async_lock(Mutex& mtx, CompletionToken&& token)
+{
+  using Request = AsyncRequest<Mutex, std::unique_lock>;
+  using Signature = typename Request::Signature;
+  boost::asio::async_completion<CompletionToken, Signature> init(token);
+  auto& handler = init.completion_handler;
+  // the mutex's executor is the fallback when the handler has none associated
+  auto ex1 = mtx.get_executor();
+  {
+    std::lock_guard lock{mutex};
+
+    boost::system::error_code ec;
+    if (state == Unlocked) {
+      state = Exclusive;
+
+      // post a successful completion
+      auto ex2 = boost::asio::get_associated_executor(handler, ex1);
+      auto alloc2 = boost::asio::get_associated_allocator(handler);
+      // the guard adopts the exclusive lock that was just taken above
+      auto b = bind_handler(std::move(handler), ec,
+                            std::unique_lock{mtx, std::adopt_lock});
+      ex2.post(forward_handler(std::move(b)), alloc2);
+    } else {
+      // create a request and add it to the exclusive list
+      using LockCompletion = typename Request::LockCompletion;
+      auto request = LockCompletion::create(ex1, std::move(handler), mtx);
+      // the queue is intrusive and non-owning, so ownership is released here
+      exclusive_queue.push_back(*request.release());
+    }
+  }
+  return init.result.get();
+}
+
+// Throwing overload: waits for the exclusive lock and converts any reported
+// error into a boost::system::system_error exception.
+inline void SharedMutexImpl::lock()
+{
+  boost::system::error_code error;
+  lock(error);
+  if (!error) {
+    return;
+  }
+  throw boost::system::system_error(error);
+}
+
+// Wait synchronously for the exclusive lock. If the mutex is unlocked the
+// lock is taken immediately and 'ec' is cleared; otherwise the calling thread
+// queues a stack-allocated request and blocks until it is completed, and 'ec'
+// receives the completion's error code.
+//
+// Declared inline: this is a non-template function defined in a header, so
+// without it every translation unit including the header would emit its own
+// definition and violate the one-definition rule.
+inline void SharedMutexImpl::lock(boost::system::error_code& ec)
+{
+  std::unique_lock lock{mutex};
+
+  if (state == Unlocked) {
+    state = Exclusive;
+    ec.clear();
+  } else {
+    SyncRequest request;
+    exclusive_queue.push_back(request);
+    ec = request.wait(lock);
+  }
+}
+
+// Take the exclusive lock only if the mutex is currently unlocked. Returns
+// whether the lock was acquired; never queues a request.
+inline bool SharedMutexImpl::try_lock()
+{
+  std::lock_guard guard{mutex};
+
+  const bool acquired = (state == Unlocked);
+  if (acquired) {
+    state = Exclusive;
+  }
+  return acquired;
+}
+
+// Release the exclusive lock and hand it to the next waiter(s). A pending
+// exclusive request takes priority; otherwise up to MaxShared pending shared
+// requests are granted at once. The granted requests are completed after the
+// internal mutex has been released.
+//
+// Declared inline: this is a non-template function defined in a header, so
+// without it every translation unit including the header would emit its own
+// definition and violate the one-definition rule.
+inline void SharedMutexImpl::unlock()
+{
+  RequestList granted;
+  {
+    std::lock_guard lock{mutex};
+    ceph_assert(state == Exclusive);
+
+    if (!exclusive_queue.empty()) {
+      // grant next exclusive lock. state stays Exclusive
+      auto& request = exclusive_queue.front();
+      exclusive_queue.pop_front();
+      granted.push_back(request);
+    } else {
+      // grant shared locks, if any. compare the full-width queue size against
+      // MaxShared before narrowing it to LockState, so that a queue longer
+      // than the LockState range cannot wrap around (e.g. to Unlocked)
+      const auto waiting = shared_queue.size();
+      if (waiting > MaxShared) {
+        state = MaxShared;
+        auto end = std::next(shared_queue.begin(), MaxShared);
+        granted.splice(granted.end(), shared_queue,
+                       shared_queue.begin(), end, MaxShared);
+      } else {
+        state = static_cast<LockState>(waiting);
+        granted.splice(granted.end(), shared_queue);
+      }
+    }
+  }
+  complete(std::move(granted), boost::system::error_code{});
+}
+
+// Initiate an asynchronous request for a shared lock on behalf of 'mtx'. The
+// lock is granted immediately only if no exclusive request is waiting and the
+// state is below MaxShared; otherwise a request is appended to shared_queue.
+// Returns whatever the completion token's async_result produces.
+template <typename Mutex, typename CompletionToken>
+auto SharedMutexImpl::async_lock_shared(Mutex& mtx, CompletionToken&& token)
+{
+  using Request = AsyncRequest<Mutex, std::shared_lock>;
+  using Signature = typename Request::Signature;
+  boost::asio::async_completion<CompletionToken, Signature> init(token);
+  auto& handler = init.completion_handler;
+  // the mutex's executor is the fallback when the handler has none associated
+  auto ex1 = mtx.get_executor();
+  {
+    std::lock_guard lock{mutex};
+
+    boost::system::error_code ec;
+    if (exclusive_queue.empty() && state < MaxShared) {
+      state++;
+
+      // post a successful completion whose guard adopts the new shared lock
+      auto ex2 = boost::asio::get_associated_executor(handler, ex1);
+      auto alloc2 = boost::asio::get_associated_allocator(handler);
+      auto b = bind_handler(std::move(handler), ec,
+                            std::shared_lock{mtx, std::adopt_lock});
+      ex2.post(forward_handler(std::move(b)), alloc2);
+    } else {
+      // create a request and add it to the shared list. the queue is
+      // intrusive and non-owning, so ownership is released here
+      using LockCompletion = typename Request::LockCompletion;
+      auto request = LockCompletion::create(ex1, std::move(handler), mtx);
+      shared_queue.push_back(*request.release());
+    }
+  }
+  return init.result.get();
+}
+
+inline void SharedMutexImpl::lock_shared()
+{
+  boost::system::error_code ec;
+  lock_shared(ec);
+  if (ec) {
+    throw boost::system::system_error(ec);
+  }
+}
+
+// Wait synchronously for a shared lock. The lock is taken immediately (and
+// 'ec' cleared) only if no exclusive request is waiting and the state is
+// below MaxShared; otherwise the calling thread queues a stack-allocated
+// request and blocks until it is completed, and 'ec' receives the
+// completion's error code.
+//
+// Declared inline: this is a non-template function defined in a header, so
+// without it every translation unit including the header would emit its own
+// definition and violate the one-definition rule.
+inline void SharedMutexImpl::lock_shared(boost::system::error_code& ec)
+{
+  std::unique_lock lock{mutex};
+
+  if (exclusive_queue.empty() && state < MaxShared) {
+    state++;
+    ec.clear();
+  } else {
+    SyncRequest request;
+    shared_queue.push_back(request);
+    ec = request.wait(lock);
+  }
+}
+
+// Take a shared lock only if it can be granted right away: no exclusive
+// request may be waiting and the state must be below MaxShared. Returns
+// whether the lock was acquired; never queues a request.
+inline bool SharedMutexImpl::try_lock_shared()
+{
+  std::lock_guard guard{mutex};
+
+  const bool available = exclusive_queue.empty() && state < MaxShared;
+  if (available) {
+    state++;
+  }
+  return available;
+}
+
+// Release one shared lock. Unlike unlock(), the next request (if any) is
+// completed while the internal mutex is still held.
+inline void SharedMutexImpl::unlock_shared()
+{
+  std::lock_guard lock{mutex};
+  // must currently be held in shared mode by at least one locker
+  ceph_assert(state != Unlocked && state <= MaxShared);
+
+  if (state == 1 && !exclusive_queue.empty()) {
+    // grant next exclusive lock
+    // (this was the last shared locker, so the state goes straight to
+    // Exclusive without passing through Unlocked)
+    state = Exclusive;
+    auto& request = exclusive_queue.front();
+    exclusive_queue.pop_front();
+    request.complete(boost::system::error_code{});
+  } else if (state == MaxShared && !shared_queue.empty() &&
+             exclusive_queue.empty()) {
+    // grant next shared lock
+    // (the released slot is handed directly to the waiter, so the count
+    // stays at MaxShared)
+    auto& request = shared_queue.front();
+    shared_queue.pop_front();
+    request.complete(boost::system::error_code{});
+  } else {
+    // nobody to hand the lock to; just drop one shared locker
+    state--;
+  }
+}
+
+// Fail every pending request with operation_aborted. Shared requests are
+// completed before exclusive ones; locks that are already held are unaffected.
+inline void SharedMutexImpl::cancel()
+{
+  RequestList aborted;
+  {
+    // detach both wait queues while holding the internal mutex
+    std::lock_guard guard{mutex};
+    aborted.splice(aborted.end(), shared_queue);
+    aborted.splice(aborted.end(), exclusive_queue);
+  }
+  // complete the detached requests after the mutex has been released
+  complete(std::move(aborted), boost::asio::error::operation_aborted);
+}
+
+void SharedMutexImpl::complete(RequestList&& requests,
+                               boost::system::error_code ec)
+{
+  while (!requests.empty()) {
+    auto& request = requests.front();
+    requests.pop_front();
+    try {
+      request.complete(ec);
+    } catch (...) {
+      // clean up any remaining completions and rethrow
+      requests.clear_and_dispose([] (LockRequest *r) { r->destroy(); });
+      throw;
+    }
+  }
+}
+
+} // namespace ceph::async::detail
diff --git a/src/common/async/forward_handler.h b/src/common/async/forward_handler.h
new file mode 100644
index 00000000..ae88cc83
--- /dev/null
+++ b/src/common/async/forward_handler.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNC_FORWARD_HANDLER_H
+#define CEPH_ASYNC_FORWARD_HANDLER_H
+
+#include <boost/asio.hpp>
+
+namespace ceph::async {
+
+/**
+ * A forwarding completion handler for use with boost::asio.
+ *
+ * A completion handler wrapper that invokes the handler's operator() as an
+ * rvalue, regardless of whether the wrapper is invoked as an lvalue or rvalue.
+ * This operation is potentially destructive to the wrapped handler, so is only
+ * suitable for single-use handlers.
+ *
+ * This is useful when combined with bind_handler() and move-only arguments,
+ * because executors will always call the lvalue overload of operator().
+ *
+ * The original Handler's associated allocator and executor are maintained.
+ *
+ * @see forward_handler
+ */
+template <typename Handler>
+struct ForwardingHandler {
+  Handler handler;
+
+  // take ownership of the wrapped handler
+  ForwardingHandler(Handler&& h)
+    : handler(std::move(h))
+  {}
+
+  // invoke the wrapped handler as an rvalue, forwarding all arguments
+  template <typename ...Ts>
+  void operator()(Ts&& ...ts) {
+    std::move(handler)(std::forward<Ts>(ts)...);
+  }
+
+  // expose the wrapped handler's associated allocator as our own
+  using allocator_type = boost::asio::associated_allocator_t<Handler>;
+  allocator_type get_allocator() const noexcept {
+    return boost::asio::get_associated_allocator(handler);
+  }
+};
+
+} // namespace ceph::async
+
+namespace boost::asio {
+
+// specialization of boost::asio::associated_executor<> for ForwardingHandler:
+// the wrapper reports the executor associated with the handler it wraps
+template <typename Handler, typename Executor>
+struct associated_executor<ceph::async::ForwardingHandler<Handler>, Executor> {
+  using type = boost::asio::associated_executor_t<Handler, Executor>;
+
+  static type get(const ceph::async::ForwardingHandler<Handler>& wrapper,
+                  const Executor& fallback = Executor()) noexcept {
+    return boost::asio::get_associated_executor(wrapper.handler, fallback);
+  }
+};
+
+} // namespace boost::asio
+
+namespace ceph::async {
+
+/**
+ * Returns a single-use completion handler that always forwards on operator().
+ *
+ * Wraps a completion handler such that it is always invoked as an rvalue. This
+ * is necessary when combining executors and bind_handler() with move-only
+ * argument types.
+ *
+ * Example use:
+ *
+ *   auto callback = [] (std::unique_ptr<int>&& p) {};
+ *   auto bound_handler = bind_handler(callback, std::make_unique<int>(5));
+ *   auto handler = forward_handler(std::move(bound_handler));
+ *
+ *   // execute the forwarding handler on an io_context:
+ *   boost::asio::io_context context;
+ *   boost::asio::post(context, std::move(handler));
+ *   context.run();
+ *
+ * @see ForwardingHandler
+ */
+template <typename Handler>
+auto forward_handler(Handler&& h)
+{
+  // the wrapper's template argument is deduced from the forwarded handler
+  return ForwardingHandler{std::forward<Handler>(h)};
+}
+
+} // namespace ceph::async
+
+#endif // CEPH_ASYNC_FORWARD_HANDLER_H
diff --git a/src/common/async/shared_mutex.h b/src/common/async/shared_mutex.h
new file mode 100644
index 00000000..3e471a4d
--- /dev/null
+++ b/src/common/async/shared_mutex.h
@@ -0,0 +1,212 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "common/async/detail/shared_mutex.h"
+
+namespace ceph::async {
+
+/**
+ * An asynchronous shared mutex for use with boost::asio.
+ *
+ * A shared mutex class with asynchronous lock operations that complete on a
+ * boost::asio executor. The class also has synchronous interfaces that meet
+ * most of the standard library's requirements for the SharedMutex concept,
+ * which makes it compatible with lock_guard, unique_lock, and shared_lock.
+ *
+ * All lock requests can fail with operation_aborted on cancel() or destruction.
+ * The non-error_code overloads of lock() and lock_shared() will throw this
+ * error as an exception of type boost::system::system_error.
+ *
+ * Exclusive locks are prioritized over shared locks. Locks of the same type
+ * are granted in fifo order. The implementation defines a limit on the number
+ * of shared locks to 65534 at a time.
+ *
+ * Example use:
+ *
+ *   boost::asio::io_context context;
+ *   SharedMutex mutex{context.get_executor()};
+ *
+ *   mutex.async_lock([&] (boost::system::error_code ec, auto lock) {
+ *       if (!ec) {
+ *         // mutate shared state ...
+ *       }
+ *     });
+ *   mutex.async_lock_shared([&] (boost::system::error_code ec, auto lock) {
+ *       if (!ec) {
+ *         // read shared state ...
+ *       }
+ *     });
+ *
+ *   context.run();
+ */
+template <typename Executor>
+class SharedMutex {
+ public:
+  /// construct an unlocked mutex whose completions default to executor 'ex'
+  explicit SharedMutex(const Executor& ex);
+
+  /// on destruction, all pending lock requests are canceled
+  ~SharedMutex();
+
+  // NOTE(review): no copy operations are declared, so the implicit ones would
+  // make a copy share 'impl' with the original - confirm whether copying is
+  // intended or should be deleted
+
+  using executor_type = Executor;
+  executor_type get_executor() const noexcept { return ex; }
+
+  /// initiate an asynchronous request for an exclusive lock. when the lock is
+  /// granted, the completion handler is invoked with a successful error code
+  /// and a std::unique_lock that owns this mutex.
+  /// Signature = void(boost::system::error_code, std::unique_lock)
+  template <typename CompletionToken>
+  auto async_lock(CompletionToken&& token);
+
+  /// wait synchronously for an exclusive lock. if an error occurs before the
+  /// lock is granted, that error is thrown as an exception
+  void lock();
+
+  /// wait synchronously for an exclusive lock. if an error occurs before the
+  /// lock is granted, that error is assigned to 'ec'
+  void lock(boost::system::error_code& ec);
+
+  /// try to acquire an exclusive lock. if the lock is not immediately
+  /// available, returns false
+  bool try_lock();
+
+  /// releases an exclusive lock. not required to be called from the same thread
+  /// that initiated the lock
+  void unlock();
+
+  /// initiate an asynchronous request for a shared lock. when the lock is
+  /// granted, the completion handler is invoked with a successful error code
+  /// and a std::shared_lock that owns this mutex.
+  /// Signature = void(boost::system::error_code, std::shared_lock)
+  template <typename CompletionToken>
+  auto async_lock_shared(CompletionToken&& token);
+
+  /// wait synchronously for a shared lock. if an error occurs before the
+  /// lock is granted, that error is thrown as an exception
+  void lock_shared();
+
+  /// wait synchronously for a shared lock. if an error occurs before the lock
+  /// is granted, that error is assigned to 'ec'
+  void lock_shared(boost::system::error_code& ec);
+
+  /// try to acquire a shared lock. if the lock is not immediately available,
+  /// returns false
+  bool try_lock_shared();
+
+  /// releases a shared lock. not required to be called from the same thread
+  /// that initiated the lock
+  void unlock_shared();
+
+  /// cancel any pending requests for exclusive or shared locks with an
+  /// operation_aborted error
+  void cancel();
+
+ private:
+  Executor ex; ///< default callback executor
+  /// reference-counted lock state; every operation above forwards to it
+  boost::intrusive_ptr<detail::SharedMutexImpl> impl;
+
+  // allow lock guards to access impl
+  friend class std::unique_lock<SharedMutex>;
+  friend class std::shared_lock<SharedMutex>;
+};
+
+
+/// stores the default executor and allocates a fresh SharedMutexImpl, whose
+/// ownership is held through the intrusive_ptr member 'impl'
+template <typename Executor>
+SharedMutex<Executor>::SharedMutex(const Executor& ex)
+  : ex(ex), impl(new detail::SharedMutexImpl)
+{
+}
+
+/// Cancels all pending lock requests. A destructor must not throw, so any
+/// exception raised while canceling is swallowed.
+template <typename Executor>
+SharedMutex<Executor>::~SharedMutex()
+{
+  try {
+    impl->cancel();
+  } catch (...) {
+    // swallow every exception, not only those derived from std::exception:
+    // anything escaping this (implicitly noexcept) destructor would call
+    // std::terminate()
+  }
+}
+
+/// forwards to SharedMutexImpl::async_lock(), passing this mutex along with
+/// the perfectly-forwarded completion token
+template <typename Executor>
+template <typename CompletionToken>
+auto SharedMutex<Executor>::async_lock(CompletionToken&& token)
+{
+  return impl->async_lock(*this, std::forward<CompletionToken>(token));
+}
+
+/// forwards to the throwing SharedMutexImpl::lock() overload
+template <typename Executor>
+void SharedMutex<Executor>::lock()
+{
+  impl->lock();
+}
+
+/// forwards to the error_code overload of SharedMutexImpl::lock()
+template <typename Executor>
+void SharedMutex<Executor>::lock(boost::system::error_code& ec)
+{
+  impl->lock(ec);
+}
+
+/// forwards to SharedMutexImpl::try_lock() and returns its result
+template <typename Executor>
+bool SharedMutex<Executor>::try_lock()
+{
+  return impl->try_lock();
+}
+
+/// forwards to SharedMutexImpl::unlock()
+template <typename Executor>
+void SharedMutex<Executor>::unlock()
+{
+  impl->unlock();
+}
+
+/// forwards to SharedMutexImpl::async_lock_shared(), passing this mutex along
+/// with the perfectly-forwarded completion token
+template <typename Executor>
+template <typename CompletionToken>
+auto SharedMutex<Executor>::async_lock_shared(CompletionToken&& token)
+{
+  return impl->async_lock_shared(*this, std::forward<CompletionToken>(token));
+}
+
+/// forwards to the throwing SharedMutexImpl::lock_shared() overload
+template <typename Executor>
+void SharedMutex<Executor>::lock_shared()
+{
+  impl->lock_shared();
+}
+
+/// forwards to the error_code overload of SharedMutexImpl::lock_shared()
+template <typename Executor>
+void SharedMutex<Executor>::lock_shared(boost::system::error_code& ec)
+{
+  impl->lock_shared(ec);
+}
+
+/// forwards to SharedMutexImpl::try_lock_shared() and returns its result
+template <typename Executor>
+bool SharedMutex<Executor>::try_lock_shared()
+{
+  return impl->try_lock_shared();
+}
+
+/// forwards to SharedMutexImpl::unlock_shared()
+template <typename Executor>
+void SharedMutex<Executor>::unlock_shared()
+{
+  impl->unlock_shared();
+}
+
+/// forwards to SharedMutexImpl::cancel(); unlike the destructor, exceptions
+/// are not caught here
+template <typename Executor>
+void SharedMutex<Executor>::cancel()
+{
+  impl->cancel();
+}
+
+} // namespace ceph::async
+
+#include "common/async/detail/shared_lock.h"
diff --git a/src/common/async/yield_context.h b/src/common/async/yield_context.h
new file mode 100644
index 00000000..436192c0
--- /dev/null
+++ b/src/common/async/yield_context.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <boost/range/begin.hpp>
+#include <boost/range/end.hpp>
+#include <boost/asio/io_context.hpp>
+
+#include "acconfig.h"
+
+#ifndef HAVE_BOOST_CONTEXT
+
+// hide the dependencies on boost::context and boost::coroutines
+namespace boost::asio {
+struct yield_context;
+}
+
+#else // HAVE_BOOST_CONTEXT
+#ifndef BOOST_COROUTINES_NO_DEPRECATION_WARNING
+#define BOOST_COROUTINES_NO_DEPRECATION_WARNING
+#endif
+#include <boost/asio/spawn.hpp>
+
+#endif // HAVE_BOOST_CONTEXT
+
+
+/// optional-like wrapper for a boost::asio::yield_context and its associated
+/// boost::asio::io_context. operations that take an optional_yield argument
+/// will, when passed a non-empty yield context, suspend the coroutine instead
+/// of blocking the thread of execution
+class optional_yield {
+  boost::asio::io_context *c = nullptr;
+  boost::asio::yield_context *y = nullptr;
+ public:
+  /// type tag to construct an empty object
+  struct empty_t {};
+  optional_yield(empty_t) noexcept {}
+
+  /// construct a non-empty object that refers to the given io_context and
+  /// yield_context. only their addresses are stored
+  explicit optional_yield(boost::asio::io_context& ioc,
+                          boost::asio::yield_context& yield) noexcept
+    : c(&ioc), y(&yield) {}
+
+  /// implicit conversion to bool, returns true if non-empty
+  operator bool() const noexcept { return y != nullptr; }
+
+  /// return a reference to the associated io_context. only valid if non-empty
+  boost::asio::io_context& get_io_context() const noexcept { return *c; }
+
+  /// return a reference to the yield_context. only valid if non-empty
+  boost::asio::yield_context& get_yield_context() const noexcept { return *y; }
+};
+
+// type tag object to construct an empty optional_yield. declared inline so
+// that every translation unit including this header refers to the same object
+// instead of getting its own internal-linkage copy
+inline constexpr optional_yield::empty_t null_yield{};
diff --git a/src/common/autovector.h b/src/common/autovector.h
new file mode 100644
index 00000000..f52a585f
--- /dev/null
+++ b/src/common/autovector.h
@@ -0,0 +1,336 @@
+// Copyright (c) 2018-Present Red Hat Inc.  All rights reserved.
+//
+// Copyright (c) 2011-2018, Facebook, Inc.  All rights reserved.
+// This source code is licensed under both the GPLv2 and Apache 2.0 License
+
+#ifndef CEPH_AUTOVECTOR_H
+#define CEPH_AUTOVECTOR_H
+
+#include <algorithm>
+#include <cassert>
+#include <initializer_list>
+#include <iterator>
+#include <stdexcept>
+#include <vector>
+
+#include "include/ceph_assert.h"
+
+// A vector that leverages pre-allocated stack-based array to achieve better
+// performance for array with small amount of items.
+//
+// The interface resembles that of vector, but with less features since we aim
+// to solve the problem that we have in hand, rather than implementing a
+// full-fledged generic container.
+//
+// Currently we don't support:
+//  * reserve()/shrink_to_fit()
+//     If used correctly, in most cases, people should not touch the
+//     underlying vector at all.
+//  * random insert()/erase(), please only use push_back()/pop_back().
+//  * No move/swap operations. Each autovector instance has a
+//     stack-allocated array and if we want support move/swap operations, we
+//     need to copy the arrays other than just swapping the pointers. In this
+//     case we'll just explicitly forbid these operations since they may
+//     lead users to make false assumption by thinking they are inexpensive
+//     operations.
+//
+// Naming style of public methods almost follows that of the STL's.
+namespace ceph {
+
+template <class T, size_t kSize = 8>
+class autovector {
+ public:
+  // General STL-style container member types.
+  typedef T value_type;
+  typedef typename std::vector<T>::difference_type difference_type;
+  typedef typename std::vector<T>::size_type size_type;
+  typedef value_type& reference;
+  typedef const value_type& const_reference;
+  typedef value_type* pointer;
+  typedef const value_type* const_pointer;
+
+  // This class is the base for regular/const iterator
+  template <class TAutoVector, class TValueType>
+  class iterator_impl {
+   public:
+    // -- iterator traits
+    typedef iterator_impl<TAutoVector, TValueType> self_type;
+    typedef TValueType value_type;
+    typedef TValueType& reference;
+    typedef TValueType* pointer;
+    typedef typename TAutoVector::difference_type difference_type;
+    typedef std::random_access_iterator_tag iterator_category;
+
+    iterator_impl(TAutoVector* vect, size_t index)
+        : vect_(vect), index_(index) {};
+    iterator_impl(const iterator_impl&) = default;
+    ~iterator_impl() {}
+    iterator_impl& operator=(const iterator_impl&) = default;
+
+    // -- Advancement
+    // ++iterator
+    self_type& operator++() {
+      ++index_;
+      return *this;
+    }
+
+    // iterator++
+    self_type operator++(int) {
+      auto old = *this;
+      ++index_;
+      return old;
+    }
+
+    // --iterator
+    self_type& operator--() {
+      --index_;
+      return *this;
+    }
+
+    // iterator--
+    self_type operator--(int) {
+      auto old = *this;
+      --index_;
+      return old;
+    }
+
+    self_type operator-(difference_type len) const {
+      return self_type(vect_, index_ - len);
+    }
+
+    difference_type operator-(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ - other.index_;
+    }
+
+    self_type operator+(difference_type len) const {
+      return self_type(vect_, index_ + len);
+    }
+
+    self_type& operator+=(difference_type len) {
+      index_ += len;
+      return *this;
+    }
+
+    self_type& operator-=(difference_type len) {
+      index_ -= len;
+      return *this;
+    }
+
+    // -- Reference
+    reference operator*() {
+      ceph_assert(vect_->size() >= index_);
+      return (*vect_)[index_];
+    }
+
+    const_reference operator*() const {
+      ceph_assert(vect_->size() >= index_);
+      return (*vect_)[index_];
+    }
+
+    pointer operator->() {
+      ceph_assert(vect_->size() >= index_);
+      return &(*vect_)[index_];
+    }
+
+    const_pointer operator->() const {
+      ceph_assert(vect_->size() >= index_);
+      return &(*vect_)[index_];
+    }
+
+
+    // -- Logical Operators
+    bool operator==(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ == other.index_;
+    }
+
+    bool operator!=(const self_type& other) const { return !(*this == other); }
+
+    bool operator>(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ > other.index_;
+    }
+
+    bool operator<(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ < other.index_;
+    }
+
+    bool operator>=(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ >= other.index_;
+    }
+
+    bool operator<=(const self_type& other) const {
+      ceph_assert(vect_ == other.vect_);
+      return index_ <= other.index_;
+    }
+
+   private:
+    TAutoVector* vect_ = nullptr;
+    size_t index_ = 0;
+  };
+
+  typedef iterator_impl<autovector, value_type> iterator;
+  typedef iterator_impl<const autovector, const value_type> const_iterator;
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+  autovector() = default;
+
+  autovector(std::initializer_list<T> init_list) {
+    for (const T& item : init_list) {
+      push_back(item);
+    }
+  }
+
+  ~autovector() = default;
+
+  // -- Immutable operations
+  // Indicate if all data resides in in-stack data structure.
+  bool only_in_stack() const {
+    // If no element was inserted at all, the vector's capacity will be `0`.
+    return vect_.capacity() == 0;
+  }
+
+  size_type size() const { return num_stack_items_ + vect_.size(); }
+
+  // resize does not guarantee anything about the contents of the newly
+  // available elements
+  void resize(size_type n) {
+    if (n > kSize) {
+      vect_.resize(n - kSize);
+      num_stack_items_ = kSize;
+    } else {
+      vect_.clear();
+      num_stack_items_ = n;
+    }
+  }
+
+  bool empty() const { return size() == 0; }
+
+  const_reference operator[](size_type n) const {
+    ceph_assert(n < size());
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  reference operator[](size_type n) {
+    ceph_assert(n < size());
+    return n < kSize ? values_[n] : vect_[n - kSize];
+  }
+
+  const_reference at(size_type n) const {
+    ceph_assert(n < size());
+    return (*this)[n];
+  }
+
+  reference at(size_type n) {
+    ceph_assert(n < size());
+    return (*this)[n];
+  }
+
+  reference front() {
+    ceph_assert(!empty());
+    return *begin();
+  }
+
+  const_reference front() const {
+    ceph_assert(!empty());
+    return *begin();
+  }
+
+  reference back() {
+    ceph_assert(!empty());
+    return *(end() - 1);
+  }
+
+  const_reference back() const {
+    ceph_assert(!empty());
+    return *(end() - 1);
+  }
+
+  // -- Mutable Operations
+  void push_back(T&& item) {  // rvalue overload: moves `item` into the container
+    if (num_stack_items_ < kSize) {  // the inline array still has a free slot
+      values_[num_stack_items_++] = std::move(item);
+    } else {
+      vect_.push_back(std::move(item));  // a named rvalue reference is an lvalue, so move explicitly
+    }
+  }
+
+  void push_back(const T& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = item;
+    } else {
+      vect_.push_back(item);
+    }
+  }
+
+  template <class... Args>  // builds a value_type from `args`, then appends it through push_back(T&&)
+  void emplace_back(Args&&... args) {
+    push_back(value_type(std::forward<Args>(args)...));  // forward so rvalue arguments stay rvalues
+  }
+
+  void pop_back() {
+    ceph_assert(!empty());
+    if (!vect_.empty()) {
+      vect_.pop_back();
+    } else {
+      --num_stack_items_;
+    }
+  }
+
+  void clear() {
+    num_stack_items_ = 0;
+    vect_.clear();
+  }
+
+  // -- Copy and Assignment
+  autovector& assign(const autovector& other);
+
+  autovector(const autovector& other) { assign(other); }
+
+  autovector& operator=(const autovector& other) { return assign(other); }
+
+  // -- Iterator Operations
+  iterator begin() { return iterator(this, 0); }
+
+  const_iterator begin() const { return const_iterator(this, 0); }
+
+  iterator end() { return iterator(this, this->size()); }
+
+  const_iterator end() const { return const_iterator(this, this->size()); }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+ private:
+  size_type num_stack_items_ = 0;  // current number of items
+  value_type values_[kSize];       // the first `kSize` items
+  // used only if there are more than `kSize` items.
+  std::vector<T> vect_;
+};
+
+template <class T, size_t kSize>  // used by both the copy constructor and operator=
+autovector<T, kSize>& autovector<T, kSize>::assign(const autovector& other) {
+  // replace the overflow vector's contents with a copy of other's
+  vect_.assign(other.vect_.begin(), other.vect_.end());
+
+  // copy only the occupied prefix of the inline array, not all kSize slots
+  num_stack_items_ = other.num_stack_items_;
+  std::copy(other.values_, other.values_ + num_stack_items_, values_);
+
+  return *this;  // lets operator= return the result of assign() directly
+}
+}  // namespace ceph 
+#endif // CEPH_AUTOVECTOR_H
diff --git a/src/common/bit_str.cc b/src/common/bit_str.cc
new file mode 100644
index 00000000..f14b2daa
--- /dev/null
+++ b/src/common/bit_str.cc
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "common/bit_str.h"
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+
+static void _dump_bit_str(  // shared worker behind print_bit_str() and dump_bit_str()
+    uint64_t bits,  // flag word to describe
+    std::ostream *out,  // text sink; when null, `f` is used instead
+    ceph::Formatter *f,  // structured sink; asserted non-null when `out` is null and a bit is set
+    const std::function<const char*(uint64_t)> &func,  // names a single set bit; by reference so the callable is not copied per call
+    bool dump_bit_val)  // when true, also emit "(<numeric value>)" after each name
+{
+  uint64_t b = bits;  // copy of `bits` shifted right once per iteration, used only to stop early
+  int cnt = 0;  // position of the bit examined next
+  bool outted = false;  // becomes true once any flag has been emitted
+
+  while (b && cnt < 64) {  // b == 0 means no set bits remain at or above position cnt
+    uint64_t r = bits & (1ULL << cnt++);
+    if (r) {
+      if (out) {
+        if (outted)
+          *out << ",";  // comma-separate the second and later names
+        *out << func(r);
+        if (dump_bit_val) {
+          *out << "(" << r << ")";
+        }
+      } else {
+        ceph_assert(f != NULL);
+        if (dump_bit_val) {
+          f->dump_stream("bit_flag") << func(r)
+                                     << "(" << r << ")";
+        } else {
+          f->dump_stream("bit_flag") << func(r);
+        }
+      }
+      outted = true;
+    }
+    b >>= 1;
+  }
+  if (!outted && out)  // only the stream form prints a placeholder when no bit is set
+      *out << "none";
+}
+
+void print_bit_str(
+    uint64_t bits,
+    std::ostream &out,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val)
+{
+  _dump_bit_str(bits, &out, NULL, func, dump_bit_val);
+}
+
+void dump_bit_str(
+    uint64_t bits,
+    ceph::Formatter *f,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val)
+{
+  _dump_bit_str(bits, NULL, f, func, dump_bit_val);
+}
diff --git a/src/common/bit_str.h b/src/common/bit_str.h
new file mode 100644
index 00000000..5271c8ff
--- /dev/null
+++ b/src/common/bit_str.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_COMMON_BIT_STR_H
+#define CEPH_COMMON_BIT_STR_H
+
+#include <cstdint>
+#include <iosfwd>
+#include <functional>
+
+namespace ceph {
+  class Formatter;
+}
+
+extern void print_bit_str(
+    uint64_t bits,
+    std::ostream &out,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val = false);
+
+extern void dump_bit_str(
+    uint64_t bits,
+    ceph::Formatter *f,
+    const std::function<const char*(uint64_t)> &func,
+    bool dump_bit_val = false);
+
+#endif /* CEPH_COMMON_BIT_STR_H */
diff --git a/src/common/bit_vector.hpp b/src/common/bit_vector.hpp
new file mode 100644
index 00000000..0ffce1b2
--- /dev/null
+++ b/src/common/bit_vector.hpp
@@ -0,0 +1,652 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat <contact@redhat.com>
+ *
+ * LGPL2.1 (see COPYING-LGPL2.1) or later
+ */
+
+#ifndef BIT_VECTOR_HPP
+#define BIT_VECTOR_HPP
+
+#include "common/Formatter.h"
+#include "include/ceph_assert.h"
+#include "include/encoding.h"
+#include <memory>
+#include <utility>
+#include <vector>
+
+namespace ceph {
+
+template <uint8_t _bit_count>
+class BitVector
+{
+private:
+  static const uint8_t BITS_PER_BYTE = 8;
+  static const uint32_t ELEMENTS_PER_BLOCK = BITS_PER_BYTE / _bit_count;
+  static const uint8_t MASK = static_cast<uint8_t>((1 << _bit_count) - 1);
+
+  // must be power of 2
+  BOOST_STATIC_ASSERT((_bit_count != 0) && !(_bit_count & (_bit_count - 1)));
+  BOOST_STATIC_ASSERT(_bit_count <= BITS_PER_BYTE);
+
+  template <typename DataIterator>
+  class ReferenceImpl {
+  protected:
+    DataIterator m_data_iterator;
+    uint64_t m_shift;
+
+    ReferenceImpl(const DataIterator& data_iterator, uint64_t shift)
+      : m_data_iterator(data_iterator), m_shift(shift) {
+    }
+    ReferenceImpl(DataIterator&& data_iterator, uint64_t shift)
+      : m_data_iterator(std::move(data_iterator)), m_shift(shift) {
+    }
+
+  public:
+    inline operator uint8_t() const {
+      return (*m_data_iterator >> m_shift) & MASK;
+    }
+  };
+
+public:
+
+  class ConstReference : public ReferenceImpl<bufferlist::const_iterator> {
+  private:
+    friend class BitVector;
+
+    ConstReference(const bufferlist::const_iterator& data_iterator,
+                   uint64_t shift)
+      : ReferenceImpl<bufferlist::const_iterator>(data_iterator, shift) {
+    }
+    ConstReference(bufferlist::const_iterator&& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::const_iterator>(std::move(data_iterator),
+                                                  shift) {
+    }
+  };
+
+  class Reference : public ReferenceImpl<bufferlist::iterator> {
+  public:
+    Reference& operator=(uint8_t v);
+
+  private:
+    friend class BitVector;
+
+    Reference(const bufferlist::iterator& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::iterator>(data_iterator, shift) {
+    }
+    Reference(bufferlist::iterator&& data_iterator, uint64_t shift)
+      : ReferenceImpl<bufferlist::iterator>(std::move(data_iterator), shift) {
+    }
+  };
+
+public:
+  template <typename BitVectorT, typename DataIterator>
+  class IteratorImpl {
+  private:
+    friend class BitVector;
+
+    uint64_t m_offset = 0;
+    BitVectorT *m_bit_vector;
+
+    // cached derived values
+    uint64_t m_index = 0;
+    uint64_t m_shift = 0;
+    DataIterator m_data_iterator;
+
+    IteratorImpl(BitVectorT *bit_vector, uint64_t offset)
+      : m_bit_vector(bit_vector),
+        m_data_iterator(bit_vector->m_data.begin()) {
+      *this += offset;
+    }
+
+  public:
+    inline IteratorImpl& operator++() {
+      ++m_offset;
+
+      uint64_t index;
+      compute_index(m_offset, &index, &m_shift);
+
+      ceph_assert(index == m_index || index == m_index + 1);
+      if (index > m_index) {
+        m_index = index;
+        ++m_data_iterator;
+      }
+      return *this;
+    }
+    inline IteratorImpl& operator+=(uint64_t offset) {
+      m_offset += offset;
+      compute_index(m_offset, &m_index, &m_shift);
+      if (m_offset < m_bit_vector->size()) {
+        m_data_iterator.seek(m_index);
+      } else {
+        m_data_iterator = m_bit_vector->m_data.end();
+      }
+      return *this;
+    }
+
+    inline IteratorImpl operator++(int) {  // post-increment: advance *this, return the prior position
+      IteratorImpl previous(*this);  // snapshot taken before advancing
+      ++(*this);  // advance this iterator, not the snapshot
+      return previous;
+    }
+    inline IteratorImpl operator+(uint64_t offset) {
+      IteratorImpl iterator_impl(*this);
+      iterator_impl += offset;
+      return iterator_impl;
+    }
+
+    inline bool operator==(const IteratorImpl& rhs) const {
+      return (m_offset == rhs.m_offset && m_bit_vector == rhs.m_bit_vector);
+    }
+    inline bool operator!=(const IteratorImpl& rhs) const {
+      return (m_offset != rhs.m_offset || m_bit_vector != rhs.m_bit_vector);
+    }
+
+    inline ConstReference operator*() const {
+      return ConstReference(m_data_iterator, m_shift);
+    }
+    inline Reference operator*() {
+      return Reference(m_data_iterator, m_shift);
+    }
+  };
+
+  typedef IteratorImpl<const BitVector,
+                       bufferlist::const_iterator> ConstIterator;
+  typedef IteratorImpl<BitVector, bufferlist::iterator> Iterator;
+
+  static const uint32_t BLOCK_SIZE;
+  static const uint8_t BIT_COUNT = _bit_count;
+
+  BitVector();
+
+  inline ConstIterator begin() const {
+    return ConstIterator(this, 0);
+  }
+  inline ConstIterator end() const {
+    return ConstIterator(this, m_size);
+  }
+  inline Iterator begin() {
+    return Iterator(this, 0);
+  }
+  inline Iterator end() {
+    return Iterator(this, m_size);
+  }
+
+  void set_crc_enabled(bool enabled) {
+    m_crc_enabled = enabled;
+  }
+  void clear();
+
+  void resize(uint64_t elements);
+  uint64_t size() const;
+
+  const bufferlist& get_data() const;
+
+  Reference operator[](uint64_t offset);
+  ConstReference operator[](uint64_t offset) const;
+
+  void encode_header(bufferlist& bl) const;
+  void decode_header(bufferlist::const_iterator& it);
+  uint64_t get_header_length() const;
+
+  void encode_data(bufferlist& bl, uint64_t data_byte_offset,
+		   uint64_t byte_length) const;
+  void decode_data(bufferlist::const_iterator& it, uint64_t data_byte_offset);
+  void get_data_extents(uint64_t offset, uint64_t length,
+                        uint64_t *data_byte_offset,
+                        uint64_t *object_byte_offset,
+                        uint64_t *byte_length) const;
+
+  void encode_footer(bufferlist& bl) const;
+  void decode_footer(bufferlist::const_iterator& it);
+  uint64_t get_footer_offset() const;
+
+  void decode_header_crc(bufferlist::const_iterator& it);
+  void get_header_crc_extents(uint64_t *byte_offset,
+                              uint64_t *byte_length) const;
+
+  void encode_data_crcs(bufferlist& bl, uint64_t offset,
+                        uint64_t length) const;
+  void decode_data_crcs(bufferlist::const_iterator& it, uint64_t offset);
+  void get_data_crcs_extents(uint64_t offset, uint64_t length,
+                             uint64_t *byte_offset,
+                             uint64_t *byte_length) const;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& it);
+  void dump(Formatter *f) const;
+
+  bool operator==(const BitVector &b) const;
+
+  static void generate_test_instances(std::list<BitVector *> &o);
+private:
+  struct NoInitAllocator : public std::allocator<__u32> {
+    NoInitAllocator() {}
+    NoInitAllocator(const std::allocator<__u32>& alloc)
+      : std::allocator<__u32>(alloc) {
+    }
+
+    template <class U, class... Args>
+    void construct(U* p, Args&&... args) const {
+    }
+  };
+
+  bufferlist m_data;
+  uint64_t m_size;
+  bool m_crc_enabled;
+
+  mutable __u32 m_header_crc;
+  mutable std::vector<__u32, NoInitAllocator> m_data_crcs;
+
+  void resize(uint64_t elements, bool zero);
+
+  static void compute_index(uint64_t offset, uint64_t *index, uint64_t *shift);
+
+};
+
+template <uint8_t _b>
+const uint32_t BitVector<_b>::BLOCK_SIZE = 4096;
+
+template <uint8_t _b>
+BitVector<_b>::BitVector() : m_size(0), m_crc_enabled(true), m_header_crc(0)
+{
+}
+
+template <uint8_t _b>
+void BitVector<_b>::clear() {
+  m_data.clear();
+  m_data_crcs.clear();
+  m_size = 0;
+  m_header_crc = 0;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::resize(uint64_t size) {
+  resize(size, true);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::resize(uint64_t size, bool zero) {
+  uint64_t buffer_size = (size + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;
+  if (buffer_size > m_data.length()) {
+    if (zero) {
+      m_data.append_zero(buffer_size - m_data.length());
+    } else {
+      m_data.append(std::move(buffer::ptr(buffer_size - m_data.length())));
+    }
+  } else if (buffer_size < m_data.length()) {
+    bufferlist bl;
+    bl.substr_of(m_data, 0, buffer_size);
+    bl.swap(m_data);
+  }
+  m_size = size;
+
+  uint64_t block_count = (buffer_size + BLOCK_SIZE - 1) / BLOCK_SIZE;
+  m_data_crcs.resize(block_count);
+}
+
+template <uint8_t _b>
+uint64_t BitVector<_b>::size() const {
+  return m_size;
+}
+
+template <uint8_t _b>
+const bufferlist& BitVector<_b>::get_data() const {
+  return m_data;
+}
+
+template <uint8_t _b>  // maps element `offset` to a byte index into m_data and a bit shift within that byte
+void BitVector<_b>::compute_index(uint64_t offset, uint64_t *index, uint64_t *shift) {
+  *index = offset / ELEMENTS_PER_BLOCK;  // ELEMENTS_PER_BLOCK is BITS_PER_BYTE / bit count, i.e. elements per byte
+  *shift = ((ELEMENTS_PER_BLOCK - 1) - (offset % ELEMENTS_PER_BLOCK)) * _b;  // the first element of a byte gets the largest shift
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode_header(bufferlist& bl) const {
+  bufferlist header_bl;
+  ENCODE_START(1, 1, header_bl);
+  encode(m_size, header_bl);
+  ENCODE_FINISH(header_bl);
+  m_header_crc = header_bl.crc32c(0);
+
+  encode(header_bl, bl);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode_header(bufferlist::const_iterator& it) {
+  using ceph::decode;
+  bufferlist header_bl;
+  decode(header_bl, it);
+
+  auto header_it = header_bl.cbegin();
+  uint64_t size;
+  DECODE_START(1, header_it);
+  decode(size, header_it);
+  DECODE_FINISH(header_it);
+
+  resize(size, false);
+  m_header_crc = header_bl.crc32c(0);
+}
+
+template <uint8_t _b>
+uint64_t BitVector<_b>::get_header_length() const {
+  // 4 byte bl length, 6 byte encoding header, 8 byte size
+  return 18;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode_data(bufferlist& bl, uint64_t data_byte_offset,
+				uint64_t byte_length) const {
+  ceph_assert(data_byte_offset % BLOCK_SIZE == 0);
+  ceph_assert(data_byte_offset + byte_length == m_data.length() ||
+              byte_length % BLOCK_SIZE == 0);
+
+  uint64_t end_offset = data_byte_offset + byte_length;
+  while (data_byte_offset < end_offset) {
+    uint64_t len = std::min<uint64_t>(BLOCK_SIZE,
+                                      end_offset - data_byte_offset);
+
+    bufferlist bit;
+    bit.substr_of(m_data, data_byte_offset, len);
+    m_data_crcs[data_byte_offset / BLOCK_SIZE] = bit.crc32c(0);
+
+    bl.claim_append(bit);
+    data_byte_offset += BLOCK_SIZE;
+  }
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode_data(bufferlist::const_iterator& it,
+                                uint64_t data_byte_offset) {
+  ceph_assert(data_byte_offset % BLOCK_SIZE == 0);
+  if (it.end()) {
+    return;
+  }
+
+  uint64_t end_offset = data_byte_offset + it.get_remaining();
+  if (end_offset > m_data.length()) {
+    throw buffer::end_of_buffer();
+  }
+
+  bufferlist data;
+  if (data_byte_offset > 0) {
+    data.substr_of(m_data, 0, data_byte_offset);
+  }
+
+  while (data_byte_offset < end_offset) {
+    uint64_t len = std::min<uint64_t>(BLOCK_SIZE, end_offset - data_byte_offset);
+
+    bufferptr ptr;
+    it.copy_deep(len, ptr);
+
+    bufferlist bit;
+    bit.append(ptr);
+    if (m_crc_enabled &&
+	m_data_crcs[data_byte_offset / BLOCK_SIZE] != bit.crc32c(0)) {
+      throw buffer::malformed_input("invalid data block CRC");
+    }
+    data.append(bit);
+    data_byte_offset += bit.length();
+  }
+
+  if (m_data.length() > end_offset) {
+    bufferlist tail;
+    tail.substr_of(m_data, end_offset, m_data.length() - end_offset);
+    data.append(tail);
+  }
+  ceph_assert(data.length() == m_data.length());
+  data.swap(m_data);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::get_data_extents(uint64_t offset, uint64_t length,
+                                     uint64_t *data_byte_offset,
+                                     uint64_t *object_byte_offset,
+                                     uint64_t *byte_length) const {
+  // read BLOCK_SIZE-aligned chunks
+  ceph_assert(length > 0 && offset + length <= m_size);
+  uint64_t shift;
+  compute_index(offset, data_byte_offset, &shift);
+  *data_byte_offset -= (*data_byte_offset % BLOCK_SIZE);
+
+  uint64_t end_offset;
+  compute_index(offset + length - 1, &end_offset, &shift);
+  end_offset += (BLOCK_SIZE - (end_offset % BLOCK_SIZE));
+  ceph_assert(*data_byte_offset <= end_offset);
+
+  *object_byte_offset = get_header_length() + *data_byte_offset;
+  *byte_length = end_offset - *data_byte_offset;
+  if (*data_byte_offset + *byte_length > m_data.length()) {
+    *byte_length = m_data.length() - *data_byte_offset;
+  }
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode_footer(bufferlist& bl) const {
+  using ceph::encode;
+  bufferlist footer_bl;
+  if (m_crc_enabled) {
+    encode(m_header_crc, footer_bl);
+
+    __u32 size = m_data_crcs.size();
+    encode(size, footer_bl);
+    encode_data_crcs(footer_bl, 0, m_size);
+  }
+  encode(footer_bl, bl);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode_footer(bufferlist::const_iterator& it) {
+  using ceph::decode;
+  bufferlist footer_bl;
+  decode(footer_bl, it);
+
+  m_crc_enabled = (footer_bl.length() > 0);
+  if (m_crc_enabled) {
+    auto footer_it = footer_bl.cbegin();
+    decode_header_crc(footer_it);
+
+    __u32 data_src_size;
+    decode(data_src_size, footer_it);
+    decode_data_crcs(footer_it, 0);
+
+    uint64_t block_count = (m_data.length() + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    if (m_data_crcs.size() != block_count) {
+      throw buffer::malformed_input("invalid data block CRCs");
+    }
+  }
+}
+
+template <uint8_t _b>
+uint64_t BitVector<_b>::get_footer_offset() const {
+  return get_header_length() + m_data.length();
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode_header_crc(bufferlist::const_iterator& it) {
+  if (it.get_remaining() > 0) {
+    __u32 header_crc;
+    ceph::decode(header_crc, it);
+    if (m_header_crc != header_crc) {
+      throw buffer::malformed_input("incorrect header CRC");
+    }
+  }
+}
+
+template <uint8_t _b>
+void BitVector<_b>::get_header_crc_extents(uint64_t *byte_offset,
+                                           uint64_t *byte_length) const {
+  // footer is prefixed with a bufferlist length
+  *byte_offset = get_footer_offset() + sizeof(__u32);
+  *byte_length = sizeof(__u32);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode_data_crcs(bufferlist& bl, uint64_t offset,
+                                     uint64_t length) const {
+  if (length == 0) {
+    return;
+  }
+
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+  uint64_t crc_index = index / BLOCK_SIZE;
+
+  compute_index(offset + length - 1, &index, &shift);
+  uint64_t end_crc_index = index / BLOCK_SIZE;
+  while (crc_index <= end_crc_index) {
+    __u32 crc = m_data_crcs[crc_index++];
+    ceph::encode(crc, bl);
+  }
+}
+
+template <uint8_t _b>  // reads __u32 CRCs from `it` into m_data_crcs, starting at the block containing element `offset`
+void BitVector<_b>::decode_data_crcs(bufferlist::const_iterator& it,
+                                     uint64_t offset) {
+  if (it.end()) {  // nothing left to decode
+    return;
+  }
+
+  uint64_t index;
+  uint64_t shift;  // filled in by compute_index but not used here
+  compute_index(offset, &index, &shift);
+
+  uint64_t crc_index = index / BLOCK_SIZE;  // byte index -> index of its BLOCK_SIZE-byte block
+  uint64_t remaining = it.get_remaining() / sizeof(__u32);  // number of whole __u32 values left in `it`
+  while (remaining > 0) {
+    __u32 crc;
+    ceph::decode(crc, it);
+    m_data_crcs[crc_index++] = crc;  // NOTE(review): crc_index is not range-checked against m_data_crcs.size() here; confirm callers bound the input
+    --remaining;
+  }
+}
+
+template <uint8_t _b>
+void BitVector<_b>::get_data_crcs_extents(uint64_t offset, uint64_t length,
+                                          uint64_t *byte_offset,
+                                          uint64_t *byte_length) const {
+  // data CRCs immediately follow the header CRC
+  get_header_crc_extents(byte_offset, byte_length);
+  *byte_offset += *byte_length;
+
+  // skip past data CRC vector size
+  *byte_offset += sizeof(__u32);
+
+  // CRCs are computed over BLOCK_SIZE chunks
+  ceph_assert(length > 0 && offset + length <= m_size);
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+  uint64_t start_byte_offset =
+    *byte_offset + ((index / BLOCK_SIZE) * sizeof(__u32));
+
+  compute_index(offset + length, &index, &shift);
+  uint64_t end_byte_offset =
+    *byte_offset + (((index / BLOCK_SIZE) + 1) * sizeof(__u32));
+  ceph_assert(start_byte_offset < end_byte_offset);
+
+  *byte_offset = start_byte_offset;
+  *byte_length = end_byte_offset - start_byte_offset;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::encode(bufferlist& bl) const {
+  encode_header(bl);
+  encode_data(bl, 0, m_data.length());
+  encode_footer(bl);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::decode(bufferlist::const_iterator& it) {
+  decode_header(it);
+
+  bufferlist data_bl;
+  if (m_data.length() > 0) {
+    it.copy(m_data.length(), data_bl);
+  }
+
+  decode_footer(it);
+
+  auto data_it = data_bl.cbegin();
+  decode_data(data_it, 0);
+}
+
+template <uint8_t _b>
+void BitVector<_b>::dump(Formatter *f) const {
+  f->dump_unsigned("size", m_size);
+  f->open_array_section("bit_table");
+  for (unsigned i = 0; i < m_data.length(); ++i) {
+    f->dump_format("byte", "0x%02hhX", m_data[i]);
+  }
+  f->close_section();
+}
+
+template <uint8_t _b>
+bool BitVector<_b>::operator==(const BitVector &b) const {
+  return (this->m_size == b.m_size && this->m_data == b.m_data);
+}
+
+template <uint8_t _b>
+typename BitVector<_b>::Reference BitVector<_b>::operator[](uint64_t offset) {
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+
+  bufferlist::iterator data_iterator(m_data.begin());
+  data_iterator.seek(index);
+  return Reference(std::move(data_iterator), shift);
+}
+
+template <uint8_t _b>
+typename BitVector<_b>::ConstReference BitVector<_b>::operator[](uint64_t offset) const {
+  uint64_t index;
+  uint64_t shift;
+  compute_index(offset, &index, &shift);
+
+  bufferlist::const_iterator data_iterator(m_data.begin());
+  data_iterator.seek(index);
+  return ConstReference(std::move(data_iterator), shift);
+}
+
+template <uint8_t _b>
+typename BitVector<_b>::Reference& BitVector<_b>::Reference::operator=(uint8_t v) {
+  uint8_t mask = MASK << this->m_shift;
+  char packed_value = (*this->m_data_iterator & ~mask) |
+                      ((v << this->m_shift) & mask);
+  bufferlist::iterator it(this->m_data_iterator);
+  it.copy_in(1, &packed_value, true);
+  return *this;
+}
+
+template <uint8_t _b>
+void BitVector<_b>::generate_test_instances(std::list<BitVector *> &o) {
+  o.push_back(new BitVector());
+
+  BitVector *b = new BitVector();
+  const uint64_t radix = 1 << b->BIT_COUNT;
+  const uint64_t size = 1024;
+
+  b->resize(size, false);
+  for (uint64_t i = 0; i < size; ++i) {
+    (*b)[i] = rand() % radix;
+  }
+  o.push_back(b);
+}
+
+
+WRITE_CLASS_ENCODER(ceph::BitVector<2>)
+
+template <uint8_t _b>  // streams "ceph::BitVector<bits>(size=..., data=...)" to `out` and returns `out`
+inline std::ostream& operator<<(std::ostream& out, const ceph::BitVector<_b> &b)
+{
+  out << "ceph::BitVector<" << static_cast<unsigned>(_b)  // a bare uint8_t would be streamed as a character
+      << ">(size=" << b.size() << ", data=" << b.get_data() << ")";
+  return out;
+}
+}
+
+#endif // BIT_VECTOR_HPP
diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc
new file mode 100644
index 00000000..fbc370c7
--- /dev/null
+++ b/src/common/blkdev.cc
@@ -0,0 +1,1346 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+
+#ifdef __FreeBSD__
+#include <sys/param.h>
+#include <geom/geom_disk.h>
+#include <sys/disk.h>
+#include <fcntl.h>
+#endif
+
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <boost/algorithm/string/replace.hpp>
+//#include "common/debug.h"
+#include "include/scope_guard.h"
+#include "include/uuid.h"
+#include "include/stringify.h"
+#include "blkdev.h"
+#include "numa.h"
+
+#include "json_spirit/json_spirit_reader.h"
+
+int get_device_by_path(const char *path, char* partition, char* device,
+		       size_t max)
+{
+  // path is opened with O_DIRECTORY; the device behind it is then queried
+  // through that fd, which the scope guard closes on every return path.
+  const int dirfd = ::open(path, O_RDONLY|O_DIRECTORY);
+  if (dirfd < 0) {
+    return -errno;
+  }
+  auto closer = make_scope_guard([dirfd] { ::close(dirfd); });
+  BlkDev bdev(dirfd);
+  // partition name first, then whole-disk name; the first non-zero result
+  // is returned as is and ends the lookup.
+  int rc = bdev.partition(partition, max);
+  if (rc == 0) {
+    rc = bdev.wholedisk(device, max);
+  }
+  return rc;
+}
+
+
+#include "common/blkdev.h"
+
+#ifdef __linux__
+#include <libudev.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <blkid/blkid.h>
+
+#include <set>
+
+#include "common/SubProcess.h"
+#include "common/errno.h"
+
+
+#define UUID_LEN 36
+
+#endif
+
+
+BlkDev::BlkDev(int f)
+  : fd(f)  // device is identified by a file descriptor; devname stays default
+{}
+
+BlkDev::BlkDev(const std::string& devname)
+  : devname(devname)  // device is identified by name; fd is not set here
+{}
+
+int BlkDev::get_devid(dev_t *id) const
+{
+  struct stat sb;
+  int rc;
+  if (fd < 0) {  // no fd: go through the device node /dev/<devname>
+    char devpath[PATH_MAX];
+    snprintf(devpath, sizeof(devpath), "/dev/%s", devname.c_str());
+    rc = stat(devpath, &sb);
+  } else {
+    rc = fstat(fd, &sb);
+  }
+  if (rc < 0)
+    return -errno;
+  // block special file: its own number (st_rdev); else the containing st_dev
+  *id = S_ISBLK(sb.st_mode) ? sb.st_rdev : sb.st_dev;
+  return 0;
+}
+
+#ifdef __linux__
+static const char *blkdev_props2strings[] = {  // indexed by blkdev_prop_t
+  [BLKDEV_PROP_DEV]                 = "dev",  // paths are relative to <sysfsdir>/block/<dev>/
+  [BLKDEV_PROP_DISCARD_GRANULARITY] = "queue/discard_granularity",
+  [BLKDEV_PROP_MODEL]               = "device/model",
+  [BLKDEV_PROP_ROTATIONAL]          = "queue/rotational",
+  [BLKDEV_PROP_SERIAL]              = "device/serial",
+  [BLKDEV_PROP_VENDOR]              = "device/device/vendor",
+  [BLKDEV_PROP_NUMA_NODE]           = "device/device/numa_node",
+  [BLKDEV_PROP_NUMA_CPUS]           = "device/device/local_cpulist",
+};
+
+const char *BlkDev::sysfsdir() const {
+  return "/sys";  // root under which get_string_property() builds block/<dev>/<prop>
+}
+
+int BlkDev::get_size(int64_t *psize) const
+{
+#ifdef BLKGETSIZE64
+  int ret = ::ioctl(fd, BLKGETSIZE64, psize);  // result is stored directly in *psize
+#elif defined(BLKGETSIZE)
+  unsigned long sectors = 0;
+  int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
+  *psize = sectors * 512ULL;  // sector count scaled by 512 (written even if the ioctl failed)
+#else
+// cppcheck-suppress preprocessorErrorDirective
+# error "Linux configuration error (get_size)"
+#endif
+  if (ret < 0)
+    ret = -errno;  // failures are returned as a negative errno
+  return ret;
+}
+
+/**
+ * read one sysfs attribute of this block device as a string
+ *
+ * the file read is <sysfsdir>/block/<disk>/<attribute path for prop>
+ * store its first line (newline dropped) in *val, at most maxlen - 2 chars
+ * return 0 on success, negative error on error
+ */
+int64_t BlkDev::get_string_property(blkdev_prop_t prop,
+				    char *val, size_t maxlen) const
+{
+  assert(prop < BLKDEV_PROP_NUMPROPS);
+  const char *attr = blkdev_props2strings[prop];
+
+  // sysfs isn't fully populated for partitions, so an fd-backed device is
+  // resolved to its underlying whole disk; a named device is used verbatim.
+  char disk[PATH_MAX];
+  const char *name = devname.c_str();
+  if (fd >= 0) {
+    if (int r = wholedisk(disk, sizeof(disk)); r < 0)
+      return r;
+    name = disk;
+  }
+
+  char sysfs_path[PATH_MAX];
+  const int need = snprintf(sysfs_path, sizeof(sysfs_path), "%s/block/%s/%s",
+			    sysfsdir(), name, attr);
+  if (need >= static_cast<int>(sizeof(sysfs_path))) {
+    return -ERANGE;  // the path would have been truncated
+  }
+
+  FILE *in = fopen(sysfs_path, "r");
+  if (in == NULL) {
+    return -errno;
+  }
+  if (!fgets(val, maxlen - 1, in)) {
+    fclose(in);
+    return -EINVAL;
+  }
+  fclose(in);
+
+  // truncate at newline
+  char *eol = val;
+  while (*eol && *eol != '\n')
+    ++eol;
+  *eol = 0;
+  return 0;
+}
+
+/**
+ * get a block device property as an integer
+ *
+ * return the leading decimal digits as a value (we assume it is positive)
+ * return negative error on error
+ */
+int64_t BlkDev::get_int_property(blkdev_prop_t prop) const
+{
+  char buff[256] = {0};
+  int64_t r = get_string_property(prop, buff, sizeof(buff));  // 64-bit: strtoll() result is not truncated
+  if (r < 0)
+    return r;
+  // take only digits (isdigit() needs an unsigned char value)
+  for (char *p = buff; *p; ++p) {
+    if (!isdigit(static_cast<unsigned char>(*p))) {
+      *p = 0;
+      break;
+    }
+  }
+  char *endptr = 0;
+  r = strtoll(buff, &endptr, 10);
+  if (endptr != buff + strlen(buff))
+    r = -EINVAL;
+  return r;
+}
+
+bool BlkDev::support_discard() const
+{
+  return get_int_property(BLKDEV_PROP_DISCARD_GRANULARITY) > 0;  // errors are negative => "no"
+}
+
+int BlkDev::discard(int64_t offset, int64_t len) const
+{
+  uint64_t range[2] = {(uint64_t)offset, (uint64_t)len};  // {start, length} as BLKDISCARD expects
+  return ::ioctl(fd, BLKDISCARD, range) < 0 ? -errno : 0;  // -errno like get_size(), not a bare -1
+}
+
+bool BlkDev::is_nvme() const
+{
+  // nvme has a device/device/vendor property; infer from that.  There is
+  // probably a better way?  The value read is discarded: only whether the
+  // attribute could be read matters.
+  char scratch[80];
+  return get_string_property(BLKDEV_PROP_VENDOR, scratch, 80) == 0;
+}
+
+bool BlkDev::is_rotational() const
+{
+  return get_int_property(BLKDEV_PROP_ROTATIONAL) > 0;  // errors are negative => "not rotational"
+}
+
+int BlkDev::get_numa_node(int *node) const
+{
+  // a negative property value means failure; *node is written only on success
+  const int numa = get_int_property(BLKDEV_PROP_NUMA_NODE);
+  if (numa >= 0)
+    *node = numa;
+  return numa >= 0 ? 0 : -1;
+}
+
+int BlkDev::dev(char *dev, size_t max) const
+{
+  return get_string_property(BLKDEV_PROP_DEV, dev, max);  // sysfs attribute "dev"
+}
+
+int BlkDev::vendor(char *vendor, size_t max) const
+{
+  return get_string_property(BLKDEV_PROP_VENDOR, vendor, max);  // sysfs "device/device/vendor"
+}
+
+int BlkDev::model(char *model, size_t max) const
+{
+  return get_string_property(BLKDEV_PROP_MODEL, model, max);  // sysfs "device/model"
+}
+
+int BlkDev::serial(char *serial, size_t max) const
+{
+  return get_string_property(BLKDEV_PROP_SERIAL, serial, max);  // sysfs "device/serial"
+}
+
+int BlkDev::partition(char *partition, size_t max) const
+{
+  dev_t id;
+  int r = get_devid(&id);
+  if (r < 0)
+    return -EINVAL;  // hrm.
+
+  char *t = blkid_devno_to_devname(id);  // malloc'ed by libblkid, freed below
+  if (!t)
+    return -EINVAL;
+  // snprintf always NUL-terminates; strncpy did not when t filled the buffer
+  snprintf(partition, max, "%s", t);
+  free(t);
+  return 0;
+}
+
+int BlkDev::wholedisk(char *device, size_t max) const
+{
+  // Any failure, in the devid lookup or in libblkid, is reported as -EINVAL.
+  dev_t devno;
+  if (get_devid(&devno) < 0)
+    return -EINVAL;  // hrm.
+
+  // libblkid writes the whole-disk name for devno into device (capacity max)
+  const int rc = blkid_devno_to_wholedisk(devno, device, max, nullptr);
+  if (rc < 0)
+    return -EINVAL;
+  return 0;
+}
+
+static int easy_readdir(const std::string& dir, std::set<std::string> *out)
+{
+  // Collect every entry name of dir except "." and ".." into *out.
+  DIR *handle = ::opendir(dir.c_str());
+  if (handle == nullptr) {
+    return -errno;
+  }
+  for (struct dirent *ent = ::readdir(handle); ent != nullptr;
+       ent = ::readdir(handle)) {
+    const std::string name = ent->d_name;
+    if (name != "." && name != "..") {
+      out->insert(name);
+    }
+  }
+  closedir(handle);
+  return 0;
+}
+
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{
+  // The direct parents are the entries of /sys/block/<dev>/slaves; a listing
+  // failure simply yields no parents.
+  std::set<std::string> slaves;
+  easy_readdir("/sys/block/" + dev + "/slaves", &slaves);
+  for (const auto& parent : slaves) {
+    ls->insert(parent);
+    if (parent.compare(0, 3, "dm-") == 0) {
+      get_dm_parents(parent, ls);  // dm-on-dm: walk up another level
+    }
+  }
+}
+
+void get_raw_devices(const std::string& in,
+		     std::set<std::string> *ls)
+{
+  if (in.compare(0, 3, "dm-") != 0) {
+    // plain device: report its whole disk, or the name itself if that fails
+    BlkDev blk(in);
+    std::string disk;
+    const bool resolved = (blk.wholedisk(&disk) == 0);
+    ls->insert(resolved ? disk : in);
+    return;
+  }
+
+  // device-mapper device: expand to its parents and resolve each of them
+  std::set<std::string> parents;
+  get_dm_parents(in, &parents);
+  for (const auto& parent : parents) {
+    get_raw_devices(parent, ls);
+  }
+}
+
+int _get_vdo_stats_handle(const char *devname, std::string *vdo_name)
+{
+  // we need to go from the raw devname (e.g., dm-4) to the VDO volume name.
+  // currently the best way seems to be to look at /dev/mapper/* ...
+  DIR *mapper = ::opendir("/dev/mapper");
+  if (!mapper) {
+    return -1;
+  }
+
+  const std::string wanted = std::string("../") + devname;  // expected symlink target
+  int stats_fd = -1;
+  struct dirent *ent = nullptr;
+  while (stats_fd < 0 && (ent = ::readdir(mapper)) != nullptr) {
+    if (ent->d_name[0] == '.')
+      continue;
+    char fn[4096], target[4096];
+    snprintf(fn, sizeof(fn), "/dev/mapper/%s", ent->d_name);
+    const int len = readlink(fn, target, sizeof(target));
+    if (len < 0 || len >= (int)sizeof(target))
+      continue;
+    target[len] = 0;
+    if (wanted != target)
+      continue;
+    // the mapper entry name is also the directory name under /sys/kvdo
+    snprintf(fn, sizeof(fn), "/sys/kvdo/%s/statistics", ent->d_name);
+    stats_fd = ::open(fn, O_RDONLY|O_CLOEXEC); //DIRECTORY);
+    if (stats_fd >= 0)
+      *vdo_name = ent->d_name;
+  }
+
+  closedir(mapper);
+  return stats_fd;
+}
+
+int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
+{
+  // work list of device names still to be probed, seeded with devname
+  std::set<std::string> pending = { devname };
+  while (!pending.empty()) {
+    const std::string name = *pending.begin();
+    pending.erase(pending.begin());
+    const int fd = _get_vdo_stats_handle(name.c_str(), vdo_name);
+    if (fd >= 0) {
+      return fd;  // yay, it's vdo
+    }
+    // not vdo itself: queue the devices a dm device is built from
+    if (name.compare(0, 3, "dm-") == 0) {
+      get_dm_parents(name, &pending);
+    }
+  }
+  return -1;
+}
+
+int64_t get_vdo_stat(int vdo_fd, const char *property)
+{
+  // an open or read failure is reported as 0, same as a stat whose value is 0
+  const int fd = ::openat(vdo_fd, property, O_RDONLY|O_CLOEXEC);
+  if (fd < 0)
+    return 0;
+  char text[1024];
+  const int got = ::read(fd, text, sizeof(text) - 1);
+  int64_t value = 0;
+  if (got > 0) {
+    text[got] = 0;
+    value = atoll(text);
+  }
+  TEMP_FAILURE_RETRY(::close(fd));
+  return value;
+}
+
+bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
+{
+  const int64_t block_size = get_vdo_stat(fd, "block_size");
+  const int64_t physical = get_vdo_stat(fd, "physical_blocks");
+  const int64_t overhead = get_vdo_stat(fd, "overhead_blocks_used");
+  const int64_t data = get_vdo_stat(fd, "data_blocks_used");
+  // a zero in any of the four values makes the whole query fail and leaves
+  // the outputs untouched
+  const bool complete = block_size && physical && overhead && data;
+  if (!complete) {
+    return false;
+  }
+  // available = physical blocks not consumed by overhead or by data
+  const int64_t free_blocks = physical - overhead - data;
+  *total = block_size * physical;
+  *avail = block_size * free_blocks;
+  return true;
+}
+
+std::string _decode_model_enc(const std::string& in)
+{
+  std::string out = boost::replace_all_copy(in, "\\x20", " ");  // "\x20" encodes a space
+  const auto last = out.find_last_not_of(" ");
+  if (last != out.npos)
+    out.erase(last + 1);  // drop trailing spaces
+  std::replace(out.begin(), out.end(), ' ', '_');  // remaining spaces become '_'
+  return out;
+}
+
+// Build a $vendor_$model_$serial style id for devname; returns "" on failure
+// (reason in *err if given).  udev first, then /sys/block/$devname/device/*.
+std::string get_device_id(const std::string& devname,
+			  std::string *err)
+{
+  struct udev_device *dev;
+  struct udev *udev;  // per call: a function-static here was shared by all callers
+  const char *data;
+
+  udev = udev_new();
+  if (!udev) {
+    if (err) {
+      *err = "udev_new failed";
+    }
+    return {};
+  }
+  dev = udev_device_new_from_subsystem_sysname(udev, "block", devname.c_str());
+  if (!dev) {
+    if (err) {
+      *err = std::string("udev_device_new_from_subsystem_sysname failed on '")
+	+ devname + "'";
+    }
+    udev_unref(udev);
+    return {};
+  }
+
+  // ****
+  //   NOTE: please keep this implementation in sync with _get_device_id() in
+  //   src/ceph-volume/ceph_volume/util/device.py
+  // ****
+
+  std::string id_vendor, id_model, id_serial, id_serial_short, id_scsi_serial;
+  data = udev_device_get_property_value(dev, "ID_VENDOR");
+  if (data) {
+    id_vendor = data;
+  }
+  data = udev_device_get_property_value(dev, "ID_MODEL");
+  if (data) {
+    id_model = data;
+    // sometimes, ID_MODEL is "LVM ..." but ID_MODEL_ENC is correct (but
+    // encoded with \x20 for space).
+    if (id_model.substr(0, 7) == "LVM PV ") {
+      const char *enc = udev_device_get_property_value(dev, "ID_MODEL_ENC");
+      if (enc) {
+	id_model = _decode_model_enc(enc);
+      } else {
+	// ignore ID_MODEL then
+	id_model.clear();
+      }
+    }
+  }
+  data = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
+  if (data) {
+    id_serial_short = data;
+  }
+  data = udev_device_get_property_value(dev, "ID_SCSI_SERIAL");
+  if (data) {
+    id_scsi_serial = data;
+  }
+  data = udev_device_get_property_value(dev, "ID_SERIAL");
+  if (data) {
+    id_serial = data;
+  }
+  udev_device_unref(dev);
+  udev_unref(udev);
+
+  // ID_SERIAL is usually $vendor_$model_$serial, but not always
+  // ID_SERIAL_SHORT is mostly always just the serial
+  // ID_MODEL is sometimes $vendor_$model, but
+  // ID_VENDOR is sometimes $vendor and ID_MODEL just $model and ID_SCSI_SERIAL the real serial number, with ID_SERIAL and ID_SERIAL_SHORT gibberish (ick)
+  std::string device_id;
+  if (id_vendor.size() && id_model.size() && id_scsi_serial.size()) {
+    device_id = id_vendor + '_' + id_model + '_' + id_scsi_serial;
+  } else if (id_model.size() && id_serial_short.size()) {
+    device_id = id_model + '_' + id_serial_short;
+  } else if (id_serial.size()) {
+    device_id = id_serial;
+    if (device_id.substr(0, 4) == "MTFD") {
+      // Micron NVMes hide the vendor
+      device_id = "Micron_" + device_id;
+    }
+  }
+  if (device_id.size()) {
+    std::replace(device_id.begin(), device_id.end(), ' ', '_');
+    return device_id;
+  }
+
+  // either udev_device_get_property_value() failed, or succeeded but
+  // returned nothing; trying to read from files.  note that the 'vendor'
+  // file rarely contains the actual vendor; it's usually 'ATA'.
+  std::string model, serial;
+  char buf[1024] = {0};
+  BlkDev blkdev(devname);
+  if (!blkdev.model(buf, sizeof(buf))) {
+    model = buf;
+  }
+  if (!blkdev.serial(buf, sizeof(buf))) {
+    serial = buf;
+  }
+  if (model.empty() || serial.empty()) {  // never return a partial id, even when err is null
+    if (!err)
+      return {};
+    if (model.empty() && serial.empty()) {
+      *err = std::string("fallback method has no model nor serial'");
+    } else if (model.empty()) {
+      *err = std::string("fallback method has serial '") + serial
+        + "' but no model'";
+    } else {
+      *err = std::string("fallback method has model '") + model
+        + "' but no serial'";
+    }
+    return {};
+  }
+
+  device_id = model + "_" + serial;
+  std::replace(device_id.begin(), device_id.end(), ' ', '_');
+  return device_id;
+}
+
+static std::string get_device_vendor(const std::string& devname)
+{
+  struct udev_device *dev;
+  struct udev *udev;  // per call: a function-static here was shared by all callers
+  const char *data;
+  // order tried: udev ID_VENDOR, first word of udev ID_MODEL (both lowercased),
+  // then the sysfs vendor, then the first word of the sysfs model; else "".
+  udev = udev_new();
+  if (!udev) {
+    return {};
+  }
+  dev = udev_device_new_from_subsystem_sysname(udev, "block", devname.c_str());
+  if (!dev) {
+    udev_unref(udev);
+    return {};
+  }
+  std::string id_vendor, id_model;
+  data = udev_device_get_property_value(dev, "ID_VENDOR");
+  if (data) {
+    id_vendor = data;
+  }
+  data = udev_device_get_property_value(dev, "ID_MODEL");
+  if (data) {
+    id_model = data;
+  }
+  udev_device_unref(dev);
+  udev_unref(udev);
+
+  std::transform(id_vendor.begin(), id_vendor.end(), id_vendor.begin(),
+		 ::tolower);
+  std::transform(id_model.begin(), id_model.end(), id_model.begin(),
+		 ::tolower);
+
+  if (id_vendor.size()) {
+    return id_vendor;
+  }
+  if (id_model.size()) {
+    const auto pos = id_model.find(" ");  // size_t: npos is not squeezed into an int
+    if (pos != std::string::npos && pos > 0) {
+      return id_model.substr(0, pos);
+    } else {
+      return id_model;
+    }
+  }
+
+  std::string vendor, model;
+  char buf[1024] = {0};
+  BlkDev blkdev(devname);
+  if (!blkdev.vendor(buf, sizeof(buf))) {
+    vendor = buf;
+  }
+  if (!blkdev.model(buf, sizeof(buf))) {
+    model = buf;
+  }
+  if (vendor.size()) {
+    return vendor;
+  }
+  if (model.size()) {
+    const auto pos = model.find(" ");
+    if (pos != std::string::npos && pos > 0) {
+      return model.substr(0, pos);
+    } else {
+      return model;
+    }
+  }
+
+  return {};
+}
+
+static int block_device_run_vendor_nvme(
+  const string& devname, const string& vendor, int timeout,
+  std::string *result)
+{
+  string device = "/dev/" + devname;
+  // runs "sudo nvme <vendor> smart-log-add --json /dev/<devname>", stdout piped
+  SubProcessTimed nvmecli(
+    "sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE,
+    timeout);
+  nvmecli.add_cmd_args(
+    "nvme",
+    vendor.c_str(),
+    "smart-log-add",
+    "--json",
+    device.c_str(),
+    NULL);
+  int ret = nvmecli.spawn();
+  if (ret != 0) {
+    *result = std::string("error spawning nvme command: ") + nvmecli.err();
+    return ret;
+  }
+
+  bufferlist output;
+  ret = output.read_fd(nvmecli.get_stdout(), 100*1024);
+  if (ret < 0) {
+    // stderr is not piped (SubProcess::CLOSE above), so it cannot be read
+    // back here; report the read error itself instead.
+    *result = std::string("failed to execute nvme: ") + cpp_strerror(-ret);
+  } else {
+    ret = 0;
+    *result = output.to_str();
+  }
+
+  if (nvmecli.join() != 0) {
+    *result = std::string("nvme returned an error: ") + nvmecli.err();
+    return -EINVAL;
+  }
+
+  return ret;
+}
+
+std::string get_device_path(const std::string& devname,
+			    std::string *err)
+{
+  // err may be null (it defaults to 0 in blkdev.h), so every write is guarded
+  std::set<std::string> links;
+  int r = easy_readdir("/dev/disk/by-path", &links);
+  if (r < 0) {
+    if (err)
+      *err = "unable to list contents of /dev/disk/by-path: "s + cpp_strerror(r);
+    return {};
+  }
+  for (auto& i : links) {
+    char fn[PATH_MAX], target[PATH_MAX+1];
+    snprintf(fn, sizeof(fn), "/dev/disk/by-path/%s", i.c_str());
+    int r = readlink(fn, target, sizeof(target));
+    if (r < 0 || r >= (int)sizeof(target))
+      continue;
+    target[r] = 0;
+    if ((unsigned)r > devname.size() + 1 &&  // match a target ending in "/<devname>"
+	strncmp(target + r - devname.size(), devname.c_str(), r) == 0 &&
+	target[r - devname.size() - 1] == '/') {
+      return fn;
+    }
+  }
+  if (err) *err = "no symlink to "s + devname + " in /dev/disk/by-path";
+  return {};
+}
+
+static int block_device_run_smartctl(const string& devname, int timeout,
+				     std::string *result)
+{
+  const string dev_path = "/dev/" + devname;
+
+  // Runs "sudo smartctl -a --json=o <dev>" with only stdout piped back.
+  // when using --json, smartctl will report its errors in JSON format to stdout
+  SubProcessTimed proc(
+    "sudo", SubProcess::CLOSE, SubProcess::PIPE, SubProcess::CLOSE,
+    timeout);
+  proc.add_cmd_args(
+    "smartctl", "-a", /* "-x", */ "--json=o",
+    dev_path.c_str(), NULL);
+  if (int rc = proc.spawn(); rc != 0) {
+    *result = std::string("error spawning smartctl: ") + proc.err();
+    return rc;
+  }
+
+  // A failed read of stdout is remembered in `status`, but the child is
+  // still joined below.
+  bufferlist captured;
+  int status = captured.read_fd(proc.get_stdout(), 100*1024);
+  if (status >= 0) {
+    status = 0;
+    *result = captured.to_str();
+  } else {
+    *result = std::string("failed read smartctl output: ") + cpp_strerror(-status);
+  }
+
+  // Meaning of the bits in smartctl's exit status:
+  // Bit 0: Command line did not parse.
+  // Bit 1: Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode (see '-n' option above).
+  // Bit 2: Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure (see '-b' option above).
+  // Bit 3: SMART status check returned "DISK FAILING".
+  // Bit 4: We found prefail Attributes <= threshold.
+  // Bit 5: SMART status check returned "DISK OK" but we found that some (usage or prefail) Attributes have been <= threshold at some time in the past.
+  // Bit 6: The device error log contains records of errors.
+  // Bit 7: The device self-test log contains records of errors.  [ATA only] Failed self-tests outdated by a newer successful extended self-test are ignored.
+  // Only bits 0 and 1 are treated as a failure here.
+  const int exit_bits = proc.join();
+  if ((exit_bits & 3) != 0) {
+    *result = "smartctl returned an error ("s + stringify(exit_bits) +
+      "): stderr:\n"s + proc.err() + "\nstdout:\n"s + *result;
+    return -EINVAL;
+  }
+
+  return status;
+}
+
+static std::string escape_quotes(const std::string& s)
+{
+  // backslash-escape '"' and '\' so s can be embedded in a JSON string literal
+  std::string r;
+  for (const char c : s) {
+    if (c == '"' || c == '\\') r += '\\';
+    r += c;
+  }
+  return r;
+}
+
+int block_device_get_metrics(const string& devname, int timeout,
+			     json_spirit::mValue *result)
+{
+  // Step 1: smartctl.  On failure, or when its output is not JSON, `doc` is
+  // replaced by a small JSON error object that carries the raw output.
+  std::string doc;
+  const int smart_rc = block_device_run_smartctl(devname, timeout, &doc);
+  if (smart_rc != 0) {
+    const string raw = doc;
+    doc = "{\"error\": \"smartctl failed\", \"dev\": \"/dev/";
+    doc += devname;
+    doc += "\", \"smartctl_error_code\": " + stringify(smart_rc);
+    doc += ", \"smartctl_output\": \"" + escape_quotes(raw);
+    doc += "\"}";
+  } else if (!json_spirit::read(doc, *result)) {
+    const string raw = doc;
+    doc = "{\"error\": \"smartctl returned invalid JSON\", \"dev\": \"/dev/";
+    doc += devname;
+    doc += "\",\"output\":\"";
+    doc += escape_quotes(raw);
+    doc += "\"}";
+  }
+  // Whatever `doc` holds now must parse, or there is nothing to report.
+  if (!json_spirit::read(doc, *result)) {
+    return -EINVAL;
+  }
+
+  // Step 2: vendor-specific nvme log, merged into the same top-level object.
+  json_spirit::mObject& root = result->get_obj();
+  const string vendor = get_device_vendor(devname);
+  if (vendor.empty()) {
+    root["nvme_vendor"] = "unknown";
+    return 0;
+  }
+  root["nvme_vendor"] = vendor;
+  doc.clear();
+  json_spirit::mValue nvme_json;
+  const int nvme_rc = block_device_run_vendor_nvme(devname, vendor, timeout, &doc);
+  if (nvme_rc != 0) {
+    root["nvme_smart_health_information_add_log_error_code"] = nvme_rc;
+    root["nvme_smart_health_information_add_log_error"] = doc;
+  } else if (json_spirit::read(doc, nvme_json) != 0) {
+    root["nvme_smart_health_information_add_log"] = nvme_json;
+  } else {
+    root["nvme_smart_health_information_add_log_error"] = "bad json output: "
+      + doc;
+  }
+
+  return 0;
+}
+
+#elif defined(__APPLE__)
+#include <sys/disk.h>
+
+const char *BlkDev::sysfsdir() const {
+  assert(false);  // Should never be called on Apple
+  return "";  // only reached when assert() is compiled out
+}
+
+int BlkDev::dev(char *dev, size_t max) const
+{
+  // format the fd's st_rdev as a decimal number into dev (capacity max)
+  struct stat st;
+  const int rc = fstat(fd, &st);
+  if (rc < 0)
+    return -errno;
+  const uint64_t rdev = (uint64_t)st.st_rdev;
+  snprintf(dev, max, "%" PRIu64, rdev);
+  return 0;
+}
+
+int BlkDev::get_size(int64_t *psize) const
+{
+  // size = block count * block size, each obtained with its own ioctl;
+  // *psize is written only when both ioctls return 0
+  unsigned long blocksize = 0;
+  int ret = ::ioctl(fd, DKIOCGETBLOCKSIZE, &blocksize);
+  if (ret == 0) {
+    unsigned long nblocks;
+    ret = ::ioctl(fd, DKIOCGETBLOCKCOUNT, &nblocks);
+    if (ret == 0)
+      *psize = (int64_t)nblocks * blocksize;
+  }
+  return ret < 0 ? -errno : ret;
+}
+
+int64_t BlkDev::get_int_property(blkdev_prop_t prop) const
+{
+  return 0;  // stub: every property reads as 0, prop is ignored
+}
+
+bool BlkDev::support_discard() const
+{
+  return false;  // stub: discard is never reported as supported
+}
+
+int BlkDev::discard(int64_t offset, int64_t len) const
+{
+  return -EOPNOTSUPP;  // stub: offset and len are ignored
+}
+
+bool BlkDev::is_nvme() const
+{
+  return false;  // stub: no detection is attempted
+}
+
+bool BlkDev::is_rotational() const
+{
+  return false;  // stub: no detection is attempted
+}
+
+int BlkDev::get_numa_node(int *node) const
+{
+  return -1;  // stub: always fails, *node is left untouched
+}
+
+int BlkDev::model(char *model, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub: the model buffer is left untouched
+}
+
+int BlkDev::serial(char *serial, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub: the serial buffer is left untouched
+}
+
+int BlkDev::partition(char *partition, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub: the partition buffer is left untouched
+}
+
+int BlkDev::wholedisk(char *device, size_t max) const {
+  return -EOPNOTSUPP;  // was an empty body: flowing off a non-void function is undefined
+}
+
+
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{  // stub: nothing is added to *ls
+}
+
+void get_raw_devices(const std::string& in,
+		     std::set<std::string> *ls)
+{  // stub: nothing is added to *ls
+}
+
+int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
+{
+  return -1;  // stub: no handle, *vdo_name is left untouched
+}
+
+int64_t get_vdo_stat(int fd, const char *property)
+{
+  return 0;  // stub: every stat reads as 0
+}
+
+bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
+{
+  return false;  // stub: *total and *avail are left untouched
+}
+
+std::string get_device_id(const std::string& devname,
+			  std::string *err)
+{
+  // FIXME: implement me
+  // Until then every lookup fails: empty id, reason stored in *err if given.
+  if (err != nullptr)
+    *err = "not implemented";
+  return {};
+}
+
+std::string get_device_path(const std::string& devname,
+			    std::string *err)
+{
+  // FIXME: implement me
+  // Until then every lookup fails: empty path, reason stored in *err if given.
+  if (err != nullptr)
+    *err = "not implemented";
+  return {};
+}
+
+#elif defined(__FreeBSD__)
+
+const char *BlkDev::sysfsdir() const {
+  assert(false);  // Should never be called on FreeBSD
+  return "";  // only reached when assert() is compiled out
+}
+
+int BlkDev::dev(char *dev, size_t max) const
+{
+  // format the fd's st_rdev as a decimal number into dev (capacity max)
+  struct stat st;
+  const int rc = fstat(fd, &st);
+  if (rc < 0)
+    return -errno;
+  const uint64_t rdev = (uint64_t)st.st_rdev;
+  snprintf(dev, max, "%" PRIu64, rdev);
+  return 0;
+}
+
+int BlkDev::get_size(int64_t *psize) const
+{
+  // the DIOCGMEDIASIZE ioctl is handed psize as its output argument
+  const int rc = ::ioctl(fd, DIOCGMEDIASIZE, psize);
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+int64_t BlkDev::get_int_property(blkdev_prop_t prop) const
+{
+  return 0;  // stub: every property reads as 0, prop is ignored
+}
+
+bool BlkDev::support_discard() const
+{
+#ifdef FREEBSD_WITH_TRIM
+  // Only claim discard support when the device itself reports it through
+  // the GEOM::candelete attribute.
+  struct diocgattr_arg arg;
+
+  strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
+  arg.len = sizeof(arg.value.i);
+  if (ioctl(fd, DIOCGATTR, &arg) == 0) {
+    return (arg.value.i != 0);  // non-zero attribute value => supported
+  } else {
+    return false;  // attribute could not be queried
+  }
+#endif
+  return false;  // the only statement left when FREEBSD_WITH_TRIM is not defined
+}
+
+int BlkDev::discard(int64_t offset, int64_t len) const
+{
+  return -EOPNOTSUPP;  // stub: offset and len are ignored
+}
+
+bool BlkDev::is_nvme() const
+{
+  // FreeBSD doesn't have a good way to tell if a device's underlying protocol
+  // is NVME, especially since multiple GEOM transforms may be involved.  So
+  // we'll just guess based on the device name.
+  char name[PATH_MAX];
+
+  struct fiodgname_arg query;
+  query.buf = name;
+  query.len = sizeof(name);
+  if (ioctl(fd, FIODGNAME, &query) < 0)
+    return false; //When in doubt, it's probably not NVME
+
+  // a name starting with either attachment prefix counts as NVME
+  const bool is_nvd = strncmp("nvd", name, strlen("nvd")) == 0;  //CAM-less attachment
+  const bool is_nda = strncmp("nda", name, strlen("nda")) == 0;  //CAM-based attachment
+  return is_nvd || is_nda;
+}
+
+bool BlkDev::is_rotational() const
+{
+#if __FreeBSD_version >= 1200049
+  struct diocgattr_arg arg;
+  // query the GEOM::rotation_rate attribute of the device behind fd
+  strlcpy(arg.name, "GEOM::rotation_rate", sizeof(arg.name));
+  arg.len = sizeof(arg.value.u16);
+
+  int ioctl_ret = ioctl(fd, DIOCGATTR, &arg);
+  bool ret;
+  if (ioctl_ret < 0 || arg.value.u16 == DISK_RR_UNKNOWN)
+    // DISK_RR_UNKNOWN usually indicates an old drive, which is usually spinny
+    ret = true;
+  else if (arg.value.u16 == DISK_RR_NON_ROTATING)
+    ret = false;    // the only answer that means "not rotational"
+  else if (arg.value.u16 >= DISK_RR_MIN && arg.value.u16 <= DISK_RR_MAX)
+    ret = true;
+  else
+    ret = true;     // Invalid value.  Probably spinny?
+
+  return ret;
+#else
+  return true;      // When in doubt, it's probably spinny
+#endif
+}
+
+int BlkDev::get_numa_node(int *node) const
+{
+  // a negative property value means failure; *node is written only on success
+  const int numa = get_int_property(BLKDEV_PROP_NUMA_NODE);
+  if (numa >= 0)
+    *node = numa;
+  return numa >= 0 ? 0 : -1;
+}
+
+int BlkDev::model(char *model, size_t max) const
+{
+  struct diocgattr_arg arg;
+
+  strlcpy(arg.name, "GEOM::descr", sizeof(arg.name));
+  arg.len = sizeof(arg.value.str);
+  if (ioctl(fd, DIOCGATTR, &arg) < 0) {
+    return -errno;
+  }
+
+  // The GEOM description is of the form "vendor product" for SCSI disks
+  // and "ATA device_model" for ATA disks.  Some vendors choose to put the
+  // vendor name in device_model, and some don't.  Strip the first bit.
+  char *p = arg.value.str;
+  if (p == NULL || *p == '\0') {
+    *model = '\0';
+  } else {
+    (void) strsep(&p, " ");  // p becomes NULL when the description has no space
+    snprintf(model, max, "%s", p ? p : arg.value.str);  // then keep the whole description
+  }
+
+  return 0;
+}
+
+int BlkDev::serial(char *serial, size_t max) const
+{
+  // the ident string filled in by the DIOCGIDENT ioctl is used as the serial
+  char ident[DISK_IDENT_SIZE];
+  const int rc = ioctl(fd, DIOCGIDENT, ident);
+  if (rc < 0) {
+    return -errno;
+  }
+  snprintf(serial, max, "%s", ident);
+  return 0;
+}
+
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{  // stub: nothing is added to *ls
+}
+
+void get_raw_devices(const std::string& in,
+		     std::set<std::string> *ls)
+{  // stub: nothing is added to *ls
+}
+
+int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
+{
+  return -1;  // stub: no handle, *vdo_name is left untouched
+}
+
+int64_t get_vdo_stat(int fd, const char *property)
+{
+  return 0;  // stub: every stat reads as 0
+}
+
+bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
+{
+  return false;  // stub: *total and *avail are left untouched
+}
+
+std::string get_device_id(const std::string& devname,
+			  std::string *err)
+{
+  // FIXME: implement me for freebsd
+  // Until then every lookup fails: empty id, reason stored in *err if given.
+  if (err != nullptr)
+    *err = "not implemented for FreeBSD";
+  return {};
+}
+
+std::string get_device_path(const std::string& devname,
+			    std::string *err)
+{
+  // FIXME: implement me for freebsd
+  // Until then every lookup fails: empty path, reason stored in *err if given.
+  if (err != nullptr)
+    *err = "not implemented for FreeBSD";
+  return {};
+}
+
+int block_device_run_smartctl(const char *device, int timeout,
+			      std::string *result)
+{
+  // FIXME: implement me for freebsd
+  return -EOPNOTSUPP;  // stub: nothing is run, *result is left untouched
+}
+
+int block_device_get_metrics(const string& devname, int timeout,
+                             json_spirit::mValue *result)
+{
+  // FIXME: implement me for freebsd
+  return -EOPNOTSUPP;  // stub: *result is left untouched
+}
+
+int block_device_run_nvme(const char *device, const char *vendor, int timeout,
+             std::string *result)
+{
+  return -EOPNOTSUPP;  // stub: nothing is run, *result is left untouched
+}
+
+static int block_device_devname(int fd, char *devname, size_t max)
+{
+  // FIODGNAME is given devname as its output buffer and max as its capacity
+  struct fiodgname_arg request;
+  request.buf = devname;
+  request.len = max;
+
+  const int rc = ioctl(fd, FIODGNAME, &request);
+  return rc < 0 ? -errno : 0;
+}
+
+int BlkDev::partition(char *partition, size_t max) const
+{
+  // the partition is reported as a device node path: "/dev/" + name of fd
+  char name[PATH_MAX];
+  if (block_device_devname(fd, name, sizeof(name)) < 0)
+    return -errno;
+  snprintf(partition, max, "/dev/%s", name);
+  return 0;
+}
+
+int BlkDev::wholedisk(char *wd, size_t max) const
+{
+  char devname[PATH_MAX];
+
+  if (block_device_devname(fd, devname, sizeof(devname)) < 0)
+    return -errno;
+
+  size_t first_digit = strcspn(devname, "0123456789");
+  // first_digit now indexes the first digit or null character of devname
+  size_t next_nondigit = strspn(&devname[first_digit], "0123456789");
+  next_nondigit += first_digit;
+  // next_nondigit now indexes the first alphabetic or null character after the
+  // unit number; copy that prefix, but never write more than max bytes to wd
+  strlcpy(wd, devname, next_nondigit + 1 < max ? next_nondigit + 1 : max);
+  return 0;
+}
+
+#else
+
+const char *BlkDev::sysfsdir() const {
+  assert(false);  // Should never be called on non-Linux
+  return "";  // only reached when assert() is compiled out
+}
+
+int BlkDev::dev(char *dev, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub: the dev buffer is left untouched
+}
+
+int BlkDev::get_size(int64_t *psize) const
+{
+  return -EOPNOTSUPP;  // stub: *psize is left untouched
+}
+
+bool BlkDev::support_discard() const
+{
+  return false;  // stub: discard is never reported as supported
+}
+
+int BlkDev::discard(int64_t offset, int64_t len) const
+{  // signature now matches the other platform variants: no fd parameter
+  return -EOPNOTSUPP;  // stub: offset and len are ignored
+}
+
+bool BlkDev::is_nvme() const
+{  // signature now matches the other platform variants: no devname parameter
+  return false;  // stub: no detection is attempted
+}
+
+bool BlkDev::is_rotational() const
+{  // signature now matches the other platform variants: no devname parameter
+  return false;  // stub: no detection is attempted
+}
+
+int BlkDev::model(char *model, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub: the model buffer is left untouched
+}
+
+int BlkDev::serial(char *serial, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub: the serial buffer is left untouched
+}
+
+int BlkDev::partition(char *partition, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub: the partition buffer is left untouched
+}
+
+int BlkDev::wholedisk(char *wd, size_t max) const
+{
+  return -EOPNOTSUPP;  // stub: the wd buffer is left untouched
+}
+
+void get_dm_parents(const std::string& dev, std::set<std::string> *ls)
+{  // stub: nothing is added to *ls
+}
+
+void get_raw_devices(const std::string& in,
+		     std::set<std::string> *ls)
+{  // stub: nothing is added to *ls
+}
+
+int get_vdo_stats_handle(const char *devname, std::string *vdo_name)
+{
+  return -1;  // stub: no handle, *vdo_name is left untouched
+}
+
+int64_t get_vdo_stat(int fd, const char *property)
+{
+  return 0;  // stub: every stat reads as 0
+}
+
+bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail)
+{
+  return false;  // stub: *total and *avail are left untouched
+}
+
+std::string get_device_id(const std::string& devname,
+			  std::string *err)
+{
+  // not implemented
+  // Every lookup fails: empty id, reason stored in *err if given.
+  if (err != nullptr)
+    *err = "not implemented";
+  return {};
+}
+
+std::string get_device_path(const std::string& devname,
+			  std::string *err)
+{
+  // not implemented
+  // Every lookup fails: empty path, reason stored in *err if given.
+  if (err != nullptr)
+    *err = "not implemented";
+  return {};
+}
+
+int block_device_run_smartctl(const char *device, int timeout,
+			      std::string *result)
+{
+  return -EOPNOTSUPP;  // stub: nothing is run, *result is left untouched
+}
+
+int block_device_get_metrics(const string& devname, int timeout,
+                             json_spirit::mValue *result)
+{
+  return -EOPNOTSUPP;  // stub: *result is left untouched
+}
+
+int block_device_run_nvme(const char *device, const char *vendor, int timeout,
+            std::string *result)
+{
+  return -EOPNOTSUPP;  // stub: nothing is run, *result is left untouched
+}
+
+#endif
+
+
+
+void get_device_metadata(
+  const std::set<std::string>& devnames,
+  std::map<std::string,std::string> *pm,
+  std::map<std::string,std::string> *errs)
+{
+  (*pm)["devices"] = stringify(devnames);
+  string &devids = (*pm)["device_ids"];      // built as "dev=id,dev=id,..."
+  string &devpaths = (*pm)["device_paths"];  // built as "dev=path,dev=path,..."
+  for (auto& dev : devnames) {
+    string err;
+    string id = get_device_id(dev, &err);
+    if (id.size()) {
+      if (!devids.empty()) {
+	devids += ",";
+      }
+      devids += dev + "=" + id;
+    } else {
+      (*errs)[dev] = " no unique device id for "s + dev + ": " + err;
+    }
+    string path = get_device_path(dev, &err);
+    if (path.size()) {
+      if (!devpaths.empty())
+	devpaths += ",";
+      devpaths += dev + "=" + path;
+    } else {
+      // append: the old `+` built this message and threw it away
+      (*errs)[dev] += " no unique device path for "s + dev + ": " + err;
+    }
+  }
+}
diff --git a/src/common/blkdev.h b/src/common/blkdev.h
new file mode 100644
index 00000000..58e09c89
--- /dev/null
+++ b/src/common/blkdev.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef __CEPH_COMMON_BLKDEV_H
+#define __CEPH_COMMON_BLKDEV_H
+
+#include <set>
+#include <map>
+#include <string>
+#include "json_spirit/json_spirit_value.h"
+
+// Identifies a block device property; this is the selector type taken by
+// BlkDev::get_int_property() and BlkDev::get_string_property().
+enum blkdev_prop_t {
+  BLKDEV_PROP_DEV,
+  BLKDEV_PROP_DISCARD_GRANULARITY,
+  BLKDEV_PROP_MODEL,
+  BLKDEV_PROP_ROTATIONAL,
+  BLKDEV_PROP_SERIAL,
+  BLKDEV_PROP_VENDOR,
+  BLKDEV_PROP_NUMA_NODE,
+  BLKDEV_PROP_NUMA_CPUS,
+  BLKDEV_PROP_NUMPROPS,  // last enumerator: equals the number of properties above
+};
+
+// Free-function interface for block device queries.
+extern int get_device_by_path(const char *path, char* partition, char* device, size_t max);
+
+extern std::string _decode_model_enc(const std::string& in);  // helper, exported only so we can unit test
+
+// get $vendor_$model_$serial style device id
+// (err is optional; it defaults to a null pointer)
+extern std::string get_device_id(const std::string& devname,
+				 std::string *err=0);
+
+// get /dev/disk/by-path/... style device id that is stable for a disk slot across reboots etc
+// (err is optional; it defaults to a null pointer)
+extern std::string get_device_path(const std::string& devname,
+				   std::string *err=0);
+
+// populate daemon metadata map with device info
+extern void get_device_metadata(
+  const std::set<std::string>& devnames,
+  std::map<std::string,std::string> *pm,
+  std::map<std::string,std::string> *errs);
+
+extern void get_dm_parents(const std::string& dev, std::set<std::string> *ls);
+extern int block_device_get_metrics(const std::string& devname, int timeout,
+				    json_spirit::mValue *result);
+
+// do everything to translate a device to the raw physical devices that
+// back it, including partitions -> wholedisks and dm -> constituent devices.
+extern void get_raw_devices(const std::string& in,
+			    std::set<std::string> *ls);
+
+// for VDO
+/// return an open fd for the sysfs stats dir, if this is a VDO device
+extern int get_vdo_stats_handle(const char *devname, std::string *vdo_name);
+extern int64_t get_vdo_stat(int fd, const char *property);
+extern bool get_vdo_utilization(int fd, uint64_t *total, uint64_t *avail);
+
+/// Handle for a block device, constructed either from a file descriptor or
+/// from a device name (see the two constructors and the private members).
+class BlkDev {
+public:
+  BlkDev(int fd);
+  BlkDev(const std::string& devname);
+  /* GoogleMock requires a virtual destructor */
+  virtual ~BlkDev() {}
+
+  // from an fd
+  int discard(int64_t offset, int64_t len) const;
+  int get_size(int64_t *psize) const;
+  int get_devid(dev_t *id) const;
+  int partition(char* partition, size_t max) const;
+  // from a device (e.g., "sdb")
+  bool support_discard() const;
+  bool is_nvme() const;
+  bool is_rotational() const;
+  int get_numa_node(int *node) const;
+  int dev(char *dev, size_t max) const;
+  int vendor(char *vendor, size_t max) const;
+  int model(char *model, size_t max) const;
+  int serial(char *serial, size_t max) const;
+
+  /* virtual for testing purposes */
+  virtual const char *sysfsdir() const;
+  virtual int wholedisk(char* device, size_t max) const;
+  /// Convenience overload of wholedisk(char*, size_t) that returns the name
+  /// in a std::string.  A negative result from the buffer-based call is
+  /// returned as-is and *s is left untouched; otherwise *s receives the
+  /// buffer contents and the same result is returned.
+  int wholedisk(std::string *s) const {
+    // zero-initialised so the buffer is always a valid C string
+    char out[PATH_MAX] = {0};
+    int r = wholedisk(out, sizeof(out));
+    if (r < 0) {
+      return r;
+    }
+    *s = out;
+    return r;
+  }
+
+protected:
+  // property accessors keyed by blkdev_prop_t
+  int64_t get_int_property(blkdev_prop_t prop) const;
+  int64_t get_string_property( blkdev_prop_t prop, char *val,
+    size_t maxlen) const;
+
+private:
+  int fd = -1;          // -1 unless set by a constructor
+  std::string devname;
+};
+
+#endif
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
new file mode 100644
index 00000000..9c149361
--- /dev/null
+++ b/src/common/bloom_filter.cc
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/bloom_filter.hpp"
+
+MEMPOOL_DEFINE_FACTORY(unsigned char, byte, bloom_filter);
+
+/// Serialise the filter into bl.
+///
+/// Field order (mirrored by decode()): salt_count_, insert_count_,
+/// target_element_count_, random_seed_ (each widened to uint64_t), then the
+/// bit table wrapped in a bufferptr of table_size_ bytes.  The salts
+/// themselves are not written.
+void bloom_filter::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 2, bl);
+  encode((uint64_t)salt_count_, bl);
+  encode((uint64_t)insert_count_, bl);
+  encode((uint64_t)target_element_count_, bl);
+  encode((uint64_t)random_seed_, bl);
+  bufferptr bp((const char*)bit_table_, table_size_);
+  encode(bp, bl);
+  ENCODE_FINISH(bl);
+}
+
+/// Replace this filter's state with the one decoded from p.
+///
+/// Reads salt_count_, insert_count_, target_element_count_ and random_seed_
+/// (each as uint64_t) followed by the bit table, then regenerates salt_ via
+/// generate_unique_salt() from the decoded salt count and seed.
+///
+/// The bit table is released and re-allocated through
+/// mempool::bloom_filter::alloc_byte, the same allocator used by init(),
+/// operator= and the destructor, so allocation and deallocation always match.
+void bloom_filter::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(2, p);
+  uint64_t v;
+  decode(v, p);
+  salt_count_ = v;
+  decode(v, p);
+  insert_count_ = v;
+  decode(v, p);
+  target_element_count_ = v;
+  decode(v, p);
+  random_seed_ = v;
+  bufferlist t;
+  decode(t, p);
+
+  salt_.clear();
+  generate_unique_salt();
+
+  // release the previous table with the size it was allocated with, i.e.
+  // before table_size_ is overwritten
+  if (bit_table_) {
+    mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
+    bit_table_ = NULL;
+  }
+  // keep (bit_table_, table_size_) consistent even if allocate() throws
+  table_size_ = 0;
+  const std::size_t new_table_size = t.length();
+  if (new_table_size) {
+    bit_table_ = mempool::bloom_filter::alloc_byte.allocate(new_table_size);
+    table_size_ = new_table_size;
+    t.copy(0, table_size_, (char *)bit_table_);
+  }
+
+  DECODE_FINISH(p);
+}
+
+/// Write the filter's parameters, every salt and every byte of the bit
+/// table to the given Formatter.
+void bloom_filter::dump(Formatter *f) const
+{
+  f->dump_unsigned("salt_count", salt_count_);
+  f->dump_unsigned("table_size", table_size_);
+  f->dump_unsigned("insert_count", insert_count_);
+  f->dump_unsigned("target_element_count", target_element_count_);
+  f->dump_unsigned("random_seed", random_seed_);
+
+  f->open_array_section("salt_table");
+  for (const bloom_type& salt : salt_) {
+    f->dump_unsigned("salt", salt);
+  }
+  f->close_section();
+
+  f->open_array_section("bit_table");
+  for (unsigned idx = 0; idx < table_size_; ++idx) {
+    const unsigned cell = bit_table_[idx];
+    f->dump_unsigned("byte", cell);
+  }
+  f->close_section();
+}
+
+/// Append three heap-allocated sample filters to ls: an empty one, one with
+/// two keys and a larger one with five keys.  Ownership of the pointers
+/// passes to the list's owner.
+void bloom_filter::generate_test_instances(std::list<bloom_filter*>& ls)
+{
+  // no insertions
+  ls.push_back(new bloom_filter(10, .5, 1));
+
+  // same parameters, two keys
+  bloom_filter *two_keys = new bloom_filter(10, .5, 1);
+  two_keys->insert("foo");
+  two_keys->insert("bar");
+  ls.push_back(two_keys);
+
+  // larger target count, five keys
+  bloom_filter *five_keys = new bloom_filter(50, .5, 1);
+  five_keys->insert("foo");
+  five_keys->insert("bar");
+  five_keys->insert("baz");
+  five_keys->insert("boof");
+  five_keys->insert("boogggg");
+  ls.push_back(five_keys);
+}
+
+
+/// Serialise the filter into bl: the base bloom_filter encoding first, then
+/// the number of entries in size_list as a uint32_t, then every entry
+/// widened to uint64_t.
+void compressible_bloom_filter::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 2, bl);
+  bloom_filter::encode(bl);
+
+  uint32_t s = size_list.size();
+  encode(s, bl);
+  for (std::vector<size_t>::const_iterator p = size_list.begin();
+       p != size_list.end(); ++p)
+    encode((uint64_t)*p, bl);
+
+  ENCODE_FINISH(bl);
+}
+
+/// Inverse of encode(): decode the base bloom_filter state, then a uint32_t
+/// entry count followed by that many uint64_t table sizes, which replace
+/// the contents of size_list.
+void compressible_bloom_filter::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(2, p);
+  bloom_filter::decode(p);
+
+  uint32_t s;
+  decode(s, p);
+  // size the vector up front, then fill each slot from the stream
+  size_list.resize(s);
+  for (unsigned i = 0; i < s; i++) {
+    uint64_t v;
+    decode(v, p);
+    size_list[i] = v;
+  }
+
+  DECODE_FINISH(p);
+}
+
+/// Dump the base filter state followed by a "table_sizes" array holding
+/// every entry of size_list.
+void compressible_bloom_filter::dump(Formatter *f) const
+{
+  bloom_filter::dump(f);
+
+  f->open_array_section("table_sizes");
+  for (const size_t& table_bytes : size_list) {
+    f->dump_unsigned("size", (uint64_t)table_bytes);
+  }
+  f->close_section();
+}
+
+/// Append three heap-allocated sample filters to ls: an empty one, one with
+/// two keys and a larger one with five keys.  Ownership of the pointers
+/// passes to the list's owner.
+void compressible_bloom_filter::generate_test_instances(std::list<compressible_bloom_filter*>& ls)
+{
+  // no insertions
+  ls.push_back(new compressible_bloom_filter(10, .5, 1));
+
+  // same parameters, two keys
+  compressible_bloom_filter *two_keys = new compressible_bloom_filter(10, .5, 1);
+  two_keys->insert("foo");
+  two_keys->insert("bar");
+  ls.push_back(two_keys);
+
+  // larger target count, five keys with a compress() call before the last
+  compressible_bloom_filter *five_keys = new compressible_bloom_filter(50, .5, 1);
+  five_keys->insert("foo");
+  five_keys->insert("bar");
+  five_keys->insert("baz");
+  five_keys->insert("boof");
+  // NOTE(review): compress() rejects ratios outside (0, 1), so a ratio of 20
+  // leaves the table unchanged; kept as-is to preserve the generated instance.
+  five_keys->compress(20);
+  five_keys->insert("boogggg");
+  ls.push_back(five_keys);
+}
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
new file mode 100644
index 00000000..8484d4ed
--- /dev/null
+++ b/src/common/bloom_filter.hpp
@@ -0,0 +1,639 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ *******************************************************************
+ *                                                                 *
+ *                        Open Bloom Filter                        *
+ *                                                                 *
+ * Author: Arash Partow - 2000                                     *
+ * URL: http://www.partow.net/programming/hashfunctions/index.html *
+ *                                                                 *
+ * Copyright notice:                                               *
+ * Free use of the Open Bloom Filter Library is permitted under    *
+ * the guidelines and in accordance with the most current version  *
+ * of the Boost Software License, Version 1.0                      *
+ * http://www.opensource.org/licenses/bsl1.0.html                  *
+ *                                                                 *
+ *******************************************************************
+*/
+
+
+#ifndef COMMON_BLOOM_FILTER_HPP
+#define COMMON_BLOOM_FILTER_HPP
+
+#include <cmath>
+
+#include "include/mempool.h"
+#include "include/encoding.h"
+
+// Number of bits in each cell (unsigned char) of a bloom filter bit table;
+// also the number of entries in bit_mask below.
+static const std::size_t bits_per_char = 0x08;    // 8 bits in 1 char(unsigned)
+// bit_mask[i] has only bit i set (least significant bit first); it selects a
+// single bit within a table cell.
+static const unsigned char bit_mask[bits_per_char] = {
+  0x01,  //00000001
+  0x02,  //00000010
+  0x04,  //00000100
+  0x08,  //00001000
+  0x10,  //00010000
+  0x20,  //00100000
+  0x40,  //01000000
+  0x80   //10000000
+};
+
+MEMPOOL_DECLARE_FACTORY(unsigned char, byte, bloom_filter);
+
+/**
+ * Bloom filter over a byte-addressed bit table.
+ *
+ * Every inserted key is hashed once per salt (hash_ap) and the selected bit
+ * is set in bit_table_; contains() reports true only if all of those bits
+ * are set.  The bit table is allocated and released through
+ * mempool::bloom_filter::alloc_byte.
+ *
+ * Invariant maintained by init(), operator= and clear(): bit_table_ is NULL
+ * whenever table_size_ is 0, so the NULL checks in insert()/contains() also
+ * protect compute_indices() from a modulo by zero.
+ */
+class bloom_filter
+{
+protected:
+
+  typedef unsigned int bloom_type;
+  typedef unsigned char cell_type;
+
+  unsigned char*       bit_table_;   ///< pointer to bit map
+  std::vector<bloom_type> salt_;     ///< vector of salts
+  std::size_t         salt_count_;   ///< number of salts
+  std::size_t         table_size_;   ///< bit table size in bytes
+  std::size_t         insert_count_;  ///< insertion count
+  std::size_t         target_element_count_;  ///< target number of unique insertions
+  std::size_t         random_seed_;  ///< random seed
+
+public:
+
+  /// construct an empty filter with no bit table
+  bloom_filter()
+    : bit_table_(0),
+      salt_count_(0),
+      table_size_(0),
+      insert_count_(0),
+      target_element_count_(0),
+      random_seed_(0)
+  {}
+
+  /// construct a filter sized for the given element count and false
+  /// positive probability (which must be > 0).  a random_seed of 0 is
+  /// replaced by 0xA5A5A5A5.
+  bloom_filter(const std::size_t& predicted_inserted_element_count,
+	       const double& false_positive_probability,
+	       const std::size_t& random_seed)
+    : bit_table_(0),
+      insert_count_(0),
+      target_element_count_(predicted_inserted_element_count),
+      random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+  {
+    ceph_assert(false_positive_probability > 0.0);
+    // salt_count_ and table_size_ are filled in here rather than in the
+    // initializer list
+    find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
+			    &salt_count_, &table_size_);
+    init();
+  }
+
+  /// construct a filter with an explicit salt count and table size (bytes).
+  /// a random_seed of 0 is replaced by 0xA5A5A5A5.
+  bloom_filter(const std::size_t& salt_count,
+	       std::size_t table_size,
+	       const std::size_t& random_seed,
+	       std::size_t target_element_count)
+    : bit_table_(0),
+      salt_count_(salt_count),
+      table_size_(table_size),
+      insert_count_(0),
+      target_element_count_(target_element_count),
+      random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+  {
+    init();
+  }
+
+  /// generate the salts and allocate a zeroed bit table of table_size_
+  /// bytes (no table at all when table_size_ is 0)
+  void init() {
+    generate_unique_salt();
+    if (table_size_) {
+      bit_table_ = mempool::bloom_filter::alloc_byte.allocate(table_size_);
+      std::fill_n(bit_table_, table_size_, 0x00);
+    } else {
+      bit_table_ = NULL;
+    }
+  }
+
+  bloom_filter(const bloom_filter& filter)
+    : bit_table_(0)
+  {
+    // bit_table_ is NULL, so operator= never reads the (still
+    // uninitialized) table_size_
+    this->operator=(filter);
+  }
+
+  /// deep-copy another filter, releasing any table currently held
+  bloom_filter& operator = (const bloom_filter& filter)
+  {
+    if (this != &filter) {
+      if (bit_table_) {
+	mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
+	bit_table_ = NULL;
+      }
+      salt_count_ = filter.salt_count_;
+      table_size_ = filter.table_size_;
+      insert_count_ = filter.insert_count_;
+      target_element_count_ = filter.target_element_count_;
+      random_seed_ = filter.random_seed_;
+      // mirror init(): an empty source must leave bit_table_ NULL.  a
+      // zero-byte allocation would yield a non-null table that slips past
+      // the NULL checks and makes compute_indices() divide by zero.
+      if (table_size_) {
+	bit_table_ = mempool::bloom_filter::alloc_byte.allocate(table_size_);
+	std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_);
+      }
+      salt_ = filter.salt_;
+    }
+    return *this;
+  }
+
+  virtual ~bloom_filter()
+  {
+    mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
+  }
+
+  /// true if the filter has no bit table
+  inline bool operator!() const
+  {
+    return (0 == table_size_);
+  }
+
+  /// reset every bit and the insertion count; the table size is unchanged
+  inline void clear()
+  {
+    if (bit_table_)
+      std::fill_n(bit_table_, table_size_, 0x00);
+    insert_count_ = 0;
+  }
+
+  /**
+   * insert a u32 into the set
+   *
+   * NOTE: the internal hash is weak enough that consecutive inputs do
+   * not achieve the desired fpp.  Well-mixed values should be used
+   * here (e.g., put rjhash(x) into the filter instead of just x).
+   *
+   * @param val integer value to insert
+   */
+  inline void insert(uint32_t val) {
+    ceph_assert(bit_table_);
+    std::size_t bit_index = 0;
+    std::size_t bit = 0;
+    for (std::size_t i = 0; i < salt_.size(); ++i)
+    {
+      compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+      // bit_index addresses a bit; >> 3 converts it to a byte offset
+      bit_table_[bit_index >> 3] |= bit_mask[bit];
+    }
+    ++insert_count_;
+  }
+
+  /// insert a key given as a byte range.  asserts that a bit table exists.
+  inline void insert(const unsigned char* key_begin, const std::size_t& length)
+  {
+    ceph_assert(bit_table_);
+    std::size_t bit_index = 0;
+    std::size_t bit = 0;
+    for (std::size_t i = 0; i < salt_.size(); ++i)
+    {
+      compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+      bit_table_[bit_index >> 3] |= bit_mask[bit];
+    }
+    ++insert_count_;
+  }
+
+  /// insert the object representation (sizeof(T) bytes) of t
+  template<typename T>
+  inline void insert(const T& t)
+  {
+    // Note: T must be a C++ POD type.
+    insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
+  }
+
+  /// insert the characters of key (without a terminating NUL)
+  inline void insert(const std::string& key)
+  {
+    insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+  }
+
+  inline void insert(const char* data, const std::size_t& length)
+  {
+    insert(reinterpret_cast<const unsigned char*>(data),length);
+  }
+
+  /// insert every element of the range [begin, end)
+  template<typename InputIterator>
+  inline void insert(const InputIterator begin, const InputIterator end)
+  {
+    InputIterator itr = begin;
+    while (end != itr)
+    {
+      insert(*(itr++));
+    }
+  }
+
+  /**
+   * check if a u32 is contained by set
+   *
+   * NOTE: the internal hash is weak enough that consecutive inputs do
+   * not achieve the desired fpp.  Well-mixed values should be used
+   * here (e.g., put rjhash(x) into the filter instead of just x).
+   *
+   * @param val integer value to query
+   * @returns true if value is (probably) in the set, false if it definitely is not
+   */
+  inline virtual bool contains(uint32_t val) const
+  {
+    if (!bit_table_)
+      return false;
+    std::size_t bit_index = 0;
+    std::size_t bit = 0;
+    for (std::size_t i = 0; i < salt_.size(); ++i)
+    {
+      compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+      if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
+      {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// check whether a key given as a byte range is (probably) in the set;
+  /// always false when there is no bit table
+  inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
+  {
+    if (!bit_table_)
+      return false;
+    std::size_t bit_index = 0;
+    std::size_t bit = 0;
+    for (std::size_t i = 0; i < salt_.size(); ++i)
+    {
+      compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+      if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
+      {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// check for the object representation (sizeof(T) bytes) of t
+  template<typename T>
+  inline bool contains(const T& t) const
+  {
+    return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
+  }
+
+  inline bool contains(const std::string& key) const
+  {
+    return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+  }
+
+  inline bool contains(const char* data, const std::size_t& length) const
+  {
+    return contains(reinterpret_cast<const unsigned char*>(data),length);
+  }
+
+  /// @returns end if every element of [begin, end) is contained, otherwise
+  /// an iterator to the first element that is not
+  template<typename InputIterator>
+  inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
+  {
+    InputIterator itr = begin;
+    while (end != itr)
+    {
+      if (!contains(*itr))
+      {
+        return itr;
+      }
+      ++itr;
+    }
+    return end;
+  }
+
+  /// @returns end if no element of [begin, end) is contained, otherwise an
+  /// iterator to the first element that is
+  template<typename InputIterator>
+  inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
+  {
+    InputIterator itr = begin;
+    while (end != itr)
+    {
+      if (contains(*itr))
+      {
+        return itr;
+      }
+      ++itr;
+    }
+    return end;
+  }
+
+  /// table size in bits
+  inline virtual std::size_t size() const
+  {
+    return table_size_ * bits_per_char;
+  }
+
+  /// number of insert() calls since construction or the last clear()
+  inline std::size_t element_count() const
+  {
+    return insert_count_;
+  }
+
+  inline bool is_full() const
+  {
+    return insert_count_ >= target_element_count_;
+  }
+
+  /*
+   * density of bits set.  inconvenient units, but:
+   *    .3  = ~50% target insertions
+   *    .5  = 100% target insertions, "perfectly full"
+   *    .75 = 200% target insertions
+   *   1.0  = all bits set... infinite insertions
+   */
+  inline double density() const
+  {
+    if (!bit_table_)
+      return 0.0;
+    size_t set = 0;
+    uint8_t *p = bit_table_;
+    size_t left = table_size_;
+    while (left-- > 0) {
+      uint8_t c = *p;
+      // c &= c - 1 clears the lowest set bit, so this loop runs once per
+      // set bit in the byte
+      for (; c; ++set)
+	c &= c - 1;
+      ++p;
+    }
+    return (double)set / (double)(table_size_ << 3);
+  }
+
+  virtual inline double approx_unique_element_count() const {
+    // this is not a very good estimate; a better solution should have
+    // some asymptotic behavior as density() approaches 1.0.
+    return (double)target_element_count_ * 2.0 * density();
+  }
+
+  inline double effective_fpp() const
+  {
+    /*
+      Note:
+      The effective false positive probability is calculated using the
+      designated table size and hash function count in conjunction with
+      the current number of inserted elements - not the user defined
+      predicated/expected number of inserted elements.
+    */
+    return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size());
+  }
+
+  /// read-only access to the raw bit table (NULL if there is none)
+  inline const cell_type* table() const
+  {
+    return bit_table_;
+  }
+
+protected:
+
+  /// map a hash to a bit position in the table (bit_index) and to the
+  /// position of that bit within its byte (bit).  requires table_size_ != 0.
+  inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+  {
+    bit_index = hash % (table_size_ << 3);
+    bit = bit_index & 7;
+  }
+
+  /// append salt_count_ salts to salt_ (callers clear salt_ first when
+  /// regenerating)
+  void generate_unique_salt()
+  {
+    /*
+      Note:
+      A distinct hash function need not be implementation-wise
+      distinct. In the current implementation "seeding" a common
+      hash function with different values seems to be adequate.
+    */
+    const unsigned int predef_salt_count = 128;
+    static const bloom_type predef_salt[predef_salt_count] = {
+      0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
+      0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
+      0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
+      0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
+      0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
+      0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
+      0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
+      0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
+      0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
+      0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
+      0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
+      0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
+      0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
+      0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
+      0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
+      0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
+      0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
+      0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
+      0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
+      0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
+      0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
+      0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
+      0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
+      0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
+      0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
+      0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
+      0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
+      0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
+      0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
+      0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
+      0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
+      0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
+    };
+
+    if (salt_count_ <= predef_salt_count)
+    {
+      std::copy(predef_salt,
+		predef_salt + salt_count_,
+		std::back_inserter(salt_));
+       for (unsigned int i = 0; i < salt_.size(); ++i)
+       {
+        /*
+          Note:
+          This is done to integrate the user defined random seed,
+          so as to allow for the generation of unique bloom filter
+          instances.
+        */
+        salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
+       }
+    }
+    else
+    {
+      // more salts requested than predefined: take all predefined ones and
+      // top up with distinct, non-zero pseudo-random values
+      std::copy(predef_salt,predef_salt + predef_salt_count,
+		std::back_inserter(salt_));
+      srand(static_cast<unsigned int>(random_seed_));
+      while (salt_.size() < salt_count_)
+      {
+        bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
+        if (0 == current_salt)
+	  continue;
+        if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
+        {
+          salt_.push_back(current_salt);
+        }
+      }
+    }
+  }
+
+  /// compute the salt count and table size (in bytes) for the requested
+  /// insert count and false positive probability
+  static void find_optimal_parameters(std::size_t target_insert_count,
+				      double target_fpp,
+				      std::size_t *salt_count,
+				      std::size_t *table_size)
+  {
+    /*
+      Note:
+      The following will attempt to find the number of hash functions
+      and minimum amount of storage bits required to construct a bloom
+      filter consistent with the user defined false positive probability
+      and estimated element insertion count.
+    */
+
+    double min_m = std::numeric_limits<double>::infinity();
+    double min_k = 0.0;
+    double curr_m = 0.0;
+    double k = 1.0;
+    while (k < 1000.0)
+    {
+      double numerator  = (- k * target_insert_count);
+      double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
+      curr_m = numerator / denominator;
+
+      if (curr_m < min_m)
+      {
+        min_m = curr_m;
+        min_k = k;
+      }
+      k += 1.0;
+    }
+
+    *salt_count = static_cast<std::size_t>(min_k);
+    size_t t = static_cast<std::size_t>(min_m);
+    // round the bit count up to a whole number of bytes
+    t += (((t & 7) != 0) ? (bits_per_char - (t & 7)) : 0);
+    *table_size = t >> 3;
+  }
+
+  /// mix the four bytes of val (most significant first) into hash
+  inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
+  {
+    hash ^=    (hash <<  7) ^  ((val & 0xff000000) >> 24) * (hash >> 3);
+    hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
+    hash ^=    (hash <<  7) ^  ((val & 0xff00) >> 8) * (hash >> 3);
+    hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
+    return hash;
+  }
+
+  /// mix remaining_length bytes starting at begin into hash, four at a
+  /// time, then two at a time, then a final single byte if one is left
+  inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
+  {
+    const unsigned char* itr = begin;
+
+    while (remaining_length >= 4)
+    {
+      hash ^=    (hash <<  7) ^  (*itr++) * (hash >> 3);
+      hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+      hash ^=    (hash <<  7) ^  (*itr++) * (hash >> 3);
+      hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+      remaining_length -= 4;
+    }
+
+    while (remaining_length >= 2)
+    {
+      hash ^=    (hash <<  7) ^  (*itr++) * (hash >> 3);
+      hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+      remaining_length -= 2;
+    }
+
+    if (remaining_length)
+    {
+      hash ^= (hash <<  7) ^ (*itr) * (hash >> 3);
+    }
+
+    return hash;
+  }
+
+public:
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(bloom_filter)
+
+
+/**
+ * Bloom filter whose bit table can be shrunk after construction.
+ *
+ * size_list records the table size in bytes at construction and after
+ * every successful compress().  compute_indices() reduces a hash modulo
+ * each recorded size in turn, so bits set before a compress() are still
+ * found afterwards.  A default-constructed filter has an empty size_list;
+ * the accessors below treat that as an empty table.
+ */
+class compressible_bloom_filter : public bloom_filter
+{
+public:
+
+  compressible_bloom_filter() : bloom_filter() {}
+
+  compressible_bloom_filter(const std::size_t& predicted_element_count,
+			    const double& false_positive_probability,
+			    const std::size_t& random_seed)
+    : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
+  {
+    size_list.push_back(table_size_);
+  }
+
+  compressible_bloom_filter(const std::size_t& salt_count,
+			    std::size_t table_size,
+			    const std::size_t& random_seed,
+			    std::size_t target_count)
+    : bloom_filter(salt_count, table_size, random_seed, target_count)
+  {
+    size_list.push_back(table_size_);
+  }
+
+  /// current table size in bits (0 if no size has been recorded)
+  inline std::size_t size() const override
+  {
+    // back() on an empty vector is undefined; a default-constructed filter
+    // has no recorded sizes
+    if (size_list.empty())
+      return 0;
+    return size_list.back() * bits_per_char;
+  }
+
+  /**
+   * shrink the bit table to target_ratio of its current size by OR-folding
+   * the tail of the table onto the retained prefix
+   *
+   * @param target_ratio new size as a fraction of the current size; must be
+   *                     strictly between 0 and 1
+   * @returns true if the table was shrunk, false if nothing was changed
+   */
+  inline bool compress(const double& target_ratio)
+  {
+    if (!bit_table_)
+      return false;
+
+    if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
+    {
+      return false;
+    }
+
+    // no recorded size to shrink from
+    if (size_list.empty())
+    {
+      return false;
+    }
+
+    std::size_t original_table_size = size_list.back();
+    std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);
+
+    if ((!new_table_size) || (new_table_size >= original_table_size))
+    {
+      return false;
+    }
+
+    cell_type* tmp = mempool::bloom_filter::alloc_byte.allocate(new_table_size);
+    std::copy(bit_table_, bit_table_ + (new_table_size), tmp);
+    cell_type* itr = bit_table_ + (new_table_size);
+    cell_type* end = bit_table_ + (original_table_size);
+    cell_type* itr_tmp = tmp;
+    cell_type* itr_end = tmp + (new_table_size);
+    // fold the remaining bytes onto the new table, wrapping around as often
+    // as needed
+    while (end != itr)
+    {
+      *(itr_tmp++) |= (*itr++);
+      if (itr_tmp == itr_end)
+	itr_tmp = tmp;
+    }
+
+    mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
+    bit_table_ = tmp;
+    size_list.push_back(new_table_size);
+    table_size_ = new_table_size;
+
+    return true;
+  }
+
+  inline double approx_unique_element_count() const override {
+    // this is not a very good estimate; a better solution should have
+    // some asymptotic behavior as density() approaches 1.0.
+    //
+    // the compress() correction is also bad; it tends to under-estimate.
+    if (size_list.empty())
+      return 0.0;  // nothing recorded; back()/front() would be undefined
+    return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
+  }
+
+private:
+
+  /// like bloom_filter::compute_indices(), but reduces the hash modulo
+  /// every recorded table size (in bits) in order
+  inline void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const override
+  {
+    bit_index = hash;
+    for (std::size_t i = 0; i < size_list.size(); ++i)
+    {
+      bit_index %= size_list[i] << 3;
+    }
+    bit = bit_index & 7;
+  }
+
+  /// table sizes in bytes: the original size followed by one entry per
+  /// successful compress()
+  std::vector<std::size_t> size_list;
+public:
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(compressible_bloom_filter)
+
+#endif
+
+
+/*
+  Note 1:
+  If it can be guaranteed that bits_per_char will be of the form 2^n then
+  the following optimization can be used:
+
+  hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
+
+  Note 2:
+  For performance reasons where possible when allocating memory it should
+  be aligned (aligned_alloc) according to the architecture being used.
+*/
diff --git a/src/common/bounded_key_counter.h b/src/common/bounded_key_counter.h
new file mode 100644
index 00000000..ee7fa304
--- /dev/null
+++ b/src/common/bounded_key_counter.h
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * Author: Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef BOUNDED_KEY_COUNTER_H
+#define BOUNDED_KEY_COUNTER_H
+
+#include <algorithm>
+#include <map>
+#include <tuple>
+#include <vector>
+
+#include "include/ceph_assert.h"
+
+/**
+ * BoundedKeyCounter
+ *
+ * A data structure that counts the number of times a given key is inserted,
+ * and can return the keys with the highest counters. The number of unique keys
+ * is bounded by the given constructor argument, meaning that new keys will be
+ * rejected if they would exceed this bound.
+ *
+ * It is optimized for use where insertion is frequent, but sorted listings are
+ * both infrequent and tend to request a small subset of the available keys.
+ */
+template <typename Key, typename Count>
+class BoundedKeyCounter {
+  /// map type to associate keys with their counter values
+  using map_type = std::map<Key, Count>;
+  using value_type = typename map_type::value_type;
+
+  /// view type used for sorting key-value pairs by their counter value
+  using view_type = std::vector<const value_type*>;
+
+  /// maximum number of counters to store at once
+  const size_t bound;
+
+  /// map of counters, with a maximum size given by 'bound'
+  map_type counters;
+
+  /// storage for sorted key-value pairs. the view is complete exactly when
+  /// sorted.size() == counters.size(); get_highest() rebuilds it otherwise
+  view_type sorted;
+
+  /// remembers how much of the range is actually sorted
+  typename view_type::iterator sorted_position;
+
+  /// invalidate view of sorted entries
+  void invalidate_sorted()
+  {
+    // clear() invalidates every iterator into the vector, so the position
+    // must be taken afterwards
+    sorted.clear();
+    sorted_position = sorted.begin();
+  }
+
+  /// value_type comparison function for sorting in descending order
+  static bool value_greater(const value_type *lhs, const value_type *rhs)
+  {
+    return lhs->second > rhs->second;
+  }
+
+  /// map iterator that adapts value_type to value_type*
+  struct const_pointer_iterator : public map_type::const_iterator {
+    const_pointer_iterator(typename map_type::const_iterator i)
+      : map_type::const_iterator(i) {}
+
+    using value_type = typename map_type::const_iterator::value_type*;
+    using reference = const typename map_type::const_iterator::value_type*;
+
+    reference operator*() const {
+      return &map_type::const_iterator::operator*();
+    }
+  };
+
+ protected:
+  /// return the number of sorted entries. marked protected for unit testing
+  size_t get_num_sorted() const
+  {
+    using const_iterator = typename view_type::const_iterator;
+    return std::distance<const_iterator>(sorted.begin(), sorted_position);
+  }
+
+ public:
+  BoundedKeyCounter(size_t bound)
+    : bound(bound)
+  {
+    // reserve up front so push_back() in insert() never reallocates (the
+    // view never holds more than 'bound' entries)
+    sorted.reserve(bound);
+    sorted_position = sorted.begin();
+  }
+
+  /// return the number of keys stored
+  size_t size() const noexcept { return counters.size(); }
+
+  /// return the maximum number of keys
+  size_t capacity() const noexcept { return bound; }
+
+  /// increment a counter for the given key and return its value. if the key was
+  /// not present, insert it. if the map is full, return 0
+  Count insert(const Key& key, Count n = 1)
+  {
+    typename map_type::iterator i;
+
+    if (counters.size() < bound) {
+      // insert new entries at count=0
+      bool inserted;
+      std::tie(i, inserted) = counters.emplace(key, 0);
+      if (inserted) {
+        sorted.push_back(&*i);
+      }
+    } else {
+      // when full, refuse to insert new entries
+      i = counters.find(key);
+      if (i == counters.end()) {
+        return 0;
+      }
+    }
+
+    i->second += n; // add to the counter
+
+    // update sorted position if necessary. use a binary search for the last
+    // element in the sorted range that's greater than this counter
+    sorted_position = std::lower_bound(sorted.begin(), sorted_position,
+                                       &*i, &value_greater);
+
+    return i->second;
+  }
+
+  /// remove the given key from the map of counters
+  void erase(const Key& key)
+  {
+    auto i = counters.find(key);
+    if (i == counters.end()) {
+      return;
+    }
+    // removing the sorted entry would require linear search; invalidate instead
+    invalidate_sorted();
+
+    counters.erase(i);
+  }
+
+  /// query the highest N key-value pairs sorted by counter value, passing each
+  /// in order to the given callback with arguments (Key, Count)
+  template <typename Callback>
+  void get_highest(size_t count, Callback&& cb)
+  {
+    if (sorted.size() != counters.size()) {
+      // the view is stale: erase() dropped it, and insert() may since have
+      // appended only the keys added afterwards. rebuild it with pointers
+      // to all key-value pairs
+      sorted.assign(const_pointer_iterator{counters.cbegin()},
+                    const_pointer_iterator{counters.cend()});
+      // entire range is unsorted
+      sorted_position = sorted.begin();
+    }
+
+    const size_t sorted_count = get_num_sorted();
+    if (sorted_count < count) {
+      // move sorted_position to cover the requested number of entries
+      sorted_position = sorted.begin() + std::min(count, sorted.size());
+
+      // sort all entries in descending order up to the given position
+      std::partial_sort(sorted.begin(), sorted_position, sorted.end(),
+                        &value_greater);
+    }
+
+    // return the requested range via callback
+    for (const auto& pair : sorted) {
+      if (count-- == 0) {
+        return;
+      }
+      cb(pair->first, pair->second);
+    }
+  }
+
+  /// remove all keys and counters and invalidate the sorted range
+  void clear()
+  {
+    invalidate_sorted();
+    counters.clear();
+  }
+};
+
+#endif // BOUNDED_KEY_COUNTER_H
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
new file mode 100644
index 00000000..8caaae90
--- /dev/null
+++ b/src/common/buffer.cc
@@ -0,0 +1,2271 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <atomic>
+#include <errno.h>
+#include <limits.h>
+
+#include <sys/uio.h>
+
+#include "include/ceph_assert.h"
+#include "include/types.h"
+#include "include/buffer_raw.h"
+#include "include/compat.h"
+#include "include/mempool.h"
+#include "armor.h"
+#include "common/environment.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/strtol.h"
+#include "common/likely.h"
+#include "common/valgrind.h"
+#include "common/deleter.h"
+#include "common/RWLock.h"
+#include "include/spinlock.h"
+#include "include/scope_guard.h"
+
+#if defined(HAVE_XIO)
+#include "msg/xio/XioMsg.h"
+#endif
+
+using namespace ceph;
+
+#define CEPH_BUFFER_ALLOC_UNIT  4096u
+#define CEPH_BUFFER_APPEND_SIZE (CEPH_BUFFER_ALLOC_UNIT - sizeof(raw_combined))
+
+#ifdef BUFFER_DEBUG
+static ceph::spinlock debug_lock;
+# define bdout { std::lock_guard<ceph::spinlock> lg(debug_lock); std::cout
+# define bendl std::endl; }
+#else
+# define bdout if (0) { std::cout
+# define bendl std::endl; }
+#endif
+
+  static std::atomic<unsigned> buffer_cached_crc { 0 };
+  static std::atomic<unsigned> buffer_cached_crc_adjusted { 0 };
+  static std::atomic<unsigned> buffer_missed_crc { 0 };
+
+  static bool buffer_track_crc = get_env_bool("CEPH_BUFFER_TRACK");
+
+  // Set the file-local `buffer_track_crc` flag, whose initial value is read
+  // from the CEPH_BUFFER_TRACK environment variable.
+  void buffer::track_cached_crc(bool b) { buffer_track_crc = b; }
+  // Current value of the `buffer_cached_crc` counter (an atomic unsigned,
+  // converted to int on return).
+  int buffer::get_cached_crc() { return buffer_cached_crc.load(); }
+  // Current value of the `buffer_cached_crc_adjusted` counter (an atomic
+  // unsigned, converted to int on return).
+  int buffer::get_cached_crc_adjusted() {
+    return buffer_cached_crc_adjusted.load();
+  }
+
+  // Current value of the `buffer_missed_crc` counter (an atomic unsigned,
+  // converted to int on return).
+  int buffer::get_missed_crc() { return buffer_missed_crc.load(); }
+
+  // Fixed description for buffer::error.
+  // NOTE(review): the text says "exception" although the class is `error`.
+  const char * buffer::error::what() const throw () {
+    return "buffer::exception";
+  }
+  // Fixed description for buffer::bad_alloc.
+  const char * buffer::bad_alloc::what() const throw () {
+    return "buffer::bad_alloc";
+  }
+  // Fixed description for buffer::end_of_buffer.
+  const char * buffer::end_of_buffer::what() const throw () {
+    return "buffer::end_of_buffer";
+  }
+  // Returns the `buf` member; the returned pointer refers to storage owned
+  // by this exception object.
+  const char * buffer::malformed_input::what() const throw () {
+    return buf;
+  }
+  // Build an error_code: the malformed_input base is constructed from the
+  // text cpp_strerror() produces for `error`, and the number itself is kept
+  // in `code`.
+  buffer::error_code::error_code(int error) :
+    buffer::malformed_input(cpp_strerror(error).c_str()), code(error) {}
+
+  /*
+   * raw_combined is always placed within a single allocation along
+   * with the data buffer.  the data goes at the beginning, and
+   * raw_combined at the end.
+   */
+  class buffer::raw_combined : public buffer::raw {
+    size_t alignment;  // alignment value the object was created with
+  public:
+    // Describe an already allocated data area of `l` bytes at `dataptr`.
+    // create() is the function that carves both the data area and this
+    // object out of one allocation.
+    raw_combined(char *dataptr, unsigned l, unsigned align,
+		 int mempool)
+      : raw(dataptr, l, mempool),
+	alignment(align) {
+    }
+    // New raw_combined with the same length and alignment.
+    // NOTE(review): the mempool is not forwarded here, so create() falls
+    // back to its default argument (mempool_buffer_anon).
+    raw* clone_empty() override {
+      return create(len, alignment);
+    }
+
+    // Allocate one chunk holding `len` data bytes followed by the
+    // raw_combined object itself. An `align` of 0 is replaced by
+    // sizeof(size_t). Throws bad_alloc when the allocation fails.
+    static raw_combined *create(unsigned len,
+				unsigned align,
+				int mempool = mempool::mempool_buffer_anon) {
+      if (!align)
+	align = sizeof(size_t);
+      // both sizes are rounded up to alignof(raw_combined) so that the
+      // object constructed at ptr + datalen is suitably aligned
+      size_t rawlen = round_up_to(sizeof(buffer::raw_combined),
+				  alignof(buffer::raw_combined));
+      size_t datalen = round_up_to(len, alignof(buffer::raw_combined));
+
+#ifdef DARWIN
+      // note: this path does not use `align`
+      char *ptr = (char *) valloc(rawlen + datalen);
+#else
+      char *ptr = 0;
+      int r = ::posix_memalign((void**)(void*)&ptr, align, rawlen + datalen);
+      if (r)
+	throw bad_alloc();
+#endif /* DARWIN */
+      if (!ptr)
+	throw bad_alloc();
+
+      // actual data first, since it has presumably larger alignment restriction
+      // then put the raw_combined at the end
+      return new (ptr + datalen) raw_combined(ptr, len, align, mempool);
+    }
+
+    // The object lives inside the allocation that starts at `data`, so
+    // freeing `data` releases the data area and the object together.
+    static void operator delete(void *ptr) {
+      raw_combined *raw = (raw_combined *)ptr;
+      ::free((void *)raw->data);
+    }
+  };
+
+  // buffer::raw whose storage is obtained from malloc(), or adopted from the
+  // caller, and handed to free() on destruction.
+  class buffer::raw_malloc : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // Allocate `l` bytes; a zero length leaves `data` null.
+    // Throws bad_alloc when malloc() fails.
+    explicit raw_malloc(unsigned l) : raw(l) {
+      data = nullptr;
+      if (len) {
+        data = static_cast<char *>(malloc(len));
+        if (data == nullptr)
+          throw bad_alloc();
+      }
+      bdout << "raw_malloc " << this << " alloc " << (void *)data << " " << l << bendl;
+    }
+    // Adopt the existing buffer `b` of `l` bytes; the destructor passes it
+    // to free().
+    raw_malloc(unsigned l, char *b) : raw(b, l) {
+      bdout << "raw_malloc " << this << " alloc " << (void *)data << " " << l << bendl;
+    }
+    ~raw_malloc() override {
+      free(data);
+      bdout << "raw_malloc " << this << " free " << (void *)data << " " << bendl;
+    }
+    // Fresh raw_malloc of the same length; contents are not copied.
+    raw* clone_empty() override {
+      return new raw_malloc(len);
+    }
+  };
+
+#ifndef __CYGWIN__
+  // buffer::raw whose storage comes from posix_memalign() (valloc() when
+  // DARWIN is defined) and is released with free().
+  class buffer::raw_posix_aligned : public buffer::raw {
+    unsigned align;
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // `_align` must be a power of two that is at least sizeof(void *).
+    // Throws bad_alloc when the allocation fails.
+    raw_posix_aligned(unsigned l, unsigned _align) : raw(l), align(_align) {
+      ceph_assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
+#ifdef DARWIN
+      data = (char *) valloc(len);
+#else
+      if (::posix_memalign((void**)(void*)&data, align, len) != 0)
+        throw bad_alloc();
+#endif /* DARWIN */
+      if (data == nullptr)
+        throw bad_alloc();
+      bdout << "raw_posix_aligned " << this << " alloc " << (void *)data
+            << " l=" << l << ", align=" << align << bendl;
+    }
+    ~raw_posix_aligned() override {
+      ::free(data);
+      bdout << "raw_posix_aligned " << this << " free " << (void *)data << bendl;
+    }
+    // Fresh buffer with the same length and alignment; contents not copied.
+    raw* clone_empty() override {
+      return new raw_posix_aligned(len, align);
+    }
+  };
+#endif
+
+#ifdef __CYGWIN__
+  // Aligned buffer used when __CYGWIN__ is defined: over-allocate with new[]
+  // and hand out the first suitably aligned address inside that allocation.
+  // The offset arithmetic relies on `align` being a power of two.
+  class buffer::raw_hack_aligned : public buffer::raw {
+    unsigned align;
+    char *realdata;  // start of the new[] allocation; `data` points into it
+  public:
+    raw_hack_aligned(unsigned l, unsigned _align) : raw(l) {
+      align = _align;
+      realdata = new char[len+align-1];
+      // Do the address arithmetic in size_t: casting the pointer to
+      // `unsigned` discards the upper address bits (and is rejected by
+      // compilers) on targets where pointers are wider than unsigned.
+      const size_t off = ((size_t)realdata) & (align-1);
+      if (off)
+        data = realdata + align - off;
+      else
+        data = realdata;
+      //cout << "hack aligned " << (size_t)data
+      //<< " in raw " << (size_t)realdata
+      //<< " off " << off << std::endl;
+      ceph_assert(((size_t)data & (align-1)) == 0);
+    }
+    ~raw_hack_aligned() override {
+      delete[] realdata;
+    }
+    // Fresh buffer with the same length and alignment; contents not copied.
+    raw* clone_empty() override {
+      return new raw_hack_aligned(len, align);
+    }
+  };
+#endif
+
+  /*
+   * primitive buffer types
+   */
+  // buffer::raw whose storage is a new[]-allocated char array (or one adopted
+  // from the caller) that is released with delete[].
+  class buffer::raw_char : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // Allocate `l` bytes; a zero length leaves `data` null.
+    explicit raw_char(unsigned l) : raw(l) {
+      data = len ? new char[len] : nullptr;
+      bdout << "raw_char " << this << " alloc " << (void *)data << " " << l << bendl;
+    }
+    // Adopt the array `b` of `l` bytes; the destructor delete[]s it.
+    raw_char(unsigned l, char *b) : raw(b, l) {
+      bdout << "raw_char " << this << " alloc " << (void *)data << " " << l << bendl;
+    }
+    ~raw_char() override {
+      delete[] data;
+      bdout << "raw_char " << this << " free " << (void *)data << bendl;
+    }
+    // Fresh raw_char of the same length; contents are not copied.
+    raw* clone_empty() override {
+      return new raw_char(len);
+    }
+  };
+
+  // buffer::raw that refers to a caller-provided char buffer. Unlike
+  // raw_char, the destructor only logs and does not free `data`.
+  class buffer::raw_claimed_char : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    explicit raw_claimed_char(unsigned l, char *b) : raw(b, l) {
+      bdout << "raw_claimed_char " << this << " alloc " << (void *)data
+	    << " " << l << bendl;
+    }
+    ~raw_claimed_char() override {
+      bdout << "raw_claimed_char " << this << " free " << (void *)data
+	    << bendl;
+    }
+    // The empty clone is an owning raw_char of the same length.
+    raw* clone_empty() override {
+      return new raw_char(len);
+    }
+  };
+
+  // new[]-backed buffer::raw that reports itself as not shareable.
+  class buffer::raw_unshareable : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // Allocate `l` bytes; a zero length leaves `data` null.
+    explicit raw_unshareable(unsigned l) : raw(l) {
+      data = len ? new char[len] : nullptr;
+    }
+    // Adopt the array `b` of `l` bytes; the destructor delete[]s it.
+    raw_unshareable(unsigned l, char *b) : raw(b, l) {
+    }
+    // The empty clone is an ordinary raw_char of the same length.
+    raw* clone_empty() override {
+      return new raw_char(len);
+    }
+    bool is_shareable() const override {
+      return false; // !shareable, will force make_shareable()
+    }
+    ~raw_unshareable() override {
+      delete[] data;
+    }
+  };
+
+  // buffer::raw that points at memory it does not own: the destructor is
+  // empty, so the bytes at `d` must stay valid for as long as this raw is
+  // in use.
+  class buffer::raw_static : public buffer::raw {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+
+    // const is cast away because raw stores a non-const data pointer
+    raw_static(const char *d, unsigned l) : raw((char*)d, l) { }
+    ~raw_static() override {}
+    // The empty clone is an owning raw_char of the same length.
+    raw* clone_empty() override {
+      return new buffer::raw_char(len);
+    }
+  };
+
+  // buffer::raw over caller-provided memory paired with a `deleter` object.
+  // The destructor body is empty; `del` is destroyed together with this raw,
+  // which is presumably what releases the claimed memory (see
+  // common/deleter.h -- TODO confirm).
+  class buffer::raw_claim_buffer : public buffer::raw {
+    deleter del;
+   public:
+    raw_claim_buffer(const char *b, unsigned l, deleter d)
+        : raw((char*)b, l), del(std::move(d)) { }
+    ~raw_claim_buffer() override {}
+    // The empty clone is an owning raw_char of the same length.
+    raw* clone_empty() override {
+      return new buffer::raw_char(len);
+    }
+  };
+
+#if defined(HAVE_XIO)
+  // buffer::raw over message memory associated with an XioDispatchHook.
+  // The constructor stores the result of _m_hook->get(); the matching put()
+  // happens in operator delete.
+  class buffer::xio_msg_buffer : public buffer::raw {
+  private:
+    XioDispatchHook* m_hook;
+  public:
+    xio_msg_buffer(XioDispatchHook* _m_hook, const char *d,
+	unsigned l) :
+      raw((char*)d, l), m_hook(_m_hook->get()) {}
+
+    bool is_shareable() const override { return false; }
+    // Only returns the hook reference; no memory is freed here.
+    static void operator delete(void *p)
+    {
+      xio_msg_buffer *buf = static_cast<xio_msg_buffer*>(p);
+      // return hook ref (counts against pool);  it appears illegal
+      // to do this in our dtor, because this fires after that
+      buf->m_hook->put();
+    }
+    // The empty clone is an ordinary raw_char of the same length.
+    raw* clone_empty() {
+      return new buffer::raw_char(len);
+    }
+  };
+
+  // buffer::raw whose data pointer is the `addr` of an xio_reg_mem
+  // descriptor. The descriptor pointer is kept in `mp`; the destructor is
+  // empty, so nothing is released here.
+  class buffer::xio_mempool : public buffer::raw {
+  public:
+    struct xio_reg_mem *mp;
+    xio_mempool(struct xio_reg_mem *_mp, unsigned l) :
+      raw((char*)_mp->addr, l), mp(_mp)
+    { }
+    ~xio_mempool() {}
+    // The empty clone is an ordinary raw_char of the same length.
+    raw* clone_empty() {
+      return new buffer::raw_char(len);
+    }
+  };
+
+  // Return the xio_reg_mem descriptor behind `bp` when its raw is a
+  // buffer::xio_mempool, or a null pointer for any other kind of raw.
+  struct xio_reg_mem* get_xio_mp(const buffer::ptr& bp)
+  {
+    auto *mempool_raw = dynamic_cast<buffer::xio_mempool*>(bp.get_raw());
+    return mempool_raw ? mempool_raw->mp : nullptr;
+  }
+
+  // Construct an xio_msg_buffer describing `len` bytes at `buf`. The object
+  // itself is placed (placement new) in memory obtained from the hook's
+  // XioPool rather than from the heap.
+  // NOTE(review): the result of pool.alloc() is used without a null check.
+  buffer::raw* buffer::create_msg(
+      unsigned len, char *buf, XioDispatchHook* m_hook) {
+    XioPool& pool = m_hook->get_pool();
+    buffer::raw* bp =
+      static_cast<buffer::raw*>(pool.alloc(sizeof(xio_msg_buffer)));
+    new (bp) xio_msg_buffer(m_hook, buf, len);
+    return bp;
+  }
+#endif /* HAVE_XIO */
+
+  // Allocate a raw of `len` bytes (created with alignment sizeof(size_t))
+  // and fill it with a copy of the `len` bytes starting at `c`.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::copy(const char *c, unsigned len) {
+    auto copied = buffer::create_aligned(len, sizeof(size_t));
+    memcpy(copied->data, c, len);
+    return copied;
+  }
+
+  // Create a raw of `len` bytes via create_aligned() with an alignment of
+  // sizeof(size_t).
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create(unsigned len) {
+    return buffer::create_aligned(len, sizeof(size_t));
+  }
+  // Same as create(), but passes `mempool` on to create_aligned_in_mempool().
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_in_mempool(unsigned len, int mempool) {
+    return buffer::create_aligned_in_mempool(len, sizeof(size_t), mempool);
+  }
+  // Wrap `buf` in a raw_claimed_char, whose destructor does not free the
+  // buffer. The returned raw is a plain heap object owned by the caller.
+  buffer::raw* buffer::claim_char(unsigned len, char *buf) {
+    return new raw_claimed_char(len, buf);
+  }
+  // Create a raw_malloc of `len` bytes (storage from malloc()).
+  buffer::raw* buffer::create_malloc(unsigned len) {
+    return new raw_malloc(len);
+  }
+  // Wrap `buf` in a raw_malloc, which passes it to free() on destruction;
+  // `buf` therefore has to be memory that free() accepts.
+  buffer::raw* buffer::claim_malloc(unsigned len, char *buf) {
+    return new raw_malloc(len, buf);
+  }
+  // Wrap `buf` in a raw_static, which never frees the memory it points at.
+  buffer::raw* buffer::create_static(unsigned len, char *buf) {
+    return new raw_static(buf, len);
+  }
+  // Wrap `buf` in a raw_claim_buffer and move `del` into it, so the deleter
+  // lives exactly as long as the returned raw.
+  buffer::raw* buffer::claim_buffer(unsigned len, char *buf, deleter del) {
+    return new raw_claim_buffer(buf, len, std::move(del));
+  }
+
+  // Create a raw of `len` bytes aligned to `align`, choosing between a
+  // separately allocated raw and a raw_combined (see the notes below).
+  // NOTE(review): `mempool` is only forwarded on the raw_combined path; the
+  // raw_posix_aligned / raw_hack_aligned constructors are not given it.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_aligned_in_mempool(
+    unsigned len, unsigned align, int mempool) {
+    // If alignment is a page multiple, use a separate buffer::raw to
+    // avoid fragmenting the heap.
+    //
+    // Somewhat unexpectedly, I see consistently better performance
+    // from raw_combined than from raw even when the allocation size is
+    // a page multiple (but alignment is not).
+    //
+    // I also see better performance from a separate buffer::raw once the
+    // size passes 8KB.
+    if ((align & ~CEPH_PAGE_MASK) == 0 ||
+	len >= CEPH_PAGE_SIZE * 2) {
+#ifndef __CYGWIN__
+      return ceph::unique_leakable_ptr<buffer::raw>(new raw_posix_aligned(len, align));
+#else
+      return ceph::unique_leakable_ptr<buffer::raw>(new raw_hack_aligned(len, align));
+#endif
+    }
+    return ceph::unique_leakable_ptr<buffer::raw>(
+      raw_combined::create(len, align, mempool));
+  }
+  // create_aligned_in_mempool() with the mempool fixed to
+  // mempool_buffer_anon.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_aligned(
+    unsigned len, unsigned align) {
+    return create_aligned_in_mempool(len, align,
+				     mempool::mempool_buffer_anon);
+  }
+
+  // Create a raw of `len` bytes aligned to CEPH_PAGE_SIZE.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_page_aligned(unsigned len) {
+    return create_aligned(len, CEPH_PAGE_SIZE);
+  }
+  // Create a raw of `len` bytes whose alignment is CEPH_BUFFER_ALLOC_UNIT
+  // when `len` is smaller than a page and CEPH_PAGE_SIZE otherwise.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::create_small_page_aligned(unsigned len) {
+    const unsigned align =
+      (len < CEPH_PAGE_SIZE) ? CEPH_BUFFER_ALLOC_UNIT : CEPH_PAGE_SIZE;
+    return create_aligned(len, align);
+  }
+
+  // Create a raw_unshareable of `len` bytes (is_shareable() returns false).
+  buffer::raw* buffer::create_unshareable(unsigned len) {
+    return new raw_unshareable(len);
+  }
+
+  // Wrap the existing raw `r`, covering its whole length, and take one
+  // reference on it by incrementing `nref`. `r` is dereferenced in the
+  // initializer list, so it must not be null.
+  buffer::ptr::ptr(raw* r) : _raw(r), _off(0), _len(r->len)   // no lock needed; this is an unref raw.
+  {
+    r->nref++;
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Take over the raw held by `r` (via release()) and cover its whole
+  // length. The reference count is stored as 1 rather than incremented.
+  buffer::ptr::ptr(ceph::unique_leakable_ptr<raw> r)
+    : _raw(r.release()),
+      _off(0),
+      _len(_raw->len)
+  {
+    _raw->nref.store(1, std::memory_order_release);
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Allocate a new raw of `l` bytes with buffer::create() and become its
+  // only reference (nref is stored as 1).
+  buffer::ptr::ptr(unsigned l) : _off(0), _len(l)
+  {
+    _raw = buffer::create(l).release();
+    _raw->nref.store(1, std::memory_order_release);
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Allocate a new raw holding a copy of the `l` bytes at `d` (see
+  // buffer::copy()) and become its only reference.
+  buffer::ptr::ptr(const char *d, unsigned l) : _off(0), _len(l)    // ditto.
+  {
+    _raw = buffer::copy(d, l).release();
+    _raw->nref.store(1, std::memory_order_release);
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Copy constructor: share p's raw (taking one more reference when there
+  // is a raw) and use the same offset and length. No data is copied.
+  buffer::ptr::ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len)
+  {
+    if (_raw) {
+      _raw->nref++;
+      bdout << "ptr " << this << " get " << _raw << bendl;
+    }
+  }
+  // Move constructor: take over p's raw, offset and length without touching
+  // the reference count, and leave `p` empty.
+  buffer::ptr::ptr(ptr&& p) noexcept : _raw(p._raw), _off(p._off), _len(p._len)
+  {
+    p._raw = nullptr;
+    p._off = p._len = 0;
+  }
+  // Sub-range constructor: share p's raw and cover `l` bytes starting `o`
+  // bytes into p's window. The range must lie inside p and p must have a
+  // raw; both conditions are asserted.
+  buffer::ptr::ptr(const ptr& p, unsigned o, unsigned l)
+    : _raw(p._raw), _off(p._off + o), _len(l)
+  {
+    // checked without forming `o+l`, which could wrap around in unsigned
+    // arithmetic and let an out-of-range request pass
+    ceph_assert(o <= p._len && l <= p._len - o);
+    ceph_assert(_raw);
+    _raw->nref++;
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Take over the raw held by `r` as its only reference, but use the offset
+  // and length of `p` instead of covering the whole raw.
+  buffer::ptr::ptr(const ptr& p, ceph::unique_leakable_ptr<raw> r)
+    : _raw(r.release()),
+      _off(p._off),
+      _len(p._len)
+  {
+    _raw->nref.store(1, std::memory_order_release);
+    bdout << "ptr " << this << " get " << _raw << bendl;
+  }
+  // Copy assignment: share p's raw and adopt its offset and length; when
+  // `p` has no raw this ptr ends up empty.
+  buffer::ptr& buffer::ptr::operator= (const ptr& p)
+  {
+    // the new reference is taken before the old one is released, so
+    // assigning from a ptr on the same raw (including self-assignment)
+    // cannot drop that raw's last reference in between
+    if (p._raw) {
+      p._raw->nref++;
+      // NOTE(review): this logs the raw being replaced (`_raw`), not p._raw
+      bdout << "ptr " << this << " get " << _raw << bendl;
+    }
+    buffer::raw *raw = p._raw; 
+    release();
+    if (raw) {
+      _raw = raw;
+      _off = p._off;
+      _len = p._len;
+    } else {
+      _off = _len = 0;
+    }
+    return *this;
+  }
+  // Move assignment: release the current raw, then take over p's raw,
+  // offset and length and leave `p` empty.
+  // NOTE(review): release() runs before `p` is read, so on self-move this
+  // ptr ends up empty.
+  buffer::ptr& buffer::ptr::operator= (ptr&& p) noexcept
+  {
+    release();
+    buffer::raw *raw = p._raw;
+    if (raw) {
+      _raw = raw;
+      _off = p._off;
+      _len = p._len;
+      p._raw = nullptr;
+      p._off = p._len = 0;
+    } else {
+      _off = _len = 0;
+    }
+    return *this;
+  }
+
+  // Forward to raw::clone() on the underlying raw. `_raw` is dereferenced
+  // without a check, so this must not be called on an empty ptr.
+  ceph::unique_leakable_ptr<buffer::raw> buffer::ptr::clone()
+  {
+    return _raw->clone();
+  }
+
+  // Exchange raw pointer, offset and length with `other`. Reference counts
+  // are not touched because each raw keeps the same number of owners.
+  void buffer::ptr::swap(ptr& other) noexcept
+  {
+    std::swap(_raw, other._raw);
+    std::swap(_off, other._off);
+    std::swap(_len, other._len);
+  }
+
+  // Drop this ptr's reference to its raw, if it has one, and leave the ptr
+  // detached (`_raw == nullptr`). The raw is deleted when this was its last
+  // reference. `_off` and `_len` are left unchanged.
+  void buffer::ptr::release()
+  {
+    // BE CAREFUL: this is called also for hypercombined ptr_node. After
+    // freeing underlying raw, `*this` can become inaccessible as well!
+    // Hence: detach first and work only on the local copy afterwards.
+    raw* const released = _raw;
+    if (!released) {
+      return;
+    }
+    _raw = nullptr;
+    bdout << "ptr " << this << " release " << released << bendl;
+    // a count of exactly 1 means nobody else holds a reference, so the
+    // atomic decrement can be skipped
+    const bool last_one = (1 == released->nref.load(std::memory_order_acquire));
+    if (likely(last_one) || --released->nref == 0) {
+      // annotate the raw's own counter through the local pointer; `_raw` is
+      // already null here and must not be used to form the address
+      ANNOTATE_HAPPENS_AFTER(&released->nref);
+      ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(&released->nref);
+      delete released;  // dealloc old (if any)
+    } else {
+      ANNOTATE_HAPPENS_BEFORE(&released->nref);
+    }
+  }
+
+  // Mempool recorded on the underlying raw, or mempool_buffer_anon when
+  // this ptr has no raw.
+  int buffer::ptr::get_mempool() const {
+    return _raw ? _raw->mempool : mempool::mempool_buffer_anon;
+  }
+
+  // Forward to raw::reassign_to_mempool(); does nothing for an empty ptr.
+  void buffer::ptr::reassign_to_mempool(int pool) {
+    if (_raw) {
+      _raw->reassign_to_mempool(pool);
+    }
+  }
+  // Forward to raw::try_assign_to_mempool(); does nothing for an empty ptr.
+  void buffer::ptr::try_assign_to_mempool(int pool) {
+    if (_raw) {
+      _raw->try_assign_to_mempool(pool);
+    }
+  }
+
+  // Pointer to the first byte of this ptr's window (raw data + `_off`).
+  // Asserts that the ptr has a raw. Despite the name, nothing here
+  // NUL-terminates the data.
+  const char *buffer::ptr::c_str() const {
+    ceph_assert(_raw);
+    return _raw->get_data() + _off;
+  }
+  // Mutable variant: pointer to the first byte of this ptr's window.
+  // Asserts that the ptr has a raw.
+  char *buffer::ptr::c_str() {
+    ceph_assert(_raw);
+    return _raw->get_data() + _off;
+  }
+  // Pointer one past the last byte of this ptr's window
+  // (raw data + `_off` + `_len`). Asserts that the ptr has a raw.
+  const char *buffer::ptr::end_c_str() const {
+    ceph_assert(_raw);
+    return _raw->get_data() + _off + _len;
+  }
+  // Mutable variant: pointer one past the last byte of this ptr's window.
+  // Asserts that the ptr has a raw.
+  char *buffer::ptr::end_c_str() {
+    ceph_assert(_raw);
+    return _raw->get_data() + _off + _len;
+  }
+
+  // Number of bytes in the raw that lie behind the end of this ptr's window
+  // (raw length minus `_off + _len`); 0 for a ptr without a raw.
+  unsigned buffer::ptr::unused_tail_length() const
+  {
+    return _raw ? _raw->len - (_off+_len) : 0;
+  }
+  // Byte `n` of this ptr's window. Asserts that the ptr has a raw and that
+  // `n` is smaller than the window length.
+  const char& buffer::ptr::operator[](unsigned n) const
+  {
+    ceph_assert(_raw);
+    ceph_assert(n < _len);
+    return _raw->get_data()[_off + n];
+  }
+  // Mutable variant: byte `n` of this ptr's window, with the same asserts.
+  char& buffer::ptr::operator[](unsigned n)
+  {
+    ceph_assert(_raw);
+    ceph_assert(n < _len);
+    return _raw->get_data()[_off + n];
+  }
+
+  // Start of the underlying raw's data, ignoring this ptr's offset.
+  const char *buffer::ptr::raw_c_str() const { ceph_assert(_raw); return _raw->data; }
+  // Length of the underlying raw, which can exceed this ptr's window.
+  unsigned buffer::ptr::raw_length() const { ceph_assert(_raw); return _raw->len; }
+  // Current value of the underlying raw's reference counter.
+  int buffer::ptr::raw_nref() const { ceph_assert(_raw); return _raw->nref; }
+
+  // Copy `l` bytes, starting `o` bytes into this ptr's window, to `dest`.
+  // Asserts that the ptr has a raw; throws end_of_buffer when the range
+  // [o, o+l) does not fit inside the window.
+  void buffer::ptr::copy_out(unsigned o, unsigned l, char *dest) const {
+    ceph_assert(_raw);
+    // written without `o+l` so that a huge offset or length cannot wrap
+    // around in unsigned arithmetic and slip past the bounds check
+    if (o > _len || l > _len - o)
+        throw end_of_buffer();
+    char* src =  _raw->data + _off + o;
+    maybe_inline_memcpy(dest, src, l, 8);
+  }
+
+  // Bytes of the underlying raw not covered by this ptr's window
+  // (raw length minus `_len`). `_raw` is dereferenced without a check.
+  unsigned buffer::ptr::wasted() const
+  {
+    return _raw->len - _len;
+  }
+
+  // Three-way comparison of the two windows: the common prefix is compared
+  // with memcmp() and its non-zero result is returned as is; when the
+  // prefixes are equal the shorter ptr orders first (-1 / 1), and 0 means
+  // equal length and equal content.
+  int buffer::ptr::cmp(const ptr& o) const
+  {
+    // keep the common length unsigned: stored in an int, a length above
+    // INT_MAX would turn negative and reach memcmp() as a huge size_t
+    const unsigned l = _len < o._len ? _len : o._len;
+    if (l) {
+      int r = memcmp(c_str(), o.c_str(), l);
+      if (r)
+        return r;
+    }
+    if (_len < o._len)
+      return -1;
+    if (_len > o._len)
+      return 1;
+    return 0;
+  }
+
+  // Result of mem_is_zero() over this ptr's window. c_str() asserts that
+  // the ptr has a raw.
+  bool buffer::ptr::is_zero() const
+  {
+    return mem_is_zero(c_str(), _len);
+  }
+
+  // Write `c` directly behind the current window and grow the window by
+  // one byte. Asserts that the raw has at least one unused tail byte.
+  // Returns the new end position of the window (`_len + _off`).
+  unsigned buffer::ptr::append(char c)
+  {
+    ceph_assert(_raw);
+    ceph_assert(1 <= unused_tail_length());
+    char* ptr = _raw->data + _off + _len;
+    *ptr = c;
+    _len++;
+    return _len + _off;
+  }
+
+  // Copy `l` bytes from `p` directly behind the current window and grow the
+  // window by `l`. Asserts that the raw has at least `l` unused tail bytes.
+  // Returns the new end position of the window (`_len + _off`).
+  unsigned buffer::ptr::append(const char *p, unsigned l)
+  {
+    ceph_assert(_raw);
+    ceph_assert(l <= unused_tail_length());
+    char* c = _raw->data + _off + _len;
+    maybe_inline_memcpy(c, p, l, 32);
+    _len += l;
+    return _len + _off;
+  }
+
+  // Zero-fill `l` bytes directly behind the current window and grow the
+  // window by `l`. Asserts that the raw has at least `l` unused tail bytes.
+  // Returns the new end position of the window (`_len + _off`).
+  unsigned buffer::ptr::append_zeros(unsigned l)
+  {
+    ceph_assert(_raw);
+    ceph_assert(l <= unused_tail_length());
+    char* c = _raw->data + _off + _len;
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(c, 0, l);
+    _len += l;
+    return _len + _off;
+  }
+
+  // Overwrite `l` bytes of this ptr's window, starting at offset `o`, with
+  // the bytes at `src`. The range must lie inside the window (asserted).
+  // When `crc_reset` is true the raw's crc state is invalidated before the
+  // data is written.
+  void buffer::ptr::copy_in(unsigned o, unsigned l, const char *src, bool crc_reset)
+  {
+    ceph_assert(_raw);
+    ceph_assert(o <= _len);
+    // `l <= _len - o` instead of `o+l <= _len`: the sum could wrap around
+    // in unsigned arithmetic and let an oversized length pass
+    ceph_assert(l <= _len - o);
+    char* dest = _raw->data + _off + o;
+    if (crc_reset)
+        _raw->invalidate_crc();
+    maybe_inline_memcpy(dest, src, l, 64);
+  }
+
+  // Zero-fill the whole window; with `crc_reset` the raw's crc state is
+  // invalidated first.
+  // NOTE(review): with crc_reset set, `_raw` is dereferenced before
+  // c_str() gets the chance to assert that it is non-null.
+  void buffer::ptr::zero(bool crc_reset)
+  {
+    if (crc_reset)
+        _raw->invalidate_crc();
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(c_str(), 0, _len);
+  }
+
+  // Zero-fill `l` bytes of the window starting at offset `o`. The range
+  // must lie inside the window (asserted); with `crc_reset` the raw's crc
+  // state is invalidated first.
+  void buffer::ptr::zero(unsigned o, unsigned l, bool crc_reset)
+  {
+    // checked without forming `o+l`, which could wrap around in unsigned
+    // arithmetic and let an out-of-range request pass
+    ceph_assert(o <= _len && l <= _len - o);
+    if (crc_reset)
+        _raw->invalidate_crc();
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(c_str()+o, 0, l);
+  }
+
+  // -- buffer::list::iterator --
+  /*
+  buffer::list::iterator operator=(const buffer::list::iterator& other)
+  {
+    if (this != &other) {
+      bl = other.bl;
+      ls = other.ls;
+      off = other.off;
+      p = other.p;
+      p_off = other.p_off;
+    }
+    return *this;
+    }*/
+
+  // Construct an iterator over list `l` positioned `o` bytes from its
+  // start. Positioning is done by advance(), so this throws end_of_buffer
+  // when `o` lies beyond the end of the list.
+  template<bool is_const>
+  buffer::list::iterator_impl<is_const>::iterator_impl(bl_t *l, unsigned o)
+    : bl(l), ls(&bl->_buffers), p(ls->begin()), off(0), p_off(0)
+  {
+    advance(o);
+  }
+
+  // Converting constructor from the mutable buffer::list::iterator: copies
+  // its list pointer, offset, node position and in-node offset.
+  template<bool is_const>
+  buffer::list::iterator_impl<is_const>::iterator_impl(const buffer::list::iterator& i)
+    : iterator_impl<is_const>(i.bl, i.off, i.p, i.p_off) {}
+
+  // Move the iterator forward by `o` bytes, stepping over as many nodes as
+  // needed. Landing exactly on the end of the list is allowed; moving past
+  // it throws end_of_buffer.
+  // NOTE(review): when this throws, `p` and `p_off` have already been
+  // changed while `off` has not.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::advance(unsigned o)
+  {
+    //cout << this << " advance " << o << " from " << off
+    //     << " (p_off " << p_off << " in " << p->length() << ")"
+    //     << std::endl;
+
+    p_off +=o;
+    while (p != ls->end()) {
+      if (p_off >= p->length()) {
+        // skip this buffer
+        p_off -= p->length();
+        p++;
+      } else {
+        // somewhere in this buffer!
+        break;
+      }
+    }
+    // leftover bytes after the last node mean the target is past the end
+    if (p == ls->end() && p_off) {
+      throw end_of_buffer();
+    }
+    off += o;
+  }
+
+  // Reposition the iterator `o` bytes from the start of the list by
+  // resetting it to the first node and calling advance(o).
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::seek(unsigned o)
+  {
+    p = ls->begin();
+    off = p_off = 0;
+    advance(o);
+  }
+
+  // Byte at the current position (returned by value). Throws end_of_buffer
+  // when the iterator is at the end of the list.
+  template<bool is_const>
+  char buffer::list::iterator_impl<is_const>::operator*() const
+  {
+    if (p == ls->end())
+      throw end_of_buffer();
+    return (*p)[p_off];
+  }
+
+  // Pre-increment: advance by one byte. Throws end_of_buffer when the
+  // iterator is already at the end of the list.
+  template<bool is_const>
+  buffer::list::iterator_impl<is_const>&
+  buffer::list::iterator_impl<is_const>::operator++()
+  {
+    if (p == ls->end())
+      throw end_of_buffer();
+    advance(1u);
+    return *this;
+  }
+
+  // New ptr sharing the current node's raw and covering the bytes from the
+  // current position to the end of that node. Throws end_of_buffer at the
+  // end of the list.
+  template<bool is_const>
+  buffer::ptr buffer::list::iterator_impl<is_const>::get_current_ptr() const
+  {
+    if (p == ls->end())
+      throw end_of_buffer();
+    return ptr(*p, p_off, p->length() - p_off);
+  }
+
+  // True when the node at the current position and `other` refer to the
+  // same raw object. Throws end_of_buffer at the end of the list.
+  template<bool is_const>
+  bool buffer::list::iterator_impl<is_const>::is_pointing_same_raw(
+    const ptr& other) const
+  {
+    if (p == ls->end())
+      throw end_of_buffer();
+    return p->get_raw() == other.get_raw();
+  }
+
+  // copy data out.
+  // note that these all _append_ to dest!
+  // Copy `len` bytes from the current position into the memory at `dest`
+  // and move the iterator past them. Throws end_of_buffer when the list
+  // ends before `len` bytes were copied; bytes copied up to that point
+  // stay in `dest`.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, char *dest)
+  {
+    if (p == ls->end())
+      seek(off);
+    for (unsigned remaining = len; remaining > 0; ) {
+      if (p == ls->end())
+        throw end_of_buffer();
+
+      // take what is left of the current node, capped at what is wanted
+      const unsigned chunk = std::min<unsigned>(remaining, p->length() - p_off);
+      p->copy_out(p_off, chunk, dest);
+      dest += chunk;
+
+      remaining -= chunk;
+      advance(chunk);
+    }
+  }
+
+  // Copy `len` bytes into `dest`; this overload simply forwards to
+  // copy_deep().
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, ptr &dest)
+  {
+    copy_deep(len, dest);
+  }
+
+  // Replace `dest` with a freshly allocated buffer of `len` bytes holding a
+  // copy of the next `len` bytes, and move the iterator past them. For a
+  // `len` of 0 nothing happens and `dest` is left untouched. Throws
+  // end_of_buffer when the iterator is at the end or the list is too short.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy_deep(unsigned len, ptr &dest)
+  {
+    if (!len) {
+      return;
+    }
+    if (p == ls->end())
+      throw end_of_buffer();
+    dest = create(len);
+    copy(len, dest.c_str());
+  }
+  // Make `dest` refer to the next `len` bytes and move the iterator past
+  // them. When those bytes lie inside the current node, `dest` shares that
+  // node's raw; otherwise they are copied into a newly allocated buffer.
+  // For a `len` of 0 nothing happens and `dest` is left untouched. Throws
+  // end_of_buffer when the iterator is at the end of the list.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy_shallow(unsigned len,
+                                                           ptr &dest)
+  {
+    if (!len) {
+      return;
+    }
+    if (p == ls->end())
+      throw end_of_buffer();
+    const unsigned available = p->length() - p_off;
+    if (len <= available) {
+      // the request fits in the current node: share instead of copying
+      dest = ptr(*p, p_off, len);
+      advance(len);
+    } else {
+      dest = create(len);
+      copy(len, dest.c_str());
+    }
+  }
+
+  // Append the next `len` bytes to the list `dest`, node by node, and move
+  // the iterator past them. Throws end_of_buffer when the list ends before
+  // `len` bytes were appended; what was appended so far stays in `dest`.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, list &dest)
+  {
+    if (p == ls->end())
+      seek(off);
+    for (unsigned remaining = len; remaining > 0; ) {
+      if (p == ls->end())
+        throw end_of_buffer();
+
+      // take what is left of the current node, capped at what is wanted
+      const unsigned chunk = std::min<unsigned>(remaining, p->length() - p_off);
+      dest.append(*p, p_off, chunk);
+
+      remaining -= chunk;
+      advance(chunk);
+    }
+  }
+
+  // Append the next `len` bytes to the string `dest` and move the iterator
+  // past them. Throws end_of_buffer when the list ends before `len` bytes
+  // were appended; what was appended so far stays in `dest`.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy(unsigned len, std::string &dest)
+  {
+    if (p == ls->end())
+      seek(off);
+    for (unsigned remaining = len; remaining > 0; ) {
+      if (p == ls->end())
+        throw end_of_buffer();
+
+      // take what is left of the current node, capped at what is wanted
+      const unsigned chunk = std::min<unsigned>(remaining, p->length() - p_off);
+      dest.append(p->c_str() + p_off, chunk);
+
+      remaining -= chunk;
+      advance(chunk);
+    }
+  }
+
+  // Append everything from the current position to the end of the list to
+  // `dest` (as copied bytes) and leave the iterator at the end.
+  template<bool is_const>
+  void buffer::list::iterator_impl<is_const>::copy_all(list &dest)
+  {
+    if (p == ls->end())
+      seek(off);
+    while (p != ls->end()) {
+      // the rest of the current node
+      const unsigned chunk = p->length() - p_off;
+      dest.append(p->c_str() + p_off, chunk);
+
+      advance(chunk);
+    }
+  }
+
+  // Hand out a direct pointer to the bytes at the current position without
+  // copying them. `*data` is set to the position inside the current node
+  // and the return value is how many contiguous bytes are available there,
+  // capped at `want`; the iterator moves past exactly that many bytes.
+  // Returns 0, leaving `*data` unset, when the iterator is at the end.
+  template<bool is_const>
+  size_t buffer::list::iterator_impl<is_const>::get_ptr_and_advance(
+    size_t want, const char **data)
+  {
+    if (p == ls->end()) {
+      seek(off);
+      if (p == ls->end()) {
+	return 0;
+      }
+    }
+    *data = p->c_str() + p_off;
+    size_t l = std::min<size_t>(p->length() - p_off, want);
+    p_off += l;
+    // step to the next node once the current one is used up
+    if (p_off == p->length()) {
+      ++p;
+      p_off = 0;
+    }
+    off += l;
+    return l;
+  }
+
+  // Feed up to `length` bytes from the current position (capped at what
+  // remains in the list) through ceph_crc32c(), starting from `crc`, and
+  // return the resulting value. The iterator moves past the bytes read.
+  template<bool is_const>
+  uint32_t buffer::list::iterator_impl<is_const>::crc32c(
+    size_t length, uint32_t crc)
+  {
+    size_t todo = std::min<size_t>(length, get_remaining());
+    while (todo > 0) {
+      const char *chunk;
+      const size_t chunk_len = get_ptr_and_advance(todo, &chunk);
+      crc = ceph_crc32c(crc, (unsigned char*)chunk, chunk_len);
+      todo -= chunk_len;
+    }
+    return crc;
+  }
+
+  // explicitly instantiate only the iterator types we need, so we can hide the
+  // details in this compilation unit without introducing unnecessary link time
+  // dependencies.
+  template class buffer::list::iterator_impl<true>;
+  template class buffer::list::iterator_impl<false>;
+
+  // Mutable iterator over `l` positioned `o` bytes from its start; all the
+  // work is done by the iterator_impl constructor.
+  buffer::list::iterator::iterator(bl_t *l, unsigned o)
+    : iterator_impl(l, o)
+  {}
+
+  // Mutable iterator built from an explicit position: list, byte offset,
+  // node iterator and offset inside that node are passed to iterator_impl.
+  buffer::list::iterator::iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+    : iterator_impl(l, o, ip, po)
+  {}
+
+  // copy data in
+  // Overwrite the next `len` bytes of the list with the bytes at `src` and
+  // move the iterator past them; `crc_reset` is passed on to
+  // ptr::copy_in() for every node touched. Throws end_of_buffer when the
+  // list ends before `len` bytes were written.
+  void buffer::list::iterator::copy_in(unsigned len, const char *src, bool crc_reset)
+  {
+    if (p == ls->end())
+      seek(off);
+    for (unsigned remaining = len; remaining > 0; ) {
+      if (p == ls->end())
+        throw end_of_buffer();
+
+      // fill what is left of the current node, capped at what is wanted
+      const unsigned chunk = std::min<unsigned>(remaining, p->length() - p_off);
+      p->copy_in(p_off, chunk, src, crc_reset);
+
+      src += chunk;
+      remaining -= chunk;
+      advance(chunk);
+    }
+  }
+  
+  // Overwrite the next bytes of this list with up to `len` bytes taken from
+  // the start of `otherl`, node by node. When `otherl` holds fewer than
+  // `len` bytes, only that many bytes are written.
+  void buffer::list::iterator::copy_in(unsigned len, const list& otherl)
+  {
+    if (p == ls->end())
+      seek(off);
+    unsigned left = len;
+    for (const auto& node : otherl._buffers) {
+      // whole node, or just the part still wanted
+      const unsigned l = std::min<unsigned>(left, node.length());
+      copy_in(l, node.c_str());
+      left -= l;
+      if (left == 0)
+        break;
+    }
+  }
+
+  // -- buffer::list --
+
+  // Move constructor: take over other's nodes, length and memcopy count,
+  // then clear `other`. The carriage is not taken over; it is set to the
+  // shared `always_empty_bptr`.
+  buffer::list::list(list&& other) noexcept
+    : _buffers(std::move(other._buffers)),
+      _carriage(&always_empty_bptr),
+      _len(other._len),
+      _memcopy_count(other._memcopy_count),
+      last_p(this) {
+    other.clear();
+  }
+
+  // Exchange contents with `other`: length, memcopy count, carriage and the
+  // node container are swapped. The cached iterators are not swapped; both
+  // lists get theirs reset to begin().
+  void buffer::list::swap(list& other) noexcept
+  {
+    std::swap(_len, other._len);
+    std::swap(_memcopy_count, other._memcopy_count);
+    std::swap(_carriage, other._carriage);
+    _buffers.swap(other._buffers);
+    //last_p.swap(other.last_p);
+    last_p = begin();
+    other.last_p = other.begin();
+  }
+
+  // True when both lists hold the same byte sequence, regardless of how
+  // the bytes are split into nodes on either side.
+  bool buffer::list::contents_equal(const ceph::buffer::list& other) const
+  {
+    if (length() != other.length())
+      return false;
+
+    // buffer-wise comparison: walk both node sequences in lockstep and
+    // compare the largest chunk that is contiguous on both sides
+    auto a = std::cbegin(_buffers);
+    auto b = std::cbegin(other._buffers);
+    const auto a_end = std::cend(_buffers);
+    const auto b_end = std::cend(other._buffers);
+    unsigned aoff = 0, boff = 0;
+    // Stop as soon as either side is exhausted. The total lengths are
+    // equal, so whatever is left on the other side can only be zero-length
+    // nodes; testing `b` as well keeps it from being dereferenced at end().
+    while (a != a_end && b != b_end) {
+      unsigned len = a->length() - aoff;
+      if (len > b->length() - boff)
+        len = b->length() - boff;
+      if (memcmp(a->c_str() + aoff, b->c_str() + boff, len) != 0)
+        return false;
+      aoff += len;
+      if (aoff == a->length()) {
+        aoff = 0;
+        ++a;
+      }
+      boff += len;
+      if (boff == b->length()) {
+        boff = 0;
+        ++b;
+      }
+    }
+    return true;
+  }
+
+  // True when the list is non-empty, consists of a single node, and that
+  // node's data starts exactly at `dst`.
+  bool buffer::list::is_provided_buffer(const char* const dst) const
+  {
+    return !_buffers.empty() &&
+           is_contiguous() &&
+           (_buffers.front().c_str() == dst);
+  }
+
+  // True when every node reports is_aligned(align); an empty list is
+  // therefore considered aligned.
+  bool buffer::list::is_aligned(const unsigned align) const
+  {
+    for (const auto& node : _buffers) {
+      if (!node.is_aligned(align)) {
+	return false;
+      }
+    }
+    return true;
+  }
+
+  // True when every node reports is_n_align_sized(align); an empty list
+  // yields true.
+  bool buffer::list::is_n_align_sized(const unsigned align) const
+  {
+    for (const auto& node : _buffers) {
+      if (!node.is_n_align_sized(align)) {
+	return false;
+      }
+    }
+    return true;
+  }
+
+  // True when every node is both is_aligned(align_memory) and
+  // is_n_align_sized(align_size); an empty list yields true.
+  bool buffer::list::is_aligned_size_and_memory(
+    const unsigned align_size,
+    const unsigned align_memory) const
+  {
+    for (const auto& node : _buffers) {
+      if (!node.is_aligned(align_memory) || !node.is_n_align_sized(align_size)) {
+	return false;
+      }
+    }
+    return true;
+  }
+
+  // True when every node reports is_zero(); an empty list yields true.
+  bool buffer::list::is_zero() const {
+    for (const auto& node : _buffers) {
+      if (!node.is_zero()) {
+	return false;
+      }
+    }
+    return true;
+  }
+
+  // Zero-fill every node of the list by calling ptr::zero() on each one.
+  void buffer::list::zero()
+  {
+    for (auto& node : _buffers) {
+      node.zero();
+    }
+  }
+
+  // Zero-fill the byte range [o, o+l) of the list. The range must lie
+  // inside the list (asserted). Each node overlapping the range has just
+  // the overlapping part zeroed.
+  void buffer::list::zero(const unsigned o, const unsigned l)
+  {
+    // checked without forming `o+l`, which could wrap around in unsigned
+    // arithmetic; once this holds, the `o+l` used below cannot overflow
+    ceph_assert(o <= _len && l <= _len - o);
+    unsigned p = 0;  // list offset at which the current node starts
+    for (auto& node : _buffers) {
+      if (p + node.length() > o) {
+        if (p >= o && p+node.length() <= o+l) {
+          // 'o'------------- l -----------|
+          //      'p'-- node.length() --|
+          node.zero();
+        } else if (p >= o) {
+          // 'o'------------- l -----------|
+          //    'p'------- node.length() -------|
+          node.zero(0, o+l-p);
+        } else if (p + node.length() <= o+l) {
+          //     'o'------------- l -----------|
+          // 'p'------- node.length() -------|
+          node.zero(o-p, node.length()-(o-p));
+        } else {
+          //       'o'----------- l -----------|
+          // 'p'---------- node.length() ----------|
+          node.zero(o-p, l);
+        }
+      }
+      p += node.length();
+      if (o+l <= p) {
+        break;  // done
+      }
+    }
+  }
+
+  // True when the list has at most one node (an empty list counts).
+  bool buffer::list::is_contiguous() const
+  {
+    return _buffers.size() <= 1;
+  }
+
+  // is_n_align_sized() evaluated for CEPH_PAGE_SIZE.
+  bool buffer::list::is_n_page_sized() const
+  {
+    return is_n_align_sized(CEPH_PAGE_SIZE);
+  }
+
+  // is_aligned() evaluated for CEPH_PAGE_SIZE.
+  bool buffer::list::is_page_aligned() const
+  {
+    return is_aligned(CEPH_PAGE_SIZE);
+  }
+
+  // Mempool of the last node in the list, or mempool_buffer_anon for an
+  // empty list. Earlier nodes are not consulted.
+  int buffer::list::get_mempool() const
+  {
+    return _buffers.empty() ? mempool::mempool_buffer_anon
+                            : _buffers.back().get_mempool();
+  }
+
+  // Call raw::reassign_to_mempool(pool) on the raw of every node.
+  // NOTE(review): unlike ptr::reassign_to_mempool(), get_raw() is used
+  // here without a null check.
+  void buffer::list::reassign_to_mempool(int pool)
+  {
+    for (auto& p : _buffers) {
+      p.get_raw()->reassign_to_mempool(pool);
+    }
+  }
+
+  // Call raw::try_assign_to_mempool(pool) on the raw of every node.
+  // NOTE(review): unlike ptr::try_assign_to_mempool(), get_raw() is used
+  // here without a null check.
+  void buffer::list::try_assign_to_mempool(int pool)
+  {
+    for (auto& p : _buffers) {
+      p.get_raw()->try_assign_to_mempool(pool);
+    }
+  }
+
+  // Estimate how many bytes of the underlying raws are not part of this
+  // list's content: the lengths of the distinct raws are summed and the
+  // list length is subtracted (clamped at 0). A single-node list returns
+  // that node's wasted() directly.
+  uint64_t buffer::list::get_wasted_space() const
+  {
+    if (_buffers.size() == 1)
+      return _buffers.back().wasted();
+
+    // collect the raw of every node and sort, so that nodes sharing a raw
+    // end up next to each other
+    std::vector<const raw*> raws;
+    raws.reserve(_buffers.size());
+    for (const auto& node : _buffers)
+      raws.push_back(node.get_raw());
+    std::sort(raws.begin(), raws.end());
+
+    // add up each distinct raw exactly once
+    uint64_t total = 0;
+    const raw *previous = nullptr;
+    for (const auto current : raws) {
+      if (current != previous) {
+        previous = current;
+        total += current->len;
+      }
+    }
+    // If multiple buffers are sharing the same raw buffer and they overlap
+    // with each other, the wasted space will be underestimated.
+    const uint64_t used = length();
+    return (total <= used) ? 0 : total - used;
+  }
+
+  // Collapse the list into a single freshly allocated buffer.  A length
+  // that passes the CEPH_PAGE_MASK test gets a page-aligned buffer, any
+  // other non-zero length a plain one; an empty list just drops its nodes.
+  void buffer::list::rebuild()
+  {
+    if (_len != 0) {
+      const bool whole_pages = (_len & ~CEPH_PAGE_MASK) == 0;
+      if (whole_pages) {
+        rebuild(ptr_node::create(buffer::create_page_aligned(_len)));
+      } else {
+        rebuild(ptr_node::create(buffer::create(_len)));
+      }
+      return;
+    }
+    // nothing to copy: detach the carriage and release the nodes
+    _carriage = &always_empty_bptr;
+    _buffers.clear_and_dispose();
+  }
+
+  // Copy the contents of every node into nb, then replace all nodes with
+  // nb (which is dropped instead when its length is zero).  nb is moved to
+  // the mempool of the current front node first.  The list must not be
+  // empty, since the front node is read unconditionally.
+  void buffer::list::rebuild(
+    std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer> nb)
+  {
+    const int pool = _buffers.front().get_mempool();
+    nb->reassign_to_mempool(pool);
+
+    unsigned copied = 0;
+    for (auto it = _buffers.begin(); it != _buffers.end(); ++it) {
+      nb->copy_in(copied, it->length(), it->c_str(), false);
+      copied += it->length();
+    }
+    _memcopy_count += copied;
+
+    // the old nodes go away; the carriage may only point at nb afterwards
+    _carriage = &always_empty_bptr;
+    _buffers.clear_and_dispose();
+    if (likely(nb->length())) {
+      _carriage = nb.get();
+      _buffers.push_back(*nb.release());
+    }
+    invalidate_crc();
+    last_p = begin();
+  }
+
+  // Rebuild using the same value for the size and the memory alignment.
+  bool buffer::list::rebuild_aligned(unsigned align)
+  {
+    const bool rebuilt = rebuild_aligned_size_and_memory(align, align);
+    return rebuilt;
+  }
+  
+  // Consolidate the nodes that do not satisfy the requested alignment.
+  //
+  // Nodes for which both is_aligned(align_memory) and
+  // is_n_align_sized(align_size) hold are kept as they are.  Each run of
+  // other nodes is unlinked into a temporary list and, unless that run is
+  // already a single node aligned to align_memory, copied into one buffer
+  // created with buffer::create_aligned().
+  //
+  // When max_buffers is non-zero and the list has more nodes than
+  // max_buffers and more bytes than max_buffers * align_size, align_size is
+  // raised so that the data is split into larger pieces.
+  //
+  // Returns true when _memcopy_count changed, i.e. data had to be copied.
+  bool buffer::list::rebuild_aligned_size_and_memory(unsigned align_size,
+                                                    unsigned align_memory,
+                                                    unsigned max_buffers)
+  {
+    const unsigned old_memcopy_count = _memcopy_count;
+
+    if (max_buffers && _buffers.size() > max_buffers
+        && _len > (max_buffers * align_size)) {
+      align_size = round_up_to(round_up_to(_len, max_buffers) / max_buffers, align_size);
+    }
+    auto p = std::begin(_buffers);
+    auto p_prev = _buffers.before_begin();
+    while (p != std::end(_buffers)) {
+      // keep anything that's already align and sized aligned
+      if (p->is_aligned(align_memory) && p->is_n_align_sized(align_size)) {
+        p_prev = p++;
+        continue;
+      }
+
+      // consolidate unaligned items, until we get something that is sized+aligned
+      list unaligned;
+      unsigned offset = 0;
+      do {
+        offset += p->length();
+        // no need to reallocate, relinking is enough thankfully to bi::list.
+        auto p_after = _buffers.erase_after(p_prev);
+        unaligned._buffers.push_back(*p);
+        unaligned._len += p->length();
+        p = p_after;
+      } while (p != std::end(_buffers) &&
+               (!p->is_aligned(align_memory) ||
+                !p->is_n_align_sized(align_size) ||
+                (offset % align_size)));
+      if (!(unaligned.is_contiguous() && unaligned._buffers.front().is_aligned(align_memory))) {
+        unaligned.rebuild(
+          ptr_node::create(
+            buffer::create_aligned(unaligned._len, align_memory)));
+        _memcopy_count += unaligned._len;
+      }
+      // rebuild() leaves the list without any node when the consolidated
+      // run holds zero bytes, so front() may only be used after checking.
+      if (!unaligned._buffers.empty()) {
+        _buffers.insert_after(p_prev, *ptr_node::create(unaligned._buffers.front()).release());
+        ++p_prev;
+      }
+    }
+    last_p = begin();
+
+    return (old_memcopy_count != _memcopy_count);
+  }
+  
+  // Shorthand for rebuild_aligned() with CEPH_PAGE_SIZE.
+  bool buffer::list::rebuild_page_aligned()
+  {
+    const bool rebuilt = rebuild_aligned(CEPH_PAGE_SIZE);
+    return rebuilt;
+  }
+
+  // Make sure the append buffer has at least prealloc unused bytes at its
+  // tail, adding a new zero-length page-aligned buffer when it does not.
+  void buffer::list::reserve(size_t prealloc)
+  {
+    if (get_append_buffer_unused_tail_length() >= prealloc) {
+      return;  // the current append buffer is roomy enough
+    }
+    auto fresh = ptr_node::create(buffer::create_page_aligned(prealloc));
+    fresh->set_length(0);   // unused, so far.
+    _carriage = fresh.get();
+    _buffers.push_back(*fresh.release());
+  }
+
+  // Assignment-like helper: drop the current contents, then take over
+  // everything bl holds through claim_append().
+  void buffer::list::claim(list& bl, unsigned int flags)
+  {
+    clear();                 // release whatever this list held before
+    claim_append(bl, flags);
+  }
+
+  void buffer::list::claim_append(list& bl, unsigned int flags)
+  {  // Move every node of bl onto the tail of this list; bl ends up empty.
+    // steal the other guy's buffers
+    _len += bl._len;
+    if (!(flags & CLAIM_ALLOW_NONSHAREABLE)) {  // non-shareable raws must be replaced by clones first
+      auto curbuf = bl._buffers.begin();
+      auto curbuf_prev = bl._buffers.before_begin();
+
+      while (curbuf != bl._buffers.end()) {
+	const auto* const raw = curbuf->get_raw();
+	if (unlikely(raw && !raw->is_shareable())) {
+	  auto* clone = ptr_node::copy_hypercombined(*curbuf);  // new node backed by a clone of the raw
+	  curbuf = bl._buffers.erase_after_and_dispose(curbuf_prev);  // drop the original; curbuf is now its successor
+	  bl._buffers.insert_after(curbuf_prev, *clone);
+	  ++curbuf_prev;  // step onto the clone that was just linked in
+	} else {
+	  curbuf_prev = curbuf++;
+	}
+      }
+    }
+    _buffers.splice_back(bl._buffers);
+    bl._carriage = &always_empty_bptr;  // bl must not keep pointing at a node it no longer holds
+    bl._buffers.clear_and_dispose();
+    bl._len = 0;
+    bl.last_p = bl.begin();
+  }
+
+  // Append every node of bl one at a time through append(ptr, off, len),
+  // then clear bl.
+  void buffer::list::claim_append_piecewise(list& bl)
+  {
+    for (const auto& piece : bl.buffers()) {
+      const unsigned piece_len = piece.length();
+      append(piece, 0, piece_len);
+    }
+    bl.clear();
+  }
+
+  // Copy len bytes starting at offset off into dest.  Throws end_of_buffer
+  // when off + len exceeds the list length.
+  void buffer::list::copy(unsigned off, unsigned len, char *dest) const
+  {
+    if (off + len > length()) {
+      throw end_of_buffer();
+    }
+    // last_p is reused as-is when it already sits at the requested offset
+    const bool already_there = (last_p.get_off() == off);
+    if (!already_there) {
+      last_p.seek(off);
+    }
+    last_p.copy(len, dest);
+  }
+
+  // Copy len bytes starting at offset off into the list dest.  Throws
+  // end_of_buffer when off + len exceeds the list length.
+  void buffer::list::copy(unsigned off, unsigned len, list &dest) const
+  {
+    if (off + len > length()) {
+      throw end_of_buffer();
+    }
+    // last_p is reused as-is when it already sits at the requested offset
+    const bool already_there = (last_p.get_off() == off);
+    if (!already_there) {
+      last_p.seek(off);
+    }
+    last_p.copy(len, dest);
+  }
+
+  // Copy len bytes starting at offset off into the string dest.  No range
+  // check is done here; the request is handed straight to the iterator.
+  void buffer::list::copy(unsigned off, unsigned len, std::string& dest) const
+  {
+    const bool already_there = (last_p.get_off() == off);
+    if (!already_there) {
+      last_p.seek(off);
+    }
+    last_p.copy(len, dest);
+  }
+    
+  // Overwrite len bytes starting at offset off with the bytes at src;
+  // crc_reset is passed on to the iterator.  Throws end_of_buffer when
+  // off + len exceeds the list length.
+  void buffer::list::copy_in(unsigned off, unsigned len, const char *src, bool crc_reset)
+  {
+    if (off + len > length()) {
+      throw end_of_buffer();
+    }
+
+    const bool already_there = (last_p.get_off() == off);
+    if (!already_there) {
+      last_p.seek(off);
+    }
+    last_p.copy_in(len, src, crc_reset);
+  }
+
+  // Overwrite len bytes starting at offset off with data taken from src.
+  // No range check is done here; the request goes straight to the iterator.
+  void buffer::list::copy_in(unsigned off, unsigned len, const list& src)
+  {
+    const bool already_there = (last_p.get_off() == off);
+    if (!already_there) {
+      last_p.seek(off);
+    }
+    last_p.copy_in(len, src);
+  }
+
+  // Append a single character, allocating a new append buffer of
+  // CEPH_BUFFER_APPEND_SIZE bytes when the current one has no room left.
+  void buffer::list::append(char c)
+  {
+    if (get_append_buffer_unused_tail_length() == 0) {
+      // no room left in the append buffer: make a new one
+      auto fresh = ptr_node::create(
+        raw_combined::create(CEPH_BUFFER_APPEND_SIZE, 0, get_mempool()));
+      fresh->set_length(0);   // unused, so far.
+      _carriage = fresh.get();
+      _buffers.push_back(*fresh.release());
+    } else if (unlikely(_carriage != &_buffers.back())) {
+      // the carriage is not the last node any more: link a zero-length
+      // node that starts where the carriage ends and append through it
+      auto tail = ptr_node::create(*_carriage, _carriage->length(), 0);
+      _carriage = tail.get();
+      _buffers.push_back(*tail.release());
+    }
+    _carriage->append(c);
+    ++_len;
+  }
+
+  buffer::ptr buffer::list::always_empty_bptr;  // shared placeholder that _carriage is reset to when a list's nodes are cleared or handed off
+
+  // Allocate a new append buffer able to hold at least len bytes, link it
+  // as the last node, make it the carriage and return it.  The node starts
+  // out with a length of zero.
+  buffer::ptr_node& buffer::list::refill_append_space(const unsigned len)
+  {
+    // make a new buffer.  fill out a complete page, factoring in the
+    // raw_combined overhead.
+    const size_t overhead = sizeof(raw_combined);
+    const size_t need = round_up_to(len, sizeof(size_t)) + overhead;
+    const size_t alen = round_up_to(need, CEPH_BUFFER_ALLOC_UNIT) - overhead;
+    auto fresh = ptr_node::create(raw_combined::create(alen, 0, get_mempool()));
+    fresh->set_length(0);   // unused, so far.
+    _carriage = fresh.get();
+    _buffers.push_back(*fresh.release());
+    return _buffers.back();
+  }
+
+  void buffer::list::append(const char *data, unsigned len)
+  {  // Append len bytes from data: first into the unused tail of the append buffer, the rest into a new one.
+    _len += len;
+
+    const unsigned free_in_last = get_append_buffer_unused_tail_length();
+    const unsigned first_round = std::min(len, free_in_last);  // bytes that still fit into the current append buffer
+    if (first_round) {
+      // _buffers and carriage can desynchronize when 1) a new ptr
+      // we don't own has been added into the _buffers 2) _buffers
+      // has been emptied as a result of std::move or stolen by
+      // claim_append.
+      if (unlikely(_carriage != &_buffers.back())) {
+        auto bptr = ptr_node::create(*_carriage, _carriage->length(), 0);  // zero-length node starting where the carriage ends
+	_carriage = bptr.get();
+	_buffers.push_back(*bptr.release());
+      }
+      _carriage->append(data, first_round);
+    }
+
+    const unsigned second_round = len - first_round;  // bytes that did not fit
+    if (second_round) {
+      auto& new_back = refill_append_space(second_round);
+      new_back.append(data + first_round, second_round);
+    }
+  }
+
+  buffer::list::reserve_t buffer::list::obtain_contiguous_space(
+    const unsigned len)
+  {  // Hand out a write position with at least len contiguous unused bytes behind it.
+    // note: if len < the normal append_buffer size it *might*
+    // be better to allocate a normal-sized append_buffer and
+    // use part of it.  however, that optimizes for the case of
+    // old-style types including new-style types.  and in most
+    // such cases, this won't be the very first thing encoded to
+    // the list, so append_buffer will already be allocated.
+    // OTOH if everything is new-style, we *should* allocate
+    // only what we need and conserve memory.
+    if (unlikely(get_append_buffer_unused_tail_length() < len)) {  // not enough room: allocate exactly len bytes
+      auto new_back = \
+	buffer::ptr_node::create(buffer::create(len)).release();
+      new_back->set_length(0);   // unused, so far.
+      _buffers.push_back(*new_back);
+      _carriage = new_back;
+      return { new_back->c_str(), &new_back->_len, &_len };  // write position, the node's length field, the list's length field
+    } else {
+      if (unlikely(_carriage != &_buffers.back())) {  // carriage is no longer the last node: link a zero-length continuation of it
+        auto bptr = ptr_node::create(*_carriage, _carriage->length(), 0);
+	_carriage = bptr.get();
+	_buffers.push_back(*bptr.release());
+      }
+      return { _carriage->end_c_str(), &_carriage->_len, &_len };  // write position is the current end of the carriage
+    }
+  }
+
+  // Append a copy of bp; simply forwards to push_back().
+  void buffer::list::append(const ptr& bp)
+  {
+    push_back(bp);
+  }
+
+  // Append bp by moving it; simply forwards to push_back().
+  void buffer::list::append(ptr&& bp)
+  {
+    push_back(std::move(bp));
+  }
+
+  // Append the len bytes of bp that start at offset off.  When they follow
+  // directly after the last node inside the same raw buffer, that node is
+  // lengthened instead of linking a new one.
+  void buffer::list::append(const ptr& bp, unsigned off, unsigned len)
+  {
+    ceph_assert(len+off <= bp.length());
+    const bool extends_tail =
+      !_buffers.empty() &&
+      _buffers.back().get_raw() == bp.get_raw() &&
+      _buffers.back().end() == bp.start() + off;
+    if (extends_tail) {
+      // yay contiguous with tail bp!
+      ptr &tail = _buffers.back();
+      tail.set_length(tail.length()+len);
+    } else {
+      // add new item to list
+      _buffers.push_back(*ptr_node::create(bp, off, len).release());
+    }
+    _len += len;
+  }
+
+  // Append a new node for every node of bl, created with ptr_node::create().
+  void buffer::list::append(const list& bl)
+  {
+    _len += bl._len;
+    const auto last = std::cend(bl._buffers);
+    for (auto it = std::cbegin(bl._buffers); it != last; ++it) {
+      _buffers.push_back(*ptr_node::create(*it).release());
+    }
+  }
+
+  // Read in line by line until eof and append each line.  A "\n" is
+  // appended after every non-empty line; empty lines add nothing.
+  void buffer::list::append(std::istream& in)
+  {
+    for (;;) {
+      if (in.eof()) {
+        break;
+      }
+      std::string line;
+      getline(in, line);
+      append(line.c_str(), line.length());
+      if (line.length() != 0) {
+        append("\n", 1);
+      }
+    }
+  }
+
+  buffer::list::contiguous_filler buffer::list::append_hole(const unsigned len)
+  {  // Grow the list by len bytes and return a filler pointing at the start of that region.
+    _len += len;
+
+    if (unlikely(get_append_buffer_unused_tail_length() < len)) {
+      // make a new append_buffer.  fill out a complete page, factoring in
+      // the raw_combined overhead.
+      auto& new_back = refill_append_space(len);
+      new_back.set_length(len);
+      return { new_back.c_str() };  // the hole is the beginning of the new buffer
+    } else if (unlikely(_carriage != &_buffers.back())) {  // enough room, but the carriage is not the last node any more
+      auto bptr = ptr_node::create(*_carriage, _carriage->length(), 0);
+      _carriage = bptr.get();
+      _buffers.push_back(*bptr.release());
+    }
+    _carriage->set_length(_carriage->length() + len);  // reached whenever the existing append buffer had enough room
+    return { _carriage->end_c_str() - len };  // the hole is the last len bytes of the carriage
+  }
+
+  // Put a new zero-filled node of len bytes at the front of the list.
+  void buffer::list::prepend_zero(unsigned len)
+  {
+    auto zeros = ptr_node::create(len);
+    zeros->zero(false);
+    _len += len;
+    _buffers.push_front(*zeros.release());
+  }
+  
+  void buffer::list::append_zero(unsigned len)
+  {  // Append len zero bytes: first into the unused tail of the append buffer, the rest into a new one.
+    _len += len;
+
+    const unsigned free_in_last = get_append_buffer_unused_tail_length();
+    const unsigned first_round = std::min(len, free_in_last);  // zeros that still fit into the current append buffer
+    if (first_round) {
+      if (unlikely(_carriage != &_buffers.back())) {  // carriage is no longer the last node: link a zero-length continuation of it
+        auto bptr = ptr_node::create(*_carriage, _carriage->length(), 0);
+	_carriage = bptr.get();
+	_buffers.push_back(*bptr.release());
+      }
+      _carriage->append_zeros(first_round);
+    }
+
+    const unsigned second_round = len - first_round;  // zeros that did not fit
+    if (second_round) {
+      auto& new_back = refill_append_space(second_round);
+      new_back.set_length(second_round);
+      new_back.zero(false);
+    }
+  }
+
+  
+  /*
+   * Return the byte at index n, walking the nodes to find the one that
+   * holds it.  Throws end_of_buffer when n is not below the list length.
+   */
+  const char& buffer::list::operator[](unsigned n) const
+  {
+    if (n >= _len) {
+      throw end_of_buffer();
+    }
+
+    const auto last = std::cend(_buffers);
+    for (auto it = std::cbegin(_buffers); it != last; ++it) {
+      if (n < it->length()) {
+        return (*it)[n];
+      }
+      // the index lies past this node: make it relative to the next one
+      n -= it->length();
+    }
+    ceph_abort();
+  }
+
+  /*
+   * Return a pointer to the whole contents as one contiguous region.  A
+   * list with more than one node is rebuilt first; a list without nodes
+   * yields a null pointer.
+   */
+  char *buffer::list::c_str()
+  {
+    if (_buffers.empty()) {
+      return nullptr;                   // no buffers
+    }
+
+    auto second = std::cbegin(_buffers);
+    ++second;
+    const bool has_several_nodes = (second != std::cend(_buffers));
+    if (has_several_nodes) {
+      rebuild();
+    }
+    return _buffers.front().c_str();
+  }
+
+  // Return the contents of all nodes concatenated into one string.
+  string buffer::list::to_str() const {
+    string result;
+    result.reserve(length());
+    const auto last = std::cend(_buffers);
+    for (auto it = std::cbegin(_buffers); it != last; ++it) {
+      if (it->length() == 0) {
+        continue;  // nothing to take from an empty node
+      }
+      result.append(it->c_str(), it->length());
+    }
+    return result;
+  }
+
+  // Replace the contents of this list with nodes referring to the len bytes
+  // of other that start at offset off.  Throws end_of_buffer when
+  // off + len exceeds other's length.
+  void buffer::list::substr_of(const list& other, unsigned off, unsigned len)
+  {
+    if (off + len > other.length()) {
+      throw end_of_buffer();
+    }
+
+    clear();
+
+    // step over the nodes that lie entirely before off
+    auto src = std::cbegin(other._buffers);
+    while (off > 0 && off >= src->length()) {
+      off -= src->length();
+      ++src;
+    }
+    ceph_assert(len == 0 || src != std::cend(other._buffers));
+
+    while (len > 0) {
+      // either the request ends inside this node, or everything from off
+      // to the end of the node is taken
+      const bool ends_inside = off + len < src->length();
+      const unsigned take = ends_inside ? len : src->length() - off;
+      _buffers.push_back(*ptr_node::create(*src, off, take).release());
+      _len += take;
+      if (ends_inside) {
+        break;
+      }
+      len -= take;
+      off = 0;
+      ++src;
+    }
+  }
+
+  // funky modifier
+  //
+  // Remove the len bytes starting at offset off from this list.  When
+  // claim_by is non-null, the removed bytes are appended to it.
+  //
+  // A len of zero does nothing.  Throws end_of_buffer when off is at or
+  // past the end of the list, or when the range runs past the end.
+  void buffer::list::splice(unsigned off, unsigned len, list *claim_by /*, bufferlist& replace_with */)
+  {    // fixme?
+    if (len == 0)
+      return;
+
+    if (off >= length())
+      throw end_of_buffer();
+
+    // The removal loop below dereferences curbuf without looking for the
+    // end of the list, so a range running past the end is rejected before
+    // anything is modified.  Written as a subtraction so that off + len
+    // cannot wrap around.
+    if (len > length() - off)
+      throw end_of_buffer();
+
+    // skip off
+    auto curbuf = std::begin(_buffers);
+    auto curbuf_prev = _buffers.before_begin();
+    while (off > 0) {
+      ceph_assert(curbuf != std::end(_buffers));
+      if (off >= (*curbuf).length()) {
+        // skip this buffer
+        off -= (*curbuf).length();
+        curbuf_prev = curbuf++;
+      } else {
+        // somewhere in this buffer!
+        break;
+      }
+    }
+
+    if (off) {
+      // add a reference to the front bit
+      //  insert it before curbuf (which we'll hose)
+      _buffers.insert_after(curbuf_prev,
+                            *ptr_node::create(*curbuf, 0, off).release());
+      // counted twice for now; the loop below subtracts the part of curbuf
+      // that is dropped, including these off bytes
+      _len += off;
+      ++curbuf_prev;
+    }
+
+    _carriage = &always_empty_bptr;
+
+    while (len > 0) {
+      // partial?
+      if (off + len < (*curbuf).length()) {
+        // the range ends inside this buffer: keep its tail
+        if (claim_by)
+          claim_by->append( *curbuf, off, len );
+        (*curbuf).set_offset( off+len + (*curbuf).offset() );    // ignore beginning bit
+        (*curbuf).set_length( (*curbuf).length() - (len+off) );
+        _len -= off+len;
+        break;
+      }
+
+      // hose though the end
+      unsigned howmuch = (*curbuf).length() - off;
+      if (claim_by)
+        claim_by->append( *curbuf, off, howmuch );
+      _len -= (*curbuf).length();
+      curbuf = _buffers.erase_after_and_dispose(curbuf_prev);
+      len -= howmuch;
+      off = 0;
+    }
+
+    // splice in *replace (implement me later?)
+
+    last_p = begin();  // just in case we were in the removed region.
+  }
+
+  // Write the len bytes starting at offset off to out, using substr_of()
+  // to select the range.
+  void buffer::list::write(int off, int len, std::ostream& out) const
+  {
+    list range;
+    range.substr_of(*this, off, len);
+    const auto last = range._buffers.end();
+    for (auto it = range._buffers.begin(); it != last; ++it) {
+      if (it->length() == 0) {
+        continue;
+      }
+      out.write(it->c_str(), it->length());
+    }
+  }
+  
+// Encode the contents of this list with ceph_armor() and append the result
+// to o.  c_str() is used, so this list may get rebuilt into one buffer.
+void buffer::list::encode_base64(buffer::list& o)
+{
+  bufferptr encoded(length() * 4 / 3 + 3);
+  char* const src = c_str();
+  char* const dst = encoded.c_str();
+  const int produced = ceph_armor(dst, dst + encoded.length(), src, src + length());
+  encoded.set_length(produced);
+  o.push_back(std::move(encoded));
+}
+
+// Decode the contents of e with ceph_unarmor() and append the result to
+// this list.  e.c_str() is used, so e may get rebuilt into one buffer.
+//
+// Throws buffer::malformed_input when ceph_unarmor() reports a failure;
+// the message carries a hexdump of the input that could not be decoded.
+void buffer::list::decode_base64(buffer::list& e)
+{
+  bufferptr bp(4 + ((e.length() * 3) / 4));
+  char* const src = e.c_str();
+  int l = ceph_unarmor(bp.c_str(), bp.c_str() + bp.length(), src, src + e.length());
+  if (l < 0) {
+    std::ostringstream oss;
+    oss << "decode_base64: decoding failed:\n";
+    // dump the input that failed to decode, not this (destination) list
+    e.hexdump(oss);
+    throw buffer::malformed_input(oss.str().c_str());
+  }
+  ceph_assert(l <= (int)bp.length());
+  bp.set_length(l);
+  push_back(std::move(bp));
+}
+
+
+// Append the contents of the file fn to this list.
+//
+// Returns 0 on success, or a negative value when opening, stat'ing or
+// reading fails; *error then holds a description.  Reading fewer bytes than
+// fstat() reported only stores a warning in *error and still returns 0.
+int buffer::list::read_file(const char *fn, std::string *error)
+{
+  const int fd = TEMP_FAILURE_RETRY(::open(fn, O_RDONLY|O_CLOEXEC));
+  if (fd < 0) {
+    const int err = errno;
+    std::ostringstream msg;
+    msg << "can't open " << fn << ": " << cpp_strerror(err);
+    *error = msg.str();
+    return -err;
+  }
+
+  struct stat st;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&st, 0, sizeof(st));
+  if (::fstat(fd, &st) < 0) {
+    const int err = errno;
+    std::ostringstream msg;
+    msg << "bufferlist::read_file(" << fn << "): stat error: "
+        << cpp_strerror(err);
+    *error = msg.str();
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+    return -err;
+  }
+
+  int result = 0;
+  const ssize_t nread = read_fd(fd, st.st_size);
+  if (nread < 0) {
+    std::ostringstream msg;
+    msg << "bufferlist::read_file(" << fn << "): read error:"
+        << cpp_strerror(nread);
+    *error = msg.str();
+    result = nread;
+  } else if (nread != st.st_size) {
+    // Premature EOF.
+    // Perhaps the file changed between stat() and read()?
+    std::ostringstream msg;
+    msg << "bufferlist::read_file(" << fn << "): warning: got premature EOF.";
+    *error = msg.str();
+    // not actually an error, but weird
+  }
+  // both the failed and the successful read end with the same close
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  return result;
+}
+
+// Read up to len bytes from fd with safe_read() into a new buffer and, when
+// the read did not fail, append the bytes that were read.  Returns
+// safe_read()'s result.
+ssize_t buffer::list::read_fd(int fd, size_t len)
+{
+  auto chunk = ptr_node::create(buffer::create(len));
+  const ssize_t nread = safe_read(fd, (void*)chunk->c_str(), len);
+  if (nread < 0) {
+    return nread;
+  }
+  chunk->set_length(nread);
+  push_back(std::move(chunk));
+  return nread;
+}
+
+// Write the whole list to the file fn, which is created or truncated with
+// the given mode.  Returns 0 on success, otherwise a non-zero error value
+// after logging the failure to cerr.
+int buffer::list::write_file(const char *fn, int mode)
+{
+  const int fd = TEMP_FAILURE_RETRY(::open(fn, O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, mode));
+  if (fd < 0) {
+    const int err = errno;
+    cerr << "bufferlist::write_file(" << fn << "): failed to open file: "
+         << cpp_strerror(err) << std::endl;
+    return -err;
+  }
+
+  const int write_result = write_fd(fd);
+  if (write_result == 0) {
+    // the data went out; a failing close is still reported
+    if (TEMP_FAILURE_RETRY(::close(fd)) == 0) {
+      return 0;
+    }
+    const int err = errno;
+    cerr << "bufferlist::write_file(" << fn << "): close error: "
+         << cpp_strerror(err) << std::endl;
+    return -err;
+  }
+
+  cerr << "bufferlist::write_fd(" << fn << "): write_fd error: "
+       << cpp_strerror(write_result) << std::endl;
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  return write_result;
+}
+
+static int do_writev(int fd, struct iovec *vec, uint64_t offset, unsigned veclen, unsigned bytes)
+{  // Write `bytes` bytes described by vec[0..veclen) to fd at `offset`; returns 0 or -errno. The vec entries are modified in place.
+  while (bytes > 0) {
+    ssize_t r = 0;
+#ifdef HAVE_PWRITEV
+    r = ::pwritev(fd, vec, veclen, offset);
+#else
+    r = ::lseek64(fd, offset, SEEK_SET);
+    if (r != offset) {
+      return -errno;
+    }
+    r = ::writev(fd, vec, veclen);
+#endif
+    if (r < 0) {
+      if (errno == EINTR)
+        continue;  // interrupted: retry with the same vec and offset
+      return -errno;
+    }
+
+    bytes -= r;
+    offset += r;
+    if (bytes == 0) break;
+
+    while (r > 0) {  // short write: advance vec past the r bytes that did go out
+      if (vec[0].iov_len <= (size_t)r) {
+        // drain this whole item
+        r -= vec[0].iov_len;
+        ++vec;
+        --veclen;
+      } else {
+        vec[0].iov_base = (char *)vec[0].iov_base + r;  // this entry went out only partly: trim its front
+        vec[0].iov_len -= r;
+        break;
+      }
+    }
+  }
+  return 0;
+}
+
+int buffer::list::write_fd(int fd) const
+{
+  // use writev!  Nodes are gathered into batches of at most IOV_MAX entries; returns 0 or -errno.
+  iovec iov[IOV_MAX];
+  int iovlen = 0;
+  ssize_t bytes = 0;
+
+  auto p = std::cbegin(_buffers);
+  while (p != std::cend(_buffers)) {
+    if (p->length() > 0) {  // zero-length nodes are left out of the batch
+      iov[iovlen].iov_base = (void *)p->c_str();
+      iov[iovlen].iov_len = p->length();
+      bytes += p->length();
+      iovlen++;
+    }
+    ++p;
+
+    if (iovlen == IOV_MAX ||
+	p == _buffers.end()) {  // flush when the batch is full or the last node has been seen
+      iovec *start = iov;
+      int num = iovlen;
+      ssize_t wrote;
+    retry:
+      wrote = ::writev(fd, start, num);
+      if (wrote < 0) {
+	int err = errno;
+	if (err == EINTR)
+	  goto retry;
+	return -err;
+      }
+      if (wrote < bytes) {
+	// partial write, recover!
+	while ((size_t)wrote >= start[0].iov_len) {  // step over the entries that went out completely
+	  wrote -= start[0].iov_len;
+	  bytes -= start[0].iov_len;
+	  start++;
+	  num--;
+	}
+	if (wrote > 0) {  // the next entry went out only partly: trim its front
+	  start[0].iov_len -= wrote;
+	  start[0].iov_base = (char *)start[0].iov_base + wrote;
+	  bytes -= wrote;
+	}
+	goto retry;
+      }
+      iovlen = 0;
+      bytes = 0;
+    }
+  }
+  return 0;
+}
+
+// Write the whole list to fd starting at the given file offset, passing the
+// nodes to do_writev() in batches of at most IOV_MAX entries.  Returns 0,
+// or the negative value reported by do_writev().
+int buffer::list::write_fd(int fd, uint64_t offset) const
+{
+  iovec iov[IOV_MAX];
+
+  auto p = std::cbegin(_buffers);
+  for (uint64_t remaining = std::size(_buffers); remaining > 0; ) {
+    const uint64_t batch = std::min<uint64_t>(remaining, IOV_MAX);
+    remaining -= batch;
+
+    // describe the next `batch` nodes and add up their lengths
+    ssize_t bytes = 0;
+    unsigned iovlen = 0;
+    for (; iovlen < batch; ++iovlen, ++p) {
+      iov[iovlen].iov_base = (void *)p->c_str();
+      iov[iovlen].iov_len = p->length();
+      bytes += p->length();
+    }
+
+    const int r = do_writev(fd, iov, offset, iovlen, bytes);
+    if (r < 0) {
+      return r;
+    }
+    offset += bytes;
+  }
+  return 0;
+}
+
+__u32 buffer::list::crc32c(__u32 crc) const
+{  // Fold every non-empty node into crc, reusing crc values cached on the raw buffers where possible.
+  int cache_misses = 0;
+  int cache_hits = 0;
+  int cache_adjusts = 0;
+
+  for (const auto& node : _buffers) {
+    if (node.length()) {
+      raw* const r = node.get_raw();
+      pair<size_t, size_t> ofs(node.offset(), node.offset() + node.length());  // byte range of this node, used as the cache key
+      pair<uint32_t, uint32_t> ccrc;  // cached (initial value, resulting crc) pair
+      if (r->get_crc(ofs, &ccrc)) {
+	if (ccrc.first == crc) {
+	  // got it already
+	  crc = ccrc.second;
+	  cache_hits++;
+	} else {
+	  /* If we have cached crc32c(buf, v) for initial value v,
+	   * we can convert this to a different initial value v' by:
+	   * crc32c(buf, v') = crc32c(buf, v) ^ adjustment
+	   * where adjustment = crc32c(0*len(buf), v ^ v')
+	   *
+	   * http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
+	   * note, u for our crc32c implementation is 0
+	   */
+	  crc = ccrc.second ^ ceph_crc32c(ccrc.first ^ crc, NULL, node.length());
+	  cache_adjusts++;
+	}
+      } else {
+	cache_misses++;
+	uint32_t base = crc;  // initial value this node's crc is computed from
+	crc = ceph_crc32c(crc, (unsigned char*)node.c_str(), node.length());
+	r->set_crc(ofs, make_pair(base, crc));  // store the result on the raw buffer for later calls
+      }
+    }
+  }
+
+  if (buffer_track_crc) {  // bump the counters only when tracking is switched on
+    if (cache_adjusts)
+      buffer_cached_crc_adjusted += cache_adjusts;
+    if (cache_hits)
+      buffer_cached_crc += cache_hits;
+    if (cache_misses)
+      buffer_missed_crc += cache_misses;
+  }
+
+  return crc;
+}
+
+// Call invalidate_crc() on the raw buffer behind every node that has one.
+void buffer::list::invalidate_crc()
+{
+  const auto last = _buffers.end();
+  for (auto it = _buffers.begin(); it != last; ++it) {
+    raw* const backing = it->get_raw();
+    if (backing == nullptr) {
+      continue;
+    }
+    backing->invalidate_crc();
+  }
+}
+
+#include "common/ceph_crypto.h"
+using ceph::crypto::SHA1;
+
+// Compute the SHA1 digest over the contents of all nodes, in order.
+sha1_digest_t buffer::list::sha1()
+{
+  SHA1 hasher;
+  const auto last = _buffers.end();
+  for (auto it = _buffers.begin(); it != last; ++it) {
+    hasher.Update((const unsigned char *)it->c_str(), it->length());
+  }
+  unsigned char digest[CEPH_CRYPTO_SHA1_DIGESTSIZE];
+  hasher.Final(digest);
+  return sha1_digest_t(digest);
+}
+
+/**
+ * Write the raw bytes of every non-empty node to a C++ stream, in order.
+ */
+void buffer::list::write_stream(std::ostream &out) const
+{
+  const auto last = std::cend(_buffers);
+  for (auto it = std::cbegin(_buffers); it != last; ++it) {
+    if (it->length() == 0) {
+      continue;
+    }
+    out.write(it->c_str(), it->length());
+  }
+}
+
+
+/**
+ * Write a hexdump of the list to out, laid out like `hexdump -C`.
+ *
+ * Every row shows the offset, up to 16 bytes in hex and the same bytes as
+ * characters between '|' (anything that is not a letter, digit,
+ * punctuation character or ' ' is shown as '.').  Consecutive all-zero
+ * rows after the first one are replaced by a single "*" line; the final
+ * row is always printed.  With trailing_newline set, the total length and
+ * a newline follow.  Nothing is written for an empty list.
+ *
+ * The format flags and the fill character of out are restored before
+ * returning.
+ */
+void buffer::list::hexdump(std::ostream &out, bool trailing_newline) const
+{
+  if (!length())
+    return;
+
+  const std::ios_base::fmtflags original_flags = out.flags();
+
+  // do our best to match the output of hexdump -C, for better
+  // diff'ing!
+
+  out.setf(std::ios::right);
+  // fill() hands back the previous fill character.  It is not part of the
+  // format flags, so it has to be remembered and restored separately.
+  const char original_fill = out.fill('0');
+
+  unsigned per = 16;
+  bool was_zeros = false, did_star = false;
+  for (unsigned o=0; o<length(); o += per) {
+    if (o + per < length()) {
+      bool row_is_zeros = true;
+      for (unsigned i=0; i<per && o+i<length(); i++) {
+        if ((*this)[o+i]) {
+          row_is_zeros = false;
+          break;  // one non-zero byte settles it
+        }
+      }
+      if (row_is_zeros) {
+        if (was_zeros) {
+          if (!did_star) {
+            out << "\n*";
+            did_star = true;
+          }
+          continue;
+        }
+        was_zeros = true;
+      } else {
+        was_zeros = false;
+        did_star = false;
+      }
+    }
+    if (o)
+      out << "\n";
+    out << std::hex << std::setw(8) << o << " ";
+
+    unsigned i;
+    for (i=0; i<per && o+i<length(); i++) {
+      if (i == 8)
+        out << ' ';
+      out << " " << std::setw(2) << ((unsigned)(*this)[o+i] & 0xff);
+    }
+    for (; i<per; i++) {
+      if (i == 8)
+        out << ' ';
+      out << "   ";
+    }
+
+    out << "  |";
+    for (i=0; i<per && o+i<length(); i++) {
+      // the <cctype> classifiers need a value representable as unsigned
+      // char; a plain char may be negative for bytes above 0x7f
+      const unsigned char c = (*this)[o+i];
+      if (isupper(c) || islower(c) || isdigit(c) || c == ' ' || ispunct(c))
+        out << static_cast<char>(c);
+      else
+        out << '.';
+    }
+    out << '|' << std::dec;
+  }
+  if (trailing_newline) {
+    out << "\n" << std::hex << std::setw(8) << length();
+    out << "\n";
+  }
+
+  out.flags(original_flags);
+  out.fill(original_fill);
+}
+
+
+// Build a list holding one node created with create_static() over the l
+// bytes at c.
+buffer::list buffer::list::static_from_mem(char* c, size_t l) {
+  list result;
+  auto node = ptr_node::create(create_static(l, c));
+  result.push_back(std::move(node));
+  return result;
+}
+
+// static_from_mem() over the strlen(c) bytes of the C string c.
+buffer::list buffer::list::static_from_cstring(char* c) {
+  const size_t text_len = std::strlen(c);
+  return static_from_mem(c, text_len);
+}
+
+// static_from_mem() over the characters of s.
+buffer::list buffer::list::static_from_string(string& s) {
+  // C++14 just has string::data return a char* from a non-const
+  // string, hence the const_cast.  But the way buffer::list mostly
+  // doesn't work in a sane way with const makes me generally sad.
+  char* const chars = const_cast<char*>(s.data());
+  return static_from_mem(chars, s.length());
+}
+
+// Report whether delete_this lives inside the bptr_storage of its own raw
+// buffer.  When it does, an always-on assert fires first (hypercombining
+// is disabled); the code after it runs the node's destructor in place.
+bool buffer::ptr_node::dispose_if_hypercombined(
+  buffer::ptr_node* const delete_this)
+{
+  void* const node_addr = static_cast<void*>(delete_this);
+  void* const storage_addr =
+    static_cast<void*>(&delete_this->get_raw()->bptr_storage);
+  if (node_addr != storage_addr) {
+    return false;
+  }
+  ceph_assert_always("hypercombining is currently disabled" == nullptr);
+  delete_this->~ptr_node();
+  return true;
+}
+
+// Create a heap-allocated ptr_node that takes over r.
+std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>
+buffer::ptr_node::create_hypercombined(ceph::unique_leakable_ptr<buffer::raw> r)
+{
+  // FIXME: we don't currently hypercombine buffers due to crashes
+  // observed in the rados suite. After fixing we'll use placement
+  // new to create ptr_node on buffer::raw::bptr_storage.
+  using node_ptr = std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>;
+  return node_ptr(new ptr_node(std::move(r)));
+}
+
+// Create a heap-allocated ptr_node referring to r.  Both branches build
+// the node the same way for now; see the FIXME below.
+std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>
+buffer::ptr_node::create_hypercombined(buffer::raw* const r)
+{
+  using node_ptr = std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>;
+  if (unlikely(r->nref != 0)) {
+    // the raw is referenced already
+    return node_ptr(new ptr_node(r));
+  }
+  // FIXME: we don't currently hypercombine buffers due to crashes
+  // observed in the rados suite. After fixing we'll use placement
+  // new to create ptr_node on buffer::raw::bptr_storage.
+  return node_ptr(new ptr_node(r));
+}
+
+// Create a heap-allocated ptr_node from copy_this and a clone() of its raw
+// buffer.
+buffer::ptr_node* buffer::ptr_node::copy_hypercombined(
+  const buffer::ptr_node& copy_this)
+{
+  // FIXME: we don't currently hypercombine buffers due to crashes
+  // observed in the rados suite. After fixing we'll use placement
+  // new to create ptr_node on buffer::raw::bptr_storage.
+  auto cloned_raw = copy_this.get_raw()->clone();
+  ptr_node* const copy = new ptr_node(copy_this, std::move(cloned_raw));
+  return copy;
+}
+
+// Produce a new node for clone_this: a plain copy when it has no raw buffer
+// or a shareable one, otherwise a node backed by a clone of the raw.
+buffer::ptr_node* buffer::ptr_node::cloner::operator()(
+  const buffer::ptr_node& clone_this)
+{
+  const raw* const source_raw = clone_this.get_raw();
+  if (unlikely(source_raw && !source_raw->is_shareable())) {
+    // clone non-shareable buffers (make shareable)
+    return copy_hypercombined(clone_this);
+  }
+  return new ptr_node(clone_this);
+}
+
+// Print the data pointer, length and reference count of a raw buffer.
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::raw &r) {
+  out << "buffer::raw(" << (void*)r.data;
+  out << " len " << r.len;
+  out << " nref " << r.nref.load() << ")";
+  return out;
+}
+
+// Print offset~length of a ptr, plus the details of its raw buffer when it
+// has one.
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::ptr& bp) {
+  if (!bp.have_raw()) {
+    out << "buffer:ptr(" << bp.offset() << "~" << bp.length() << " no raw)";
+    return out;
+  }
+  out << "buffer::ptr(" << bp.offset() << "~" << bp.length();
+  out << " " << (void*)bp.c_str();
+  out << " in raw " << (void*)bp.raw_c_str();
+  out << " len " << bp.raw_length();
+  out << " nref " << bp.raw_nref() << ")";
+  return out;
+}
+
+// Print the length of a list followed by one line per node.
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::list& bl) {
+  out << "buffer::list(len=" << bl.length() << "," << std::endl;
+
+  for (const auto& node : bl.buffers()) {
+    out << "\t" << node;
+    const bool is_last = (&node == &bl.buffers().back());
+    if (!is_last) {
+      out << "," << std::endl;
+    }
+  }
+  out << std::endl << ")";
+  return out;
+}
+
+std::ostream& buffer::operator<<(std::ostream& out, const buffer::error& e)
+{
+  return out << e.what();
+}
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_malloc, buffer_raw_malloc,
+			      buffer_meta);  // one invocation per buffer::raw subclass, all naming buffer_meta; presumably ties each type's allocations to that mempool (confirm against the macro)
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_posix_aligned,
+			      buffer_raw_posix_aligned, buffer_meta);
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_char, buffer_raw_char, buffer_meta);
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_claimed_char, buffer_raw_claimed_char,
+			      buffer_meta);
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_unshareable, buffer_raw_unshareable,
+			      buffer_meta);
+MEMPOOL_DEFINE_OBJECT_FACTORY(buffer::raw_static, buffer_raw_static,
+			      buffer_meta);
+
diff --git a/src/common/buffer_seastar.cc b/src/common/buffer_seastar.cc
new file mode 100644
index 00000000..cd1e07df
--- /dev/null
+++ b/src/common/buffer_seastar.cc
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <seastar/core/sharded.hh>
+
+#include "include/buffer_raw.h"
+#include "buffer_seastar.h"
+
+// Shorthand for the seastar char buffer type used throughout this file.
+using temporary_buffer = seastar::temporary_buffer<char>;
+
+namespace ceph::buffer {
+
+// buffer::raw whose storage is a seastar temporary_buffer held through a
+// seastar::foreign_ptr.
+class raw_seastar_foreign_ptr : public raw {
+  seastar::foreign_ptr<temporary_buffer> ptr;
+ public:
+  // The raw base is pointed at buf's storage first; buf is then moved
+  // into the foreign_ptr member.
+  raw_seastar_foreign_ptr(temporary_buffer&& buf)
+    : raw(buf.get_write(), buf.size()), ptr(std::move(buf)) {}
+  // Hand back a newly created raw of the same length (via create(len)).
+  raw* clone_empty() override {
+    auto fresh = create(len);
+    return fresh.release();
+  }
+};
+
+// buffer::raw whose storage is a seastar temporary_buffer owned directly
+// by this object.
+class raw_seastar_local_ptr : public raw {
+  temporary_buffer buf;
+ public:
+  // The raw base is pointed at the incoming buffer's storage first; the
+  // buffer is then moved into the member of the same name.
+  raw_seastar_local_ptr(temporary_buffer&& buf)
+    : raw(buf.get_write(), buf.size()), buf(std::move(buf)) {}
+  // Hand back a newly created raw of the same length (via create(len)).
+  raw* clone_empty() override {
+    auto fresh = create(len);
+    return fresh.release();
+  }
+};
+
+inline namespace v14_2_0 {
+
+// Wrap buf in a heap-allocated raw_seastar_foreign_ptr and return it as a
+// plain raw*.
+raw* create_foreign(temporary_buffer&& buf) {
+  raw* wrapped = new raw_seastar_foreign_ptr(std::move(buf));
+  return wrapped;
+}
+
+// Wrap buf in a heap-allocated raw_seastar_local_ptr and return it as a
+// plain raw*.
+raw* create(temporary_buffer&& buf) {
+  raw* wrapped = new raw_seastar_local_ptr(std::move(buf));
+  return wrapped;
+}
+
+} // inline namespace v14_2_0
+
+// buffer::ptr conversions
+
+// lvalue conversion: the returned temporary_buffer views this ptr's bytes
+// and its deleter holds a copy of *this.
+ptr::operator seastar::temporary_buffer<char>() &
+{
+  auto data = c_str();
+  auto length = _len;
+  return {data, length, seastar::make_object_deleter(*this)};
+}
+
+// rvalue conversion: like the lvalue form, but *this is moved into the
+// deleter instead of copied.
+ptr::operator seastar::temporary_buffer<char>() &&
+{
+  // read the data pointer and length before *this is moved from
+  auto start = c_str();
+  auto nbytes = _len;
+  auto keeper = seastar::make_object_deleter(std::move(*this));
+  return {start, nbytes, std::move(keeper)};
+}
+
+// buffer::list conversions
+
+// Convert an rvalue buffer::list into a seastar packet, appending every
+// buffer node as one fragment; the list is cleared afterwards.
+list::operator seastar::net::packet() &&
+{
+  seastar::net::packet pkt;
+  pkt.reserve(_buffers.size());
+  for (auto& node : _buffers) {
+    // each node is moved in (converted to a temporary_buffer) and chained
+    // onto the packet built so far
+    pkt = seastar::net::packet(std::move(pkt), std::move(node));
+  }
+  clear();
+  return pkt;
+}
+
+} // namespace ceph::buffer
+
+namespace {
+
+using ceph::buffer::raw;
+// buffer::raw that keeps a share() of an existing temporary_buffer instead
+// of taking the buffer over.
+class raw_seastar_local_shared_ptr : public raw {
+  temporary_buffer buf;
+public:
+  // Point the raw base at buf's storage and hold on to buf.share().
+  raw_seastar_local_shared_ptr(temporary_buffer& buf)
+    : raw(buf.get_write(), buf.size()), buf(buf.share()) {}
+  // Hand back a newly created raw of the same length.
+  raw* clone_empty() override {
+    auto fresh = ceph::buffer::create(len);
+    return fresh.release();
+  }
+};
+}
+
+// Build a buffer::ptr of length `len` on top of a raw that shares the
+// iterator's underlying temporary_buffer.
+// NOTE(review): the raw is created over the whole of buf and the current
+// iterator position is neither consulted nor advanced here -- confirm that
+// callers expect this.
+buffer::ptr seastar_buffer_iterator::get_ptr(size_t len)
+{
+  buffer::raw* shared_raw = new raw_seastar_local_shared_ptr{buf};
+  buffer::ptr result{shared_raw};
+  result.set_length(len);
+  return result;
+}
+
+// Advance the iterator by `len` bytes and return a buffer::ptr holding a
+// copy of the bytes that were skipped over.
+buffer::ptr const_seastar_buffer_iterator::get_ptr(size_t len)
+{
+  const char* src = get_pos_add(len);
+  auto copied = buffer::copy(src, len);
+  return buffer::ptr{ std::move(copied) };
+}
diff --git a/src/common/buffer_seastar.h b/src/common/buffer_seastar.h
new file mode 100644
index 00000000..6a0e0cf6
--- /dev/null
+++ b/src/common/buffer_seastar.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <seastar/core/temporary_buffer.hh>
+#include "include/buffer.h"
+
+namespace details {
+
+// Cursor over the contiguous character range [first, last).  is_const
+// selects whether the cursor hands out `const char*` or `char*`.
+template<bool is_const>
+class buffer_iterator_impl {
+public:
+  using pointer = std::conditional_t<is_const, const char*, char *>;
+  buffer_iterator_impl(pointer first, const char* last)
+    : pos(first), end_ptr(last)
+  {}
+  // Return the current position and advance the cursor by n bytes.
+  // Throws buffer::end_of_buffer when fewer than n bytes remain; in that
+  // case the cursor is left where it was.
+  pointer get_pos_add(size_t n) {
+    // Compare against the remaining byte count *before* moving: forming a
+    // pointer beyond one-past-the-end of the range is undefined behavior,
+    // and a rejected request must not leave pos past end_ptr.
+    if (n > static_cast<size_t>(end_ptr - pos)) {
+      throw buffer::end_of_buffer{};
+    }
+    auto r = pos;
+    pos += n;
+    return r;
+  }
+  // Current position, without advancing.
+  pointer get() const {
+    return pos;
+  }
+protected:
+  pointer pos;
+  const char* end_ptr;
+};
+} // namespace details
+
+// Mutable cursor over a seastar temporary_buffer.  Inherits (privately)
+// from details::buffer_iterator_impl<false> and re-exports its pointer
+// type, get_pos_add() and get().
+class seastar_buffer_iterator : details::buffer_iterator_impl<false> {
+  using parent = details::buffer_iterator_impl<false>;
+  using temporary_buffer = seastar::temporary_buffer<char>;
+public:
+  // Iterate over [b.get_write(), b.end()).  Only a reference to b is
+  // stored, so b has to outlive this iterator.
+  seastar_buffer_iterator(temporary_buffer& b)
+    : parent(b.get_write(), b.end()), buf(b)
+  {}
+  using parent::pointer;
+  using parent::get_pos_add;
+  using parent::get;
+  // Declared here only; the definition is not part of this header.
+  ceph::buffer::ptr get_ptr(size_t len);
+
+private:
+  // keep the reference to buf around, so it can be "shared" by get_ptr()
+  temporary_buffer& buf;
+};
+
+// Read-only cursor over a seastar temporary_buffer.  Inherits (privately)
+// from details::buffer_iterator_impl<true> and re-exports its pointer
+// type, get_pos_add() and get().
+class const_seastar_buffer_iterator : details::buffer_iterator_impl<true> {
+  using parent = details::buffer_iterator_impl<true>;
+  using temporary_buffer = seastar::temporary_buffer<char>;
+public:
+  // Iterate over [b.get_write(), b.end()); unlike the mutable iterator no
+  // reference to b is kept.
+  const_seastar_buffer_iterator(temporary_buffer& b)
+    : parent(b.get_write(), b.end())
+  {}
+  using parent::pointer;
+  using parent::get_pos_add;
+  using parent::get;
+  // Declared here only; the definition is not part of this header.
+  ceph::buffer::ptr get_ptr(size_t len);
+};
diff --git a/src/common/ceph_argparse.cc b/src/common/ceph_argparse.cc
new file mode 100644
index 00000000..08e82424
--- /dev/null
+++ b/src/common/ceph_argparse.cc
@@ -0,0 +1,578 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include <stdarg.h>
+
+#include "auth/Auth.h"
+#include "common/ceph_argparse.h"
+#include "common/config.h"
+#include "common/version.h"
+#include "include/str_list.h"
+
+/*
+ * Ceph argument parsing library
+ *
+ * We probably should eventually replace this with something standard like popt.
+ * Until we do that, though, this file is the place for argv parsing
+ * stuff to live.
+ */
+
+#undef dout
+#undef pdout
+#undef derr
+#undef generic_dout
+#undef dendl
+
+// Small adapter around the strict_strto* helpers: converting an instance
+// to float, int or long long parses `str` with the matching helper and
+// hands `err` to it.
+struct strict_str_convert {
+  const char *str;
+  std::string *err;
+  strict_str_convert(const char *str,  std::string *err)
+    : str(str), err(err) {}
+
+  // floating point
+  operator float() const { return strict_strtof(str, err); }
+  // base-10 int
+  operator int() const { return strict_strtol(str, 10, err); }
+  // base-10 long long
+  operator long long() const { return strict_strtoll(str, 10, err); }
+};
+
+// Split argstr on whitespace and append every resulting word to args.
+void string_to_vec(std::vector<std::string>& args, std::string argstr)
+{
+  std::istringstream words(argstr);
+  // extraction fails (ending the loop) once no further word is available
+  for (std::string word; words >> word; ) {
+    args.push_back(word);
+  }
+}
+
+// Split args at the first standalone "--".  The first vector holds
+// everything before it, the second everything after it; the "--" itself is
+// in neither.  Without a "--" the second vector is empty.
+std::pair<std::vector<const char*>, std::vector<const char*>>
+split_dashdash(const std::vector<const char*>& args) {
+  const auto is_dashdash = [](const char* arg) {
+    return strcmp(arg, "--") == 0;
+  };
+  const auto sep = std::find_if(args.begin(), args.end(), is_dashdash);
+  std::vector<const char*> options(args.begin(), sep);
+  // step over the separator itself when there is one
+  const auto rest = (sep == args.end()) ? sep : sep + 1;
+  std::vector<const char*> arguments(rest, args.end());
+  return {std::move(options), std::move(arguments)};
+}
+
+// g_str_vec holds the words split out of the environment variable by
+// env_to_vec(); g_str_vec_lock is taken while it is populated or cleared.
+static std::mutex g_str_vec_lock;
+static vector<string> g_str_vec;
+
+// Drop the cached environment words held in g_str_vec.
+void clear_g_str_vec()
+{
+  // RAII guard instead of a manual lock()/unlock() pair, so the mutex is
+  // released on every exit path
+  std::lock_guard<std::mutex> guard(g_str_vec_lock);
+  g_str_vec.clear();
+}
+
+// Merge the arguments found in an environment variable into args.
+//
+// name is the environment variable to read; NULL selects "CEPH_ARGS".
+// Both args and the environment words are split at a standalone "--".
+// The result is: environment options, then the original options, and -- if
+// either side had anything after its "--" -- a "--" followed by the
+// environment arguments and then the original arguments.
+//
+// If the cache is empty and the variable is unset, args is left untouched.
+// The pointers added to args refer to strings owned by g_str_vec.
+void env_to_vec(std::vector<const char*>& args, const char *name)
+{
+  if (!name)
+    name = "CEPH_ARGS";
+
+  auto [options, arguments] = split_dashdash(args);
+
+  std::vector<const char*> env;
+  {
+    // The guard covers both populating g_str_vec and walking it, so the
+    // walk cannot overlap a concurrent clear_g_str_vec(); it is also
+    // released automatically on the early return below.
+    std::lock_guard<std::mutex> guard(g_str_vec_lock);
+
+    /*
+     * We can only populate str_vec once. Other threads could hold pointers
+     * into it, so clearing it out and replacing it is not currently safe.
+     */
+    if (g_str_vec.empty()) {
+      char *p = getenv(name);
+      if (!p) {
+	return;
+      }
+      get_str_vec(p, " ", g_str_vec);
+    }
+
+    env.reserve(g_str_vec.size());
+    for (const auto& s : g_str_vec) {
+      env.push_back(s.c_str());
+    }
+  }
+  auto [env_options, env_arguments] = split_dashdash(env);
+
+  args.clear();
+  args.insert(args.end(), env_options.begin(), env_options.end());
+  args.insert(args.end(), options.begin(), options.end());
+  if (arguments.empty() && env_arguments.empty()) {
+    return;
+  }
+  args.push_back("--");
+  args.insert(args.end(), env_arguments.begin(), env_arguments.end());
+  args.insert(args.end(), arguments.begin(), arguments.end());
+}
+
+// Append argv[1] .. argv[argc-1] (everything except the program name) to
+// args.  Nothing is appended when there are no such entries.
+void argv_to_vec(int argc, const char **argv,
+                 std::vector<const char*>& args)
+{
+  // with argc < 1 the range [argv + 1, argv + argc) would be invalid
+  if (argc <= 1 || argv == nullptr) {
+    return;
+  }
+  args.insert(args.end(), argv + 1, argv + argc);
+}
+
+// Build a malloc()ed argv-style array from argv0 followed by args.
+//
+// On return *argc is args.size() + 1 and (*argv)[0 .. *argc-1] hold argv0
+// and the entries of args (the strings themselves are not copied).
+// (*argv)[*argc] is a terminating null pointer, as argv consumers
+// conventionally expect.  The caller owns the array and releases it with
+// free().  Throws std::bad_alloc if the allocation fails.
+void vec_to_argv(const char *argv0, std::vector<const char*>& args,
+                 int *argc, const char ***argv)
+{
+  // one slot for argv0, one per argument, one for the terminating NULL
+  *argv = (const char**)malloc(sizeof(char*) * (args.size() + 2));
+  if (!*argv)
+    throw std::bad_alloc();
+  *argc = 1;
+  (*argv)[0] = argv0;
+
+  for (const char *arg : args)
+    (*argv)[(*argc)++] = arg;
+  (*argv)[*argc] = nullptr;
+}
+
+// Classify the string that follows an option on the command line.
+//
+// *bool_numeric is set to true when the string consists of digits with an
+// optional leading '-' (only if at least one more character follows) and
+// at most one '.'; an empty string also counts as numeric.
+// *bool_option is set to true when the string starts with "--", or starts
+// with '-' and is not numeric.
+// A NULL string leaves both outputs untouched.
+void ceph_arg_value_type(const char * nextargstr, bool *bool_option, bool *bool_numeric)
+{
+  if (nextargstr == NULL) {
+    return;
+  }
+
+  const size_t len = strlen(nextargstr);
+  bool looks_like_option =
+    (len >= 2) && (nextargstr[0] == '-') && (nextargstr[1] == '-');
+  bool numeric = true;
+  bool seen_dot = false;
+
+  for (size_t pos = 0; pos < len; ++pos) {
+    const char ch = nextargstr[pos];
+    if (ch >= '0' && ch <= '9') {
+      continue;
+    }
+    // May be negative numeral value
+    if (pos == 0 && len >= 2 && ch == '-') {
+      continue;
+    }
+    // a single decimal point is tolerated
+    if (ch == '.' && !seen_dot) {
+      seen_dot = true;
+      continue;
+    }
+    numeric = false;
+    break;
+  }
+
+  // -<option>
+  if (nextargstr[0] == '-' && !numeric) {
+    looks_like_option = true;
+  }
+
+  *bool_option = looks_like_option;
+  *bool_numeric = numeric;
+}
+
+
+// Parse a list of addresses / address vectors from s and append one
+// entity_addrvec_t per parsed element to vec.
+//
+// s is first split on ' ' and ';'.  Within each piece, elements are parsed
+// one after another: a single entity_addr_t (using `type`) is tried first,
+// and an entity_addrvec_t otherwise; a ',' directly after an element is
+// skipped.  Returns false as soon as an element parses as neither
+// (elements appended before that point stay in vec), true otherwise.
+bool parse_ip_port_vec(const char *s, vector<entity_addrvec_t>& vec, int type)
+{
+  // first split by [ ;], which are not valid for an addrvec
+  list<string> pieces;
+  get_str_list(s, " ;", pieces);
+
+  for (auto& piece : pieces) {
+    const char *cur = piece.c_str();
+    while (*cur) {
+      const char *end;
+
+      entity_addr_t addr;
+      if (addr.parse(cur, &end, type)) {
+	// plain address: wrap it in a one-element addrvec
+	vec.push_back(entity_addrvec_t(addr));
+      } else {
+	// ok, try parsing as an addrvec
+	entity_addrvec_t addrs;
+	if (!addrs.parse(cur, &end)) {
+	  return false;
+	}
+	vec.push_back(addrs);
+      }
+
+      cur = end;
+      if (*cur == ',') {
+	++cur;
+      }
+    }
+  }
+  return true;
+}
+
+// Default CephInitParameters: remember the module type and start out with
+// an entity name of that type and the id "admin".
+CephInitParameters::CephInitParameters(uint32_t module_type_)
+  : module_type(module_type_)
+{
+  const char *default_id = "admin";
+  name.set(module_type, default_id);
+}
+
+// Copy input to output, turning '-' into '_' in the option-name part.
+//
+// The first two characters are copied untouched (so a leading "--" or
+// "-x" survives), and everything from the first '=' after them onwards is
+// copied verbatim as well.  output must have room for strlen(input) + 1
+// bytes and is always NUL terminated.
+static void dashes_to_underscores(const char *input, char *output)
+{
+  const char *src = input;
+  char *dst = output;
+
+  // first two characters are copied as-is
+  for (int k = 0; k < 2; ++k) {
+    const char ch = *src++;
+    *dst++ = ch;
+    if (ch == '\0')
+      return;
+  }
+
+  while (*src != '\0') {
+    if (*src == '=') {
+      // the value part keeps its dashes
+      strcpy(dst, src);
+      return;
+    }
+    *dst++ = (*src == '-') ? '_' : *src;
+    ++src;
+  }
+  *dst = '\0';
+}
+
+/** If *i is a standalone double dash, '--', erase it from args (i then
+ * refers to the element that followed it) and return true so the caller
+ * can stop looking for options and flags.  Otherwise leave everything
+ * untouched and return false. */
+bool ceph_argparse_double_dash(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i)
+{
+  const bool is_double_dash = (strcmp(*i, "--") == 0);
+  if (!is_double_dash) {
+    return false;
+  }
+  i = args.erase(i);
+  return true;
+}
+
+// Test whether the argument at *i is one of the given flags.
+//
+// The variadic arguments are flag spellings (const char*) terminated by a
+// (char*)NULL.  Dashes and underscores in option names compare equal (both
+// sides go through dashes_to_underscores()).  On a match the argument is
+// erased from args, i refers to the following element, and true is
+// returned; otherwise args is untouched and false is returned.
+//
+// NOTE(review): va_start() is applied to the reference parameter `i`,
+// which the C++ standard leaves undefined; that cannot be changed without
+// altering the signature.
+bool ceph_argparse_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, ...)
+{
+  // Heap-backed scratch buffers instead of variable-length arrays: VLAs
+  // are a compiler extension in C++ and would put argv-sized data on the
+  // stack.
+  std::string first(strlen(*i) + 1, '\0');
+  dashes_to_underscores(*i, &first[0]);
+  va_list ap;
+
+  va_start(ap, i);
+  while (1) {
+    const char *a = va_arg(ap, char*);
+    if (a == NULL) {
+      va_end(ap);
+      return false;
+    }
+    std::string a2(strlen(a) + 1, '\0');
+    dashes_to_underscores(a, &a2[0]);
+    if (strcmp(a2.c_str(), first.c_str()) == 0) {
+      i = args.erase(i);
+      va_end(ap);
+      return true;
+    }
+  }
+}
+
+// va_list worker behind ceph_argparse_binary_flag().
+//
+// ap yields candidate option spellings (const char*) ending with NULL.
+// If the argument at *i is "<opt>" or "<opt>=<value>" for one of them
+// (dashes and underscores in the name compare equal) the argument is
+// erased from args and true is returned with:
+//   *ret = 1        for a bare "<opt>", "=true" or "=1"
+//   *ret = 0        for "=false" or "=0"
+//   *ret = -EINVAL  for any other value (an explanation is written to
+//                   *oss when oss is non-null)
+// Returns false, leaving args untouched, when nothing matches.
+static bool va_ceph_argparse_binary_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream *oss, va_list ap)
+{
+  // Heap-backed scratch buffers instead of variable-length arrays: VLAs
+  // are a compiler extension in C++ and would put argv-sized data on the
+  // stack.
+  std::string first_buf(strlen(*i) + 1, '\0');
+  dashes_to_underscores(*i, &first_buf[0]);
+  const char *first = first_buf.c_str();
+
+  // does this argument match any of the possibilities?
+  while (1) {
+    const char *a = va_arg(ap, char*);
+    if (a == NULL)
+      return false;
+    int strlen_a = strlen(a);
+    std::string a2_buf(strlen_a + 1, '\0');
+    dashes_to_underscores(a, &a2_buf[0]);
+    const char *a2 = a2_buf.c_str();
+    if (strncmp(a2, first, strlen(a2)) == 0) {
+      if (first[strlen_a] == '=') {
+	i = args.erase(i);
+	const char *val = first + strlen_a + 1;
+	if ((strcmp(val, "true") == 0) || (strcmp(val, "1") == 0)) {
+	  *ret = 1;
+	  return true;
+	}
+	else if ((strcmp(val, "false") == 0) || (strcmp(val, "0") == 0)) {
+	  *ret = 0;
+	  return true;
+	}
+	if (oss) {
+	  (*oss) << "Parse error parsing binary flag  " << a
+	         << ". Expected true or false, but got '" << val << "'\n";
+	}
+	*ret = -EINVAL;
+	return true;
+      }
+      else if (first[strlen_a] == '\0') {
+	// bare flag without a value means "true"
+	i = args.erase(i);
+	*ret = 1;
+	return true;
+      }
+    }
+  }
+}
+
+// Variadic front end for va_ceph_argparse_binary_flag(): the trailing
+// arguments are the accepted option spellings, terminated by (char*)NULL.
+// Returns what the worker returns.
+bool ceph_argparse_binary_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream *oss, ...)
+{
+  va_list ap;
+  va_start(ap, oss);
+  const bool matched = va_ceph_argparse_binary_flag(args, i, ret, oss, ap);
+  va_end(ap);
+  return matched;
+}
+
+// va_list worker behind the ceph_argparse_witharg() overloads.
+//
+// ap yields candidate option spellings (const char*) ending with NULL;
+// dashes and underscores in the option name compare equal.
+// Returns:
+//   0        the argument at *i is none of the options; nothing changed
+//   1        matched; *ret holds the value, taken from "<opt>=<value>" or
+//            from the following argument, and the consumed argument(s)
+//            were erased from args
+//   -EINVAL  matched "<opt>" as the last argument, so no value is
+//            available; a message was written to oss and the option was
+//            erased from args
+static int va_ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, va_list ap)
+{
+  // Heap-backed scratch buffers instead of variable-length arrays: VLAs
+  // are a compiler extension in C++ and would put argv-sized data on the
+  // stack.
+  std::string first_buf(strlen(*i) + 1, '\0');
+  dashes_to_underscores(*i, &first_buf[0]);
+  const char *first = first_buf.c_str();
+
+  // does this argument match any of the possibilities?
+  while (1) {
+    const char *a = va_arg(ap, char*);
+    if (a == NULL)
+      return 0;
+    int strlen_a = strlen(a);
+    std::string a2_buf(strlen_a + 1, '\0');
+    dashes_to_underscores(a, &a2_buf[0]);
+    const char *a2 = a2_buf.c_str();
+    if (strncmp(a2, first, strlen(a2)) == 0) {
+      if (first[strlen_a] == '=') {
+	*ret = first + strlen_a + 1;
+	i = args.erase(i);
+	return 1;
+      }
+      else if (first[strlen_a] == '\0') {
+	// find second part (or not)
+	if (i+1 == args.end()) {
+	  oss << "Option " << *i << " requires an argument." << std::endl;
+	  i = args.erase(i);
+	  return -EINVAL;
+	}
+	i = args.erase(i);
+	*ret = *i;
+	i = args.erase(i);
+	return 1;
+      }
+    }
+  }
+}
+
+// Numeric flavour of ceph_argparse_witharg().
+//
+// Returns false when the argument at *i is none of the given options
+// (spellings passed variadically, ending with (char*)NULL).  Returns true
+// whenever an option matched -- including the failure cases:
+//   - the option had no value: the worker already wrote to oss;
+//   - the value looks like another option or is not numeric: *ret is set
+//     to EXIT_FAILURE and a message is written to oss;
+//   - strict conversion reported an error: the error text is written to
+//     oss (*ret still receives the converted value).
+template<class T>
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, T *ret,
+	std::ostream &oss, ...)
+{
+  std::string str;
+  va_list ap;
+  va_start(ap, oss);
+  const int r = va_ceph_argparse_witharg(args, i, &str, oss, ap);
+  va_end(ap);
+  if (r == 0) {
+    return false;
+  }
+  if (r < 0) {
+    return true;
+  }
+
+  bool is_option = false;
+  bool is_numeric = true;
+  ceph_arg_value_type(str.c_str(), &is_option, &is_numeric);
+  if (is_option || !is_numeric) {
+    *ret = EXIT_FAILURE;
+    if (is_option) {
+      oss << "Missing option value";
+    } else {
+      oss << "The option value '" << str << "' is invalid";
+    }
+    return true;
+  }
+
+  std::string err;
+  T converted = strict_str_convert(str.c_str(), &err);
+  *ret = converted;
+  if (!err.empty()) {
+    oss << err;
+  }
+  return true;
+}
+
+// Explicit instantiations of the numeric ceph_argparse_witharg<T>
+// template for the three supported value types: int, long long and float.
+template bool ceph_argparse_witharg<int>(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream &oss, ...);
+
+template bool ceph_argparse_witharg<long long>(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, long long *ret,
+	std::ostream &oss, ...);
+
+template bool ceph_argparse_witharg<float>(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, float *ret,
+	std::ostream &oss, ...);
+
+// String flavour of ceph_argparse_witharg() that reports problems to oss.
+// Returns true when one of the given options (variadic, ending with
+// (char*)NULL) matched -- also when it matched but had no value -- and
+// false when the argument at *i is something else.
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, ...)
+{
+  va_list ap;
+  va_start(ap, oss);
+  const int r = va_ceph_argparse_witharg(args, i, ret, oss, ap);
+  va_end(ap);
+  return r != 0;
+}
+
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret, ...)
+{
+  int r;
+  va_list ap;
+  va_start(ap, ret);
+  r = va_ceph_argparse_witharg(args, i, ret, cerr, ap);
+  va_end(ap);
+  if (r < 0)
+    _exit(1);
+  return r != 0;
+}
+
+// First pass over the command line: consume the options that are needed
+// before the configuration can be read, and leave everything else in args.
+//
+// Recognized (and erased from args):
+//   --version/-v   print the version and _exit(0)
+//   --conf/-c      stored in *conf_file_list
+//   --cluster      stored in *cluster
+//   -i             entity id (not for CEPH_ENTITY_TYPE_CLIENT)
+//   --id/--user    entity id
+//   --name/-n      full entity name; _exit(1) if it does not parse
+//   --show_args    print the original argument list
+// Parsing stops at a standalone "--", which is left in place.
+// Returns the init parameters built from module_type and the options seen.
+CephInitParameters ceph_argparse_early_args
+	  (std::vector<const char*>& args, uint32_t module_type,
+	   std::string *cluster, std::string *conf_file_list)
+{
+  CephInitParameters iparams(module_type);
+  std::string val;
+
+  // untouched copy of the arguments, only needed for --show_args
+  const std::vector<const char *> orig_args = args;
+
+  auto i = args.begin();
+  while (i != args.end()) {
+    if (strcmp(*i, "--") == 0) {
+      /* Normally we would use ceph_argparse_double_dash. However, in this
+       * function we *don't* want to remove the double dash, because later
+       * argument parses will still need to see it. */
+      break;
+    }
+    else if (ceph_argparse_flag(args, i, "--version", "-v", (char*)NULL)) {
+      cout << pretty_version_to_str() << std::endl;
+      _exit(0);
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--conf", "-c", (char*)NULL)) {
+      *conf_file_list = val;
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--cluster", (char*)NULL)) {
+      *cluster = val;
+    }
+    else if ((module_type != CEPH_ENTITY_TYPE_CLIENT) &&
+	     (ceph_argparse_witharg(args, i, &val, "-i", (char*)NULL))) {
+      iparams.name.set_id(val);
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--id", "--user", (char*)NULL)) {
+      iparams.name.set_id(val);
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--name", "-n", (char*)NULL)) {
+      if (!iparams.name.from_str(val)) {
+	cerr << "error parsing '" << val << "': expected string of the form TYPE.ID, "
+	     << "valid types are: " << EntityName::get_valid_types_as_str()
+	     << std::endl;
+	_exit(1);
+      }
+    }
+    else if (ceph_argparse_flag(args, i, "--show_args", (char*)NULL)) {
+      cout << "args: ";
+      bool need_separator = false;
+      for (const char *arg : orig_args) {
+	if (need_separator)
+	  cout << " ";
+	cout << arg;
+	need_separator = true;
+      }
+      cout << std::endl;
+    }
+    else {
+      // not ours: skip over it
+      ++i;
+    }
+  }
+  return iparams;
+}
+
+// Print the usage text for the options shared by all ceph programs to
+// cout.  Servers (is_server == true) additionally get the "-i" spelling of
+// --id and the -d / -f / --debug_ms lines.
+static void generic_usage(bool is_server)
+{
+  const char *id_line = is_server ?
+    "  --id/-i ID        set ID portion of my name" :
+    "  --id ID           set ID portion of my name";
+
+  cout << "  --conf/-c FILE    read configuration from the given configuration file" << std::endl;
+  cout << id_line << std::endl;
+  cout << "  --name/-n TYPE.ID set name" << std::endl;
+  cout << "  --cluster NAME    set cluster name (default: ceph)" << std::endl;
+  cout << "  --setuser USER    set uid to user or uid (and gid to user's gid)" << std::endl;
+  cout << "  --setgroup GROUP  set gid to group or gid" << std::endl;
+  cout << "  --version         show version and quit" << std::endl;
+  cout << std::endl;
+
+  if (is_server) {
+    cout << "  -d                run in foreground, log to stderr" << std::endl;
+    cout << "  -f                run in foreground, log to usual location" << std::endl;
+    cout << std::endl;
+    cout << "  --debug_ms N      set message debug level (e.g. 1)" << std::endl;
+  }
+
+  cout.flush();
+}
+
+// Decide whether the usage text should be shown: true when there are no
+// arguments at all, or when any argument is exactly "-h" or "--help".
+bool ceph_argparse_need_usage(const std::vector<const char*>& args)
+{
+  if (args.empty()) {
+    return true;
+  }
+  const auto asks_for_help = [](const char* arg) {
+    return strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0;
+  };
+  return std::any_of(args.begin(), args.end(), asks_for_help);
+}
+
+// Print the generic usage text in its server variant.
+void generic_server_usage()
+{
+  const bool is_server = true;
+  generic_usage(is_server);
+}
+
+// Print the generic usage text in its client variant.
+void generic_client_usage()
+{
+  const bool is_server = false;
+  generic_usage(is_server);
+}
diff --git a/src/common/ceph_argparse.h b/src/common/ceph_argparse.h
new file mode 100644
index 00000000..214b3dfb
--- /dev/null
+++ b/src/common/ceph_argparse.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ARGPARSE_H
+#define CEPH_ARGPARSE_H
+
+/*
+ * Ceph argument parsing library
+ *
+ * We probably should eventually replace this with something standard like popt.
+ * Until we do that, though, this file is the place for argv parsing
+ * stuff to live.
+ */
+
+#include <string>
+#include <vector>
+
+#include "common/entity_name.h"
+
+/////////////////////// Types ///////////////////////
+// Parameters determined by early argument parsing: the module (entity)
+// type and the entity name.
+class CephInitParameters
+{
+public:
+  // Stores module_type_ and initializes name with that type and the id
+  // "admin".
+  explicit CephInitParameters(uint32_t module_type_);
+  // NOTE(review): no definition is visible alongside the constructor in
+  // ceph_argparse.cc -- confirm this is still implemented somewhere.
+  std::list<std::string> get_conf_files() const;
+
+  // entity type passed to the constructor
+  uint32_t module_type;
+  // entity name; ceph_argparse_early_args() updates it from -i/--id/--name
+  EntityName name;
+};
+
+/////////////////////// Functions ///////////////////////
+// Split argstr on whitespace and append the words to args.
+extern void string_to_vec(std::vector<std::string>& args, std::string argstr);
+// Drop the environment words cached by env_to_vec().
+extern void clear_g_str_vec();
+// Merge the arguments from environment variable `name` (NULL selects
+// "CEPH_ARGS") into args.
+extern void env_to_vec(std::vector<const char*>& args, const char *name=NULL);
+// Append argv[1..argc-1] to args.
+extern void argv_to_vec(int argc, const char **argv,
+                 std::vector<const char*>& args);
+// Build a malloc()ed argv array from argv0 followed by args.
+extern void vec_to_argv(const char *argv0, std::vector<const char*>& args,
+			int *argc, const char ***argv);
+
+// Parse a ' '/';'-separated list of addresses or address vectors from s
+// into vec; returns false on a parse error.
+extern bool parse_ip_port_vec(const char *s, std::vector<entity_addrvec_t>& vec,
+			      int type=0);
+// If *i is "--", erase it from args and return true.
+bool ceph_argparse_double_dash(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i);
+// For all variadic functions below, the trailing arguments are the
+// accepted option spellings (const char*), terminated by (char*)NULL.
+// A matched argument is erased from args.
+//
+// Match a flag that takes no value.
+bool ceph_argparse_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, ...);
+// Match an option with a string value; problems are written to oss.
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret,
+	std::ostream &oss, ...);
+// Match an option with a string value; problems go to cerr and a missing
+// value ends the process.
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, std::string *ret, ...);
+// Match an option with a numeric value (instantiated for int, long long
+// and float in ceph_argparse.cc).
+template<class T>
+bool ceph_argparse_witharg(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, T *ret,
+	std::ostream &oss, ...);
+// Match a boolean option ("<opt>", "<opt>=true|1", "<opt>=false|0");
+// *ret receives 1, 0 or -EINVAL.
+bool ceph_argparse_binary_flag(std::vector<const char*> &args,
+	std::vector<const char*>::iterator &i, int *ret,
+	std::ostream *oss, ...);
+// Consume the options needed before the configuration is read
+// (--conf, --cluster, --id, --name, ...) and return the resulting
+// CephInitParameters.
+extern CephInitParameters ceph_argparse_early_args
+	    (std::vector<const char*>& args, uint32_t module_type,
+	     std::string *cluster, std::string *conf_file_list);
+// True when args is empty or contains "-h" / "--help".
+extern bool ceph_argparse_need_usage(const std::vector<const char*>& args);
+// Print the usage text for the generic options (server / client variant).
+extern void generic_server_usage();
+extern void generic_client_usage();
+
+#endif
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
new file mode 100644
index 00000000..545e7725
--- /dev/null
+++ b/src/common/ceph_context.cc
@@ -0,0 +1,966 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/ceph_context.h"
+
+#include <mutex>
+#include <iostream>
+
+#include <pthread.h>
+
+#include <boost/algorithm/string.hpp>
+
+#include "include/mempool.h"
+#include "common/admin_socket.h"
+#include "common/code_environment.h"
+#include "common/ceph_mutex.h"
+#include "common/debug.h"
+#include "common/config.h"
+#include "common/ceph_crypto.h"
+#include "common/HeartbeatMap.h"
+#include "common/errno.h"
+#include "common/Graylog.h"
+
+#include "log/Log.h"
+
+#include "auth/Crypto.h"
+#include "include/str_list.h"
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "common/PluginRegistry.h"
+#include "common/valgrind.h"
+#include "include/spinlock.h"
+#ifndef WITH_SEASTAR
+#include "mon/MonMap.h"
+#endif
+
+using ceph::bufferlist;
+using ceph::HeartbeatMap;
+
+// for CINIT_FLAGS
+#include "common/common_init.h"
+
+#include <iostream>
+#include <pthread.h>
+
+#ifdef WITH_SEASTAR
+// Seastar build: initialize the context from ceph::common::local_conf()
+// and ceph::common::local_perf_coll(), and create its CryptoRandom.
+CephContext::CephContext()
+  : _conf{ceph::common::local_conf()},
+    _perf_counters_collection{ceph::common::local_perf_coll()},
+    _crypto_random{std::make_unique<CryptoRandom>()}
+{}
+
+// define the dtor in .cc as CryptoRandom is an incomplete type in the header
+// (the body is intentionally empty)
+CephContext::~CephContext()
+{}
+
+// Seastar build: the module type is always reported as
+// CEPH_ENTITY_TYPE_OSD.
+uint32_t CephContext::get_module_type() const
+{
+  return CEPH_ENTITY_TYPE_OSD;
+}
+
+// Non-owning pointer to the CryptoRandom created in the constructor.
+CryptoRandom* CephContext::random() const
+{
+  return _crypto_random.get();
+}
+
+// Take a reference: increments nref and returns this.  Pair with put().
+CephContext* CephContext::get()
+{
+  ++nref;
+  return this;
+}
+
+// Drop a reference: decrements nref and deletes the context once the
+// count reaches zero.
+void CephContext::put()
+{
+  if (--nref != 0) {
+    return;
+  }
+  delete this;
+}
+
+// Return the perf counter collection obtained from
+// _perf_counters_collection.
+PerfCountersCollectionImpl* CephContext::get_perfcounters_collection()
+{
+  return _perf_counters_collection.get_perf_collection();
+}
+
+#else  // WITH_SEASTAR
+namespace {
+
+// Config observer for the "lockdep" option: registers the CephContext
+// with lockdep while the option is on and unregisters it when the option
+// is turned off or the observer is destroyed.
+class LockdepObs : public md_config_obs_t {
+public:
+  explicit LockdepObs(CephContext *cct)
+    : m_cct(cct), m_registered(false), lock(ceph::make_mutex("lock_dep_obs")) {
+  }
+  ~LockdepObs() override {
+    if (!m_registered) {
+      return;
+    }
+    lockdep_unregister_ceph_context(m_cct);
+  }
+
+  const char** get_tracked_conf_keys() const override {
+    static const char *KEYS[] = {"lockdep", NULL};
+    return KEYS;
+  }
+
+  // Bring the registration state in line with the current value of
+  // conf->lockdep; nothing happens when they already agree.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    std::unique_lock locker(lock);
+    const bool wanted = conf->lockdep;
+    if (wanted == m_registered) {
+      return;
+    }
+    if (wanted) {
+      lockdep_register_ceph_context(m_cct);
+    } else {
+      lockdep_unregister_ceph_context(m_cct);
+    }
+    m_registered = wanted;
+  }
+private:
+  CephContext *m_cct;
+  bool m_registered;
+  ceph::mutex lock;
+};
+
+// Observer for "mempool_debug" that also serves the "dump_mempools" admin
+// socket command.  It registers itself as config observer and command
+// hook on construction and undoes both on destruction.
+class MempoolObs : public md_config_obs_t,
+		  public AdminSocketHook {
+  CephContext *cct;
+  ceph::mutex lock;
+
+public:
+  explicit MempoolObs(CephContext *cct)
+    : cct(cct), lock(ceph::make_mutex("mem_pool_obs")) {
+    cct->_conf.add_observer(this);
+    int r = cct->get_admin_socket()->register_command(
+      "dump_mempools",
+      "dump_mempools",
+      this,
+      "get mempool stats");
+    ceph_assert(r == 0);
+  }
+  ~MempoolObs() override {
+    cct->_conf.remove_observer(this);
+    cct->get_admin_socket()->unregister_command("dump_mempools");
+  }
+
+  // md_config_obs_t
+  const char** get_tracked_conf_keys() const override {
+    static const char *KEYS[] = {
+      "mempool_debug",
+      NULL
+    };
+    return KEYS;
+  }
+
+  // Push a changed mempool_debug setting into the mempool code.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    std::unique_lock locker(lock);
+    if (changed.find("mempool_debug") == changed.end()) {
+      return;
+    }
+    mempool::set_debug_mode(cct->_conf->mempool_debug);
+  }
+
+  // AdminSocketHook
+  // Handles "dump_mempools" by dumping the mempool stats into `out` using
+  // the requested format; any other command is rejected with false.
+  // NOTE(review): the Formatter returned by Formatter::create() is used
+  // without a null check -- confirm it cannot be null for the formats
+  // that can reach this hook.
+  bool call(std::string_view command, const cmdmap_t& cmdmap,
+	    std::string_view format, bufferlist& out) override {
+    if (command != "dump_mempools") {
+      return false;
+    }
+    std::unique_ptr<Formatter> f(Formatter::create(format));
+    f->open_object_section("mempools");
+    mempool::dump(f.get());
+    f->close_section();
+    f->flush(out);
+    return true;
+  }
+};
+
+} // anonymous namespace
+
+// Background thread of a CephContext.  Each time it wakes up it reopens
+// the log file if that was requested, lets the heartbeat map check its
+// touch file, and refreshes the context's perf values.
+class CephContextServiceThread : public Thread
+{
+public:
+  explicit CephContextServiceThread(CephContext *cct)
+    : _reopen_logs(false), _exit_thread(false), _cct(cct)
+  {
+  }
+
+  ~CephContextServiceThread() override {}
+
+  // Thread body: loops until exit_thread() has been called.
+  void *entry() override
+  {
+    while (1) {
+      std::unique_lock l(_lock);
+      if (_exit_thread) {
+        break;
+      }
+
+      // sleep for one heartbeat interval, or until notified when no
+      // interval is configured
+      if (_cct->_conf->heartbeat_interval) {
+        auto interval = ceph::make_timespan(_cct->_conf->heartbeat_interval);
+        _cond.wait_for(l, interval);
+      } else
+        _cond.wait(l);
+
+      // exit may have been requested while we were waiting
+      if (_exit_thread) {
+        break;
+      }
+
+      if (_reopen_logs) {
+        _cct->_log->reopen_log_file();
+        _reopen_logs = false;
+      }
+      _cct->_heartbeat_map->check_touch_file();
+
+      // refresh the perf counters
+      _cct->_refresh_perf_values();
+    }
+    return NULL;
+  }
+
+  // Ask the thread to reopen the log file and wake it up.
+  void reopen_logs()
+  {
+    std::lock_guard l(_lock);
+    _reopen_logs = true;
+    _cond.notify_all();
+  }
+
+  // Ask the thread to leave its loop and wake it up.
+  void exit_thread()
+  {
+    std::lock_guard l(_lock);
+    _exit_thread = true;
+    _cond.notify_all();
+  }
+
+private:
+  // _lock is held whenever _reopen_logs / _exit_thread are read or written
+  ceph::mutex _lock = ceph::make_mutex("CephContextServiceThread::_lock");
+  ceph::condition_variable _cond;
+  bool _reopen_logs;
+  bool _exit_thread;
+  CephContext *_cct;
+};
+
+
+/**
+ * observe logging config changes
+ *
+ * The logging subsystem sits below most of the ceph code, including
+ * the config subsystem, to keep it simple and self-contained.  Feed
+ * logging-related config changes to the log.
+ */
+// Config observer that forwards logging-related option changes to a
+// ceph::logging::Log instance.
+class LogObs : public md_config_obs_t {
+  ceph::logging::Log *log;
+  ceph::mutex lock;
+
+public:
+  explicit LogObs(ceph::logging::Log *l)
+    : log(l), lock(ceph::make_mutex("log_obs")) {
+  }
+
+  const char** get_tracked_conf_keys() const override {
+    static const char *KEYS[] = {
+      "log_file",
+      "log_max_new",
+      "log_max_recent",
+      "log_to_file",
+      "log_to_syslog",
+      "err_to_syslog",
+      "log_stderr_prefix",
+      "log_to_stderr",
+      "err_to_stderr",
+      "log_to_graylog",
+      "err_to_graylog",
+      "log_graylog_host",
+      "log_graylog_port",
+      "log_coarse_timestamps",
+      "fsid",
+      "host",
+      NULL
+    };
+    return KEYS;
+  }
+
+  // Apply every option named in `changed` to the log.  For the stderr,
+  // syslog and graylog sinks the level is 99 when log_to_* is set, -1 when
+  // only err_to_* is set, and -2 otherwise.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    std::unique_lock locker(lock);
+    // stderr
+    if (changed.count("log_to_stderr") || changed.count("err_to_stderr")) {
+      int l = conf->log_to_stderr ? 99 : (conf->err_to_stderr ? -1 : -2);
+      log->set_stderr_level(l, l);
+    }
+
+    // syslog
+    // err_to_syslog feeds into the level as well (and is a tracked key),
+    // so a change to it alone has to be applied too -- same as for the
+    // stderr and graylog sinks.
+    if (changed.count("log_to_syslog") || changed.count("err_to_syslog")) {
+      int l = conf->log_to_syslog ? 99 : (conf->err_to_syslog ? -1 : -2);
+      log->set_syslog_level(l, l);
+    }
+
+    // file
+    if (changed.count("log_file") ||
+	changed.count("log_to_file")) {
+      if (conf->log_to_file) {
+	log->set_log_file(conf->log_file);
+      } else {
+	log->set_log_file({});
+      }
+      log->reopen_log_file();
+    }
+
+    if (changed.count("log_stderr_prefix")) {
+      log->set_log_stderr_prefix(conf.get_val<string>("log_stderr_prefix"));
+    }
+
+    if (changed.count("log_max_new")) {
+      log->set_max_new(conf->log_max_new);
+    }
+
+    if (changed.count("log_max_recent")) {
+      log->set_max_recent(conf->log_max_recent);
+    }
+
+    // graylog
+    if (changed.count("log_to_graylog") || changed.count("err_to_graylog")) {
+      int l = conf->log_to_graylog ? 99 : (conf->err_to_graylog ? -1 : -2);
+      log->set_graylog_level(l, l);
+
+      if (conf->log_to_graylog || conf->err_to_graylog) {
+	log->start_graylog();
+      } else {
+	// neither option is set any more
+	log->stop_graylog();
+      }
+    }
+
+    if (log->graylog() && (changed.count("log_graylog_host") || changed.count("log_graylog_port"))) {
+      log->graylog()->set_destination(conf->log_graylog_host, conf->log_graylog_port);
+    }
+
+    if (changed.find("log_coarse_timestamps") != changed.end()) {
+      log->set_coarse_timestamps(conf.get_val<bool>("log_coarse_timestamps"));
+    }
+
+    // metadata
+    if (log->graylog() && changed.count("host")) {
+      log->graylog()->set_hostname(conf->host);
+    }
+
+    if (log->graylog() && changed.count("fsid")) {
+      log->graylog()->set_fsid(conf.get_val<uuid_d>("fsid"));
+    }
+  }
+};
+
+
+// cct config watcher
+// Config observer that keeps two pieces of CephContext state in sync with
+// the configuration: the set of enabled experimental features and the
+// crush location.
+class CephContextObs : public md_config_obs_t {
+  CephContext *cct;
+
+public:
+  explicit CephContextObs(CephContext *cct) : cct(cct) {}
+
+  const char** get_tracked_conf_keys() const override {
+    static const char *KEYS[] = {
+      "enable_experimental_unrecoverable_data_corrupting_features",
+      "crush_location",
+      NULL
+    };
+    return KEYS;
+  }
+
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override {
+    if (changed.count(
+	  "enable_experimental_unrecoverable_data_corrupting_features")) {
+      // re-parse the feature list while holding the feature lock
+      std::lock_guard lg(cct->_feature_lock);
+      get_str_set(
+	conf->enable_experimental_unrecoverable_data_corrupting_features,
+	cct->_experimental_features);
+
+      // the warning is skipped when CEPH_DEV is set in the environment
+      const bool warn = (getenv("CEPH_DEV") == NULL) &&
+			!cct->_experimental_features.empty();
+      if (warn) {
+	if (cct->_experimental_features.count("*")) {
+	  lderr(cct) << "WARNING: all dangerous and experimental features are enabled." << dendl;
+	} else {
+	  lderr(cct) << "WARNING: the following dangerous and experimental features are enabled: "
+	    << cct->_experimental_features << dendl;
+	}
+      }
+    }
+
+    if (changed.count("crush_location")) {
+      cct->crush_location.update_from_conf();
+    }
+  }
+};
+
+// Convenience overload: runs the two-argument check and sends the
+// explanatory text it produces to the error log, whatever the outcome.
+bool CephContext::check_experimental_feature_enabled(const std::string& feat)
+{
+  stringstream explanation;
+  const bool is_enabled =
+    check_experimental_feature_enabled(feat, &explanation);
+  lderr(this) << explanation.str() << dendl;
+  return is_enabled;
+}
+
+/**
+ * Check whether the experimental feature @p feat has been enabled, either
+ * by name or through the "*" wildcard, and describe the outcome.
+ *
+ * @param feat     name of the experimental feature to look up
+ * @param message  stream that receives a multi-line warning (feature
+ *                 enabled) or instructions for enabling it (feature
+ *                 disabled); it is dereferenced unconditionally
+ * @return true if the feature is enabled
+ */
+bool CephContext::check_experimental_feature_enabled(const std::string& feat,
+                                                     std::ostream *message)
+{
+  // _experimental_features is rewritten by CephContextObs under this lock.
+  std::unique_lock<ceph::spinlock> lg(_feature_lock);
+
+  bool enabled = (_experimental_features.count(feat) ||
+                  _experimental_features.count("*"));
+
+  if (enabled) {
+    (*message) << "WARNING: experimental feature '" << feat << "' is enabled\n";
+    (*message) << "Please be aware that this feature is experimental, untested,\n";
+    (*message) << "unsupported, and may result in data corruption, data loss,\n";
+    (*message) << "and/or irreparable damage to your cluster.  Do not use\n";
+    (*message) << "feature with important data.\n";
+  } else {
+    (*message) << "*** experimental feature '" << feat << "' is not enabled ***\n";
+    (*message) << "This feature is marked as experimental, which means it\n";
+    (*message) << " - is untested\n";
+    (*message) << " - is unsupported\n";
+    (*message) << " - may corrupt your data\n";
+    // Fixed wording: this previously read "is an unrecoverable fashion".
+    (*message) << " - may break your cluster in an unrecoverable fashion\n";
+    (*message) << "To enable this feature, add this to your ceph.conf:\n";
+    (*message) << "  enable experimental unrecoverable data corrupting features = " << feat << "\n";
+  }
+  return enabled;
+}
+
+// perfcounter hooks
+
+// Admin socket hook that forwards every command it is registered for to
+// CephContext::do_command().
+class CephContextHook : public AdminSocketHook {
+  CephContext *m_cct;
+
+public:
+  explicit CephContextHook(CephContext *cct) : m_cct(cct) {}
+
+  // Returns false when do_command() raises bad_cmd_get, true otherwise.
+  bool call(std::string_view command, const cmdmap_t& cmdmap,
+            std::string_view format, bufferlist& out) override {
+    bool handled = true;
+    try {
+      m_cct->do_command(command, cmdmap, format, &out);
+    } catch (const bad_cmd_get&) {
+      handled = false;
+    }
+    return handled;
+  }
+};
+
+/**
+ * Execute one of the admin socket commands registered by the CephContext
+ * constructor and append the formatted result to @p out.
+ *
+ * @param command  command prefix, e.g. "perf dump" or "config set"
+ * @param cmdmap   parsed command arguments
+ * @param format   requested output format; "json-pretty" is passed to
+ *                 Formatter::create() as the default and fallback
+ * @param out      buffer that receives the formatter output
+ *
+ * Aborts via ceph_abort_msg() if @p command is not one of the commands
+ * handled below.
+ */
+void CephContext::do_command(std::string_view command, const cmdmap_t& cmdmap,
+                             std::string_view format, bufferlist *out)
+{
+  // Own the formatter so that it is released on every exit path, including
+  // an exception propagating out of a handler (CephContextHook::call()
+  // catches bad_cmd_get coming out of this function).
+  std::unique_ptr<Formatter> formatter(
+    Formatter::create(format, "json-pretty", "json-pretty"));
+  Formatter *f = formatter.get();
+
+  // Flatten every argument except the command prefix for the log lines.
+  stringstream ss;
+  for (const auto& [key, value] : cmdmap) {
+    if (key != "prefix") {
+      ss << key  << ":" << cmd_vartype_stringify(value) << " ";
+    }
+  }
+  lgeneric_dout(this, 1) << "do_command '" << command << "' '"
+                         << ss.str() << dendl;
+
+  // Debug hooks: "assert"/"abort" deliberately bring the process down, but
+  // only when debug_asok_assert_abort is set.
+  ceph_assert_always(!(command == "assert" && _conf->debug_asok_assert_abort));
+  if (command == "abort" && _conf->debug_asok_assert_abort) {
+    ceph_abort();
+  }
+
+  if (command == "perfcounters_dump" || command == "1" ||
+      command == "perf dump") {
+    std::string logger;
+    std::string counter;
+    cmd_getval(this, cmdmap, "logger", logger);
+    cmd_getval(this, cmdmap, "counter", counter);
+    _perf_counters_collection->dump_formatted(f, false, logger, counter);
+  }
+  else if (command == "perfcounters_schema" || command == "2" ||
+           command == "perf schema") {
+    _perf_counters_collection->dump_formatted(f, true);
+  }
+  else if (command == "perf histogram dump") {
+    std::string logger;
+    std::string counter;
+    cmd_getval(this, cmdmap, "logger", logger);
+    cmd_getval(this, cmdmap, "counter", counter);
+    _perf_counters_collection->dump_formatted_histograms(f, false, logger,
+                                                         counter);
+  }
+  else if (command == "perf histogram schema") {
+    _perf_counters_collection->dump_formatted_histograms(f, true);
+  }
+  else if (command == "perf reset") {
+    std::string var;
+    std::string section(command);
+    f->open_object_section(section.c_str());
+    if (!cmd_getval(this, cmdmap, "var", var)) {
+      f->dump_string("error", "syntax error: 'perf reset <var>'");
+    } else {
+      if (!_perf_counters_collection->reset(var))
+        f->dump_stream("error") << "Not find: " << var;
+      else
+        f->dump_string("success", std::string(command) + ' ' + var);
+    }
+    f->close_section();
+  }
+  else {
+    // Remaining commands share one enclosing section named after the
+    // command with spaces replaced by underscores.
+    std::string section(command);
+    boost::replace_all(section, " ", "_");
+    f->open_object_section(section.c_str());
+    if (command == "config show") {
+      _conf.show_config(f);
+    }
+    else if (command == "config unset") {
+      std::string var;
+      if (!(cmd_getval(this, cmdmap, "var", var))) {
+        f->dump_string("error", "syntax error: 'config unset <var>'");
+      } else {
+        int r = _conf.rm_val(var.c_str());
+        // -ENOENT is not reported as an error here.
+        if (r < 0 && r != -ENOENT) {
+          f->dump_stream("error") << "error unsetting '" << var << "': "
+                                  << cpp_strerror(r);
+        } else {
+          ostringstream ss;
+          _conf.apply_changes(&ss);
+          f->dump_string("success", ss.str());
+        }
+      }
+
+    }
+    else if (command == "config set") {
+      std::string var;
+      std::vector<std::string> val;
+
+      if (!(cmd_getval(this, cmdmap, "var", var)) ||
+          !(cmd_getval(this, cmdmap, "val", val))) {
+        f->dump_string("error", "syntax error: 'config set <var> <value>'");
+      } else {
+        // val may be multiple words
+        string valstr = str_join(val, " ");
+        int r = _conf.set_val(var.c_str(), valstr.c_str());
+        if (r < 0) {
+          f->dump_stream("error") << "error setting '" << var << "' to '" << valstr << "': " << cpp_strerror(r);
+        } else {
+          ostringstream ss;
+          _conf.apply_changes(&ss);
+          f->dump_string("success", ss.str());
+        }
+      }
+    } else if (command == "config get") {
+      std::string var;
+      if (!cmd_getval(this, cmdmap, "var", var)) {
+        f->dump_string("error", "syntax error: 'config get <var>'");
+      } else {
+        char buf[4096];
+        // FIPS zeroization audit 20191115: this memset is not security related.
+        memset(buf, 0, sizeof(buf));
+        char *tmp = buf;
+        int r = _conf.get_val(var.c_str(), &tmp, sizeof(buf));
+        if (r < 0) {
+          f->dump_stream("error") << "error getting '" << var << "': " << cpp_strerror(r);
+        } else {
+          f->dump_string(var.c_str(), buf);
+        }
+      }
+    } else if (command == "config help") {
+      std::string var;
+      if (cmd_getval(this, cmdmap, "var", var)) {
+        // Output a single one
+        std::string key = ConfFile::normalize_key_name(var);
+        auto schema = _conf.get_schema(key);
+        if (!schema) {
+          std::ostringstream msg;
+          msg << "Setting not found: '" << key << "'";
+          f->dump_string("error", msg.str());
+        } else {
+          f->dump_object("option", *schema);
+        }
+      } else {
+        // Output all
+        f->open_array_section("options");
+        for (const auto &option : ceph_options) {
+          f->dump_object("option", option);
+        }
+        f->close_section();
+      }
+    } else if (command == "config diff") {
+      f->open_object_section("diff");
+      _conf.diff(f);
+      f->close_section(); // unknown
+    } else if (command == "config diff get") {
+      // The command is registered with a "var" argument; it has to be read
+      // here, otherwise diff() is always asked about an empty setting name.
+      std::string setting;
+      if (!cmd_getval(this, cmdmap, "var", setting)) {
+        f->dump_string("error", "syntax error: 'config diff get <var>'");
+      } else {
+        f->open_object_section("diff");
+        _conf.diff(f, setting);
+        f->close_section(); // unknown
+      }
+    } else if (command == "log flush") {
+      _log->flush();
+    }
+    else if (command == "log dump") {
+      _log->dump_recent();
+    }
+    else if (command == "log reopen") {
+      _log->reopen_log_file();
+    }
+    else {
+      ceph_abort_msg("registered under wrong command?");
+    }
+    f->close_section();
+  }
+  f->flush(*out);
+  lgeneric_dout(this, 1) << "do_command '" << command << "' '" << ss.str()
+                         << "result is " << out->length() << " bytes" << dendl;
+}
+
+// Build a context: create the log, attach the config observers, create the
+// perf counter collection, admin socket, heartbeat map and plugin registry,
+// register the built-in admin socket commands, and create the crypto
+// handlers.  The service thread is not started here (see
+// start_service_thread()).
+CephContext::CephContext(uint32_t module_type_,
+                         enum code_environment_t code_env,
+                         int init_flags_)
+  : nref(1),
+    _conf{code_env == CODE_ENVIRONMENT_DAEMON},
+    _log(NULL),
+    _module_type(module_type_),
+    _init_flags(init_flags_),
+    _set_uid(0),
+    _set_gid(0),
+    _set_uid_string(),
+    _set_gid_string(),
+    _crypto_inited(0),
+    _service_thread(NULL),
+    _log_obs(NULL),
+    _admin_socket(NULL),
+    _perf_counters_collection(NULL),
+    _perf_counters_conf_obs(NULL),
+    _heartbeat_map(NULL),
+    _crypto_none(NULL),
+    _crypto_aes(NULL),
+    _plugin_registry(NULL),
+    _lockdep_obs(NULL),
+    crush_location(this)
+{
+  // Note: _cct_obs and _admin_hook are not in the initializer list; they
+  // are assigned below before any use in this constructor.
+  _log = new ceph::logging::Log(&_conf->subsys);
+
+  // Config observers; each one is removed and deleted in the destructor.
+  _log_obs = new LogObs(_log);
+  _conf.add_observer(_log_obs);
+
+  _cct_obs = new CephContextObs(this);
+  _conf.add_observer(_cct_obs);
+
+  _lockdep_obs = new LockdepObs(this);
+  _conf.add_observer(_lockdep_obs);
+
+  _perf_counters_collection = new PerfCountersCollection(this);
+ 
+  _admin_socket = new AdminSocket(this);
+  _heartbeat_map = new HeartbeatMap(this);
+
+  _plugin_registry = new PluginRegistry(this);
+
+  // Every command below is served by the same hook, which forwards to
+  // do_command(); the names must match the strings do_command() tests for.
+  _admin_hook = new CephContextHook(this);
+  _admin_socket->register_command("assert", "assert", _admin_hook, "");
+  _admin_socket->register_command("abort", "abort", _admin_hook, "");
+  _admin_socket->register_command("perfcounters_dump", "perfcounters_dump", _admin_hook, "");
+  _admin_socket->register_command("1", "1", _admin_hook, "");
+  _admin_socket->register_command("perf dump", "perf dump name=logger,type=CephString,req=false name=counter,type=CephString,req=false", _admin_hook, "dump perfcounters value");
+  _admin_socket->register_command("perfcounters_schema", "perfcounters_schema", _admin_hook, "");
+  _admin_socket->register_command("perf histogram dump", "perf histogram dump name=logger,type=CephString,req=false name=counter,type=CephString,req=false", _admin_hook, "dump perf histogram values");
+  _admin_socket->register_command("2", "2", _admin_hook, "");
+  _admin_socket->register_command("perf schema", "perf schema", _admin_hook, "dump perfcounters schema");
+  _admin_socket->register_command("perf histogram schema", "perf histogram schema", _admin_hook, "dump perf histogram schema");
+  _admin_socket->register_command("perf reset", "perf reset name=var,type=CephString", _admin_hook, "perf reset <name>: perf reset all or one perfcounter name");
+  _admin_socket->register_command("config show", "config show", _admin_hook, "dump current config settings");
+  _admin_socket->register_command("config help", "config help name=var,type=CephString,req=false", _admin_hook, "get config setting schema and descriptions");
+  _admin_socket->register_command("config set", "config set name=var,type=CephString name=val,type=CephString,n=N",  _admin_hook, "config set <field> <val> [<val> ...]: set a config variable");
+  _admin_socket->register_command("config unset", "config unset name=var,type=CephString",  _admin_hook, "config unset <field>: unset a config variable");
+  _admin_socket->register_command("config get", "config get name=var,type=CephString", _admin_hook, "config get <field>: get the config value");
+  _admin_socket->register_command("config diff",
+      "config diff", _admin_hook,
+      "dump diff of current config and default config");
+  _admin_socket->register_command("config diff get",
+      "config diff get name=var,type=CephString", _admin_hook,
+      "dump diff get <field>: dump diff of current and default config setting <field>");
+  _admin_socket->register_command("log flush", "log flush", _admin_hook, "flush log entries to log file");
+  _admin_socket->register_command("log dump", "log dump", _admin_hook, "dump recent log entries to log file");
+  _admin_socket->register_command("log reopen", "log reopen", _admin_hook, "reopen log file");
+
+  // Crypto handlers returned by get_crypto_handler(), plus the random
+  // source returned by random().
+  _crypto_none = CryptoHandler::create(CEPH_CRYPTO_NONE);
+  _crypto_aes = CryptoHandler::create(CEPH_CRYPTO_AES);
+  _crypto_random.reset(new CryptoRandom());
+
+  // Create the "mempool_obs" singleton; `false` means it is not dropped on
+  // fork.
+  lookup_or_create_singleton_object<MempoolObs>("mempool_obs", false, this);
+}
+
+// Tear the context down.  Only reachable through put(), since the
+// destructor is private.  The order below matters: members are released
+// while the objects they reference are still alive.
+CephContext::~CephContext()
+{
+  // Drop the singletons first, then stop the service thread.
+  associated_objs.clear();
+  join_service_thread();
+
+  // Normally already done by join_service_thread() via
+  // _disable_perf_counter(); this covers any counters still registered.
+  if (_cct_perf) {
+    _perf_counters_collection->remove(_cct_perf);
+    delete _cct_perf;
+    _cct_perf = NULL;
+  }
+
+  delete _plugin_registry;
+
+  // Commands must be unregistered before the hook that serves them goes.
+  _admin_socket->unregister_commands(_admin_hook);
+  delete _admin_hook;
+  delete _admin_socket;
+
+  delete _heartbeat_map;
+
+  delete _perf_counters_collection;
+  _perf_counters_collection = NULL;
+
+  delete _perf_counters_conf_obs;
+  _perf_counters_conf_obs = NULL;
+
+  // Detach each observer from the config before deleting it.
+  _conf.remove_observer(_log_obs);
+  delete _log_obs;
+  _log_obs = NULL;
+
+  _conf.remove_observer(_cct_obs);
+  delete _cct_obs;
+  _cct_obs = NULL;
+
+  _conf.remove_observer(_lockdep_obs);
+  delete _lockdep_obs;
+  _lockdep_obs = NULL;
+
+  _log->stop();
+  delete _log;
+  _log = NULL;
+
+  delete _crypto_none;
+  delete _crypto_aes;
+  if (_crypto_inited > 0) {
+    // A count above one means someone explicitly called init_crypto()
+    // without the matching shutdown_crypto().
+    ceph_assert(_crypto_inited == 1);
+    shutdown_crypto();
+  }
+}
+
+// Drop one reference; the context deletes itself when the last one goes.
+void CephContext::put() {
+  if (--nref != 0) {
+    // Other references remain.
+    ANNOTATE_HAPPENS_BEFORE(&nref);
+    return;
+  }
+  ANNOTATE_HAPPENS_AFTER(&nref);
+  ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(&nref);
+  delete this;
+}
+
+// Count one more crypto user; only the first call on this context actually
+// initializes ceph::crypto.
+void CephContext::init_crypto()
+{
+  const bool first_user = (_crypto_inited == 0);
+  ++_crypto_inited;
+  if (first_user) {
+    ceph::crypto::init(this);
+  }
+}
+
+// Undo one init_crypto() call; ceph::crypto is shut down when the count
+// returns to zero.
+void CephContext::shutdown_crypto()
+{
+  --_crypto_inited;
+  if (_crypto_inited != 0) {
+    return;
+  }
+  ceph::crypto::shutdown(g_code_env == CODE_ENVIRONMENT_LIBRARY);
+}
+
+// Start the service thread (once) and everything that has to wait for it:
+// perf counters, flush-on-exit, deferred config observers, admin socket.
+void CephContext::start_service_thread()
+{
+  {
+    std::lock_guard thread_guard(_service_thread_lock);
+    if (_service_thread != nullptr) {
+      // Already running; nothing else to do.
+      return;
+    }
+    _service_thread = new CephContextServiceThread(this);
+    _service_thread->create("service");
+  }
+
+  const bool want_perf_counters =
+    (get_init_flags() & CINIT_FLAG_NO_CCT_PERF_COUNTERS) == 0;
+  if (want_perf_counters) {
+    _enable_perf_counter();
+  }
+
+  // make logs flush on_exit()
+  if (_conf->log_flush_on_exit) {
+    _log->set_flush_on_exit();
+  }
+
+  // Trigger callbacks on any config observers that were waiting for
+  // it to become safe to start threads.
+  _conf.set_safe_to_start_threads();
+  _conf.call_all_observers();
+
+  // start admin socket
+  if (!_conf->admin_socket.empty()) {
+    _admin_socket->init(_conf->admin_socket);
+  }
+}
+
+// Ask the service thread, if one is running, to reopen the log files.
+void CephContext::reopen_logs()
+{
+  std::lock_guard thread_guard(_service_thread_lock);
+  if (_service_thread == nullptr) {
+    return;
+  }
+  _service_thread->reopen_logs();
+}
+
+// Stop and destroy the service thread, if any, then remove the perf
+// counters that start_service_thread() enabled.
+void CephContext::join_service_thread()
+{
+  CephContextServiceThread *stopping = nullptr;
+  {
+    // Detach the thread pointer under the lock; the join itself happens
+    // with the lock released.
+    std::unique_lock<ceph::spinlock> thread_guard(_service_thread_lock);
+    stopping = _service_thread;
+    _service_thread = NULL;
+  }
+  if (stopping == nullptr) {
+    return;
+  }
+
+  stopping->exit_thread();
+  stopping->join();
+  delete stopping;
+
+  const bool had_perf_counters =
+    (get_init_flags() & CINIT_FLAG_NO_CCT_PERF_COUNTERS) == 0;
+  if (had_perf_counters) {
+    _disable_perf_counter();
+  }
+}
+
+// Return the module type given to the constructor (or later overridden
+// through _set_module_type()).
+uint32_t CephContext::get_module_type() const
+{
+  return _module_type;
+}
+
+// Replace the init flags (CINIT_FLAG_* bits) stored on this context.
+void CephContext::set_init_flags(int flags)
+{
+  _init_flags = flags;
+}
+
+// Return the init flags currently stored on this context.
+int CephContext::get_init_flags() const
+{
+  return _init_flags;
+}
+
+// Return the perf counter collection created by the constructor; the
+// context keeps ownership.
+PerfCountersCollection *CephContext::get_perfcounters_collection()
+{
+  return _perf_counters_collection;
+}
+
+// Create and register the "cct" counters and the per-mempool counters
+// (one bytes and one items counter for every mempool).
+void CephContext::_enable_perf_counter()
+{
+  assert(!_cct_perf);
+  PerfCountersBuilder cct_builder(this, "cct", l_cct_first, l_cct_last);
+  cct_builder.add_u64(l_cct_total_workers, "total_workers", "Total workers");
+  cct_builder.add_u64(l_cct_unhealthy_workers, "unhealthy_workers", "Unhealthy workers");
+  _cct_perf = cct_builder.create_perf_counters();
+  _perf_counters_collection->add(_cct_perf);
+
+  // The builder is handed c_str() pointers below, so the name and
+  // description strings are kept in members; entries 2*i and 2*i+1 belong
+  // to pool i.
+  assert(_mempool_perf_names.empty());
+  assert(_mempool_perf_descriptions.empty());
+  const unsigned pool_count = mempool::num_pools;
+  _mempool_perf_names.reserve(pool_count * 2);
+  _mempool_perf_descriptions.reserve(pool_count * 2);
+  for (unsigned pool = 0; pool < pool_count; ++pool) {
+    const string pool_name =
+      mempool::get_pool_name(mempool::pool_index_t(pool));
+    const string prefix = string("mempool ") + pool_name;
+
+    _mempool_perf_names.push_back(pool_name + "_bytes");
+    _mempool_perf_descriptions.push_back(prefix + " total bytes");
+
+    _mempool_perf_names.push_back(pool_name + "_items");
+    _mempool_perf_descriptions.push_back(prefix + " total items");
+  }
+
+  PerfCountersBuilder mempool_builder(this, "mempool", l_mempool_first,
+                                      l_mempool_first + 1 + 2*mempool::num_pools);
+  unsigned counter_id = l_mempool_first + 1;
+  for (unsigned pool = 0; pool < pool_count; ++pool) {
+    const unsigned bytes_slot = pool * 2;
+    const unsigned items_slot = bytes_slot + 1;
+    mempool_builder.add_u64(counter_id++,
+                            _mempool_perf_names[bytes_slot].c_str(),
+                            _mempool_perf_descriptions[bytes_slot].c_str());
+    mempool_builder.add_u64(counter_id++,
+                            _mempool_perf_names[items_slot].c_str(),
+                            _mempool_perf_descriptions[items_slot].c_str());
+  }
+  _mempool_perf = mempool_builder.create_perf_counters();
+  _perf_counters_collection->add(_mempool_perf);
+}
+
+// Unregister and free the counters created by _enable_perf_counter().
+// Does nothing when the counters were never enabled.
+void CephContext::_disable_perf_counter()
+{
+  if (_cct_perf == nullptr) {
+    return;
+  }
+
+  // "cct" counters
+  _perf_counters_collection->remove(_cct_perf);
+  delete _cct_perf;
+  _cct_perf = nullptr;
+
+  // mempool counters and the strings backing their names/descriptions
+  _perf_counters_collection->remove(_mempool_perf);
+  delete _mempool_perf;
+  _mempool_perf = nullptr;
+  _mempool_perf_names.clear();
+  _mempool_perf_descriptions.clear();
+}
+
+/**
+ * Refresh the values of the "cct" and mempool perf counters.
+ *
+ * Both counter sets are created by _enable_perf_counter() and destroyed by
+ * _disable_perf_counter(), so either pointer may be null; each set is
+ * skipped when it does not exist.
+ */
+void CephContext::_refresh_perf_values()
+{
+  if (_cct_perf) {
+    _cct_perf->set(l_cct_total_workers, _heartbeat_map->get_total_workers());
+    _cct_perf->set(l_cct_unhealthy_workers, _heartbeat_map->get_unhealthy_workers());
+  }
+
+  // Guard the mempool counters the same way as the cct ones: the pointer
+  // is nullptr whenever the counters are not enabled, and dereferencing it
+  // would crash.
+  if (!_mempool_perf) {
+    return;
+  }
+  // Counter ids were assigned in _enable_perf_counter(): bytes, then items,
+  // for each pool in index order, starting right after l_mempool_first.
+  unsigned l = l_mempool_first + 1;
+  for (unsigned i = 0; i < mempool::num_pools; ++i) {
+    mempool::pool_t& p = mempool::get_pool(mempool::pool_index_t(i));
+    _mempool_perf->set(l++, p.allocated_bytes());
+    _mempool_perf->set(l++, p.allocated_items());
+  }
+}
+
+// Return the admin socket object created by the constructor; the context
+// keeps ownership.
+AdminSocket *CephContext::get_admin_socket()
+{
+  return _admin_socket;
+}
+
+// Map a CEPH_CRYPTO_* type to the handler created in the constructor;
+// unknown types yield NULL.
+CryptoHandler *CephContext::get_crypto_handler(int type)
+{
+  if (type == CEPH_CRYPTO_NONE) {
+    return _crypto_none;
+  }
+  if (type == CEPH_CRYPTO_AES) {
+    return _crypto_aes;
+  }
+  return NULL;
+}
+
+// Tell every registered fork watcher that a fork is about to happen, then
+// discard the singletons that were created with drop_on_fork set.
+void CephContext::notify_pre_fork()
+{
+  {
+    std::lock_guard watchers_guard(_fork_watchers_lock);
+    for (auto &&watcher : _fork_watchers) {
+      watcher->handle_pre_fork();
+    }
+  }
+  {
+    // note: we don't hold a lock here, but we assume we are idle at
+    // fork time, which happens during process init and startup.
+    for (auto it = associated_objs.begin(); it != associated_objs.end(); ) {
+      // The map key is (name, type); only the name decides whether to drop.
+      const bool drop = associated_objs_drop_on_fork.count(it->first.first) != 0;
+      if (drop) {
+        it = associated_objs.erase(it);
+      } else {
+        ++it;
+      }
+    }
+    associated_objs_drop_on_fork.clear();
+  }
+}
+
+// Tell every registered fork watcher that the fork has happened.
+void CephContext::notify_post_fork()
+{
+  // NOTE(review): the lock is force-released rather than acquired, and the
+  // watcher list is then walked without it.  Presumably this resets a lock
+  // that might have been copied in the held state by fork() -- confirm
+  // before changing this to a lock_guard.
+  ceph::spin_unlock(&_fork_watchers_lock);
+  for (auto &&t : _fork_watchers)
+    t->handle_post_fork();
+}
+
+// Collect the public address vector of every monitor in the map and store
+// the result through the vector overload of set_mon_addrs().
+void CephContext::set_mon_addrs(const MonMap& mm) {
+  std::vector<entity_addrvec_t> addrs;
+  for (const auto& mon_entry : mm.mon_info) {
+    const auto& info = mon_entry.second;
+    addrs.push_back(info.public_addrs);
+  }
+  set_mon_addrs(addrs);
+}
+#endif	// WITH_SEASTAR
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
new file mode 100644
index 00000000..7fdf324c
--- /dev/null
+++ b/src/common/ceph_context.h
@@ -0,0 +1,378 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CEPHCONTEXT_H
+#define CEPH_CEPHCONTEXT_H
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <string>
+#include <string_view>
+#include <typeinfo>
+#include <typeindex>
+
+#include "include/any.h"
+
+#include "common/cmdparse.h"
+#include "common/code_environment.h"
+#include "msg/msg_types.h"
+#ifdef WITH_SEASTAR
+#include "crimson/common/config_proxy.h"
+#include "crimson/common/perf_counters_collection.h"
+#else
+#include "common/config_proxy.h"
+#include "include/spinlock.h"
+#include "common/perf_counters_collection.h"
+#endif
+
+
+#include "crush/CrushLocation.h"
+
+class AdminSocket;
+class CephContextServiceThread;
+class CephContextHook;
+class CephContextObs;
+class CryptoHandler;
+class CryptoRandom;
+class MonMap;
+
+namespace ceph {
+  class PluginRegistry;
+  class HeartbeatMap;
+  namespace logging {
+    class Log;
+  }
+}
+
+#ifdef WITH_SEASTAR
+// Reduced CephContext used when building WITH_SEASTAR: it exposes the
+// config and perf counter collection by reference plus a manual reference
+// count.
+class CephContext {
+public:
+  CephContext();
+  // Signature-compatible with the non-seastar constructor; all three
+  // arguments are ignored and the default constructor is used.
+  CephContext(uint32_t,
+	      code_environment_t=CODE_ENVIRONMENT_UTILITY,
+	      int = 0)
+    : CephContext{}
+  {}
+  ~CephContext();
+
+  uint32_t get_module_type() const;
+  bool check_experimental_feature_enabled(const std::string& feature) {
+    // everything crimson is experimental...
+    return true;
+  }
+  CryptoRandom* random() const;
+  PerfCountersCollectionImpl* get_perfcounters_collection();
+  ceph::common::ConfigProxy& _conf;
+  ceph::common::PerfCountersCollection& _perf_counters_collection;
+  // Reference counting, mirroring the get()/put() pair of the non-seastar
+  // class.
+  CephContext* get();
+  void put();
+private:
+  std::unique_ptr<CryptoRandom> _crypto_random;
+  // Plain (non-atomic) counter, unlike the std::atomic used by the
+  // non-seastar class.
+  unsigned nref;
+};
+#else
+/* A CephContext represents the context held by a single library user.
+ * There can be multiple CephContexts in the same process.
+ *
+ * For daemons and utility programs, there will be only one CephContext.  The
+ * CephContext contains the configuration, the dout object, and anything else
+ * that you might want to pass to libcommon with every function call.
+ */
+class CephContext {
+public:
+  CephContext(uint32_t module_type_,
+              enum code_environment_t code_env=CODE_ENVIRONMENT_UTILITY,
+              int init_flags_ = 0);
+
+  CephContext(const CephContext&) = delete;
+  CephContext& operator =(const CephContext&) = delete;
+  CephContext(CephContext&&) = delete;
+  CephContext& operator =(CephContext&&) = delete;
+
+  bool _finished = false;
+
+  // ref count!
+private:
+  ~CephContext();
+  std::atomic<unsigned> nref;
+public:
+  CephContext *get() {
+    ++nref;
+    return this;
+  }
+  void put();
+
+  ConfigProxy _conf;
+  ceph::logging::Log *_log;
+
+  /* init ceph::crypto */
+  void init_crypto();
+
+  /// shutdown crypto (should match init_crypto calls)
+  void shutdown_crypto();
+
+  /* Start the Ceph Context's service thread */
+  void start_service_thread();
+
+  /* Reopen the log files */
+  void reopen_logs();
+
+  /* Get the module type (client, mon, osd, mds, etc.) */
+  uint32_t get_module_type() const;
+
+  // this is here only for testing purposes!
+  void _set_module_type(uint32_t t) {
+    _module_type = t;
+  }
+
+  void set_init_flags(int flags);
+  int get_init_flags() const;
+
+  /* Get the PerfCountersCollection of this CephContext */
+  PerfCountersCollection *get_perfcounters_collection();
+
+  ceph::HeartbeatMap *get_heartbeat_map() {
+    return _heartbeat_map;
+  }
+
+  /**
+   * Get the admin socket associated with this CephContext.
+   *
+   * Currently there is always an admin socket object,
+   * so this will never return NULL.
+   *
+   * @return the admin socket
+   */
+  AdminSocket *get_admin_socket();
+
+  /**
+   * process an admin socket command
+   */
+  void do_command(std::string_view command, const cmdmap_t& cmdmap,
+		  std::string_view format, ceph::bufferlist *out);
+
+  static constexpr std::size_t largest_singleton = 8 * 72;
+
+  /**
+   * Return the object of type T stored under @p name, constructing it in
+   * place from @p args if no such entry exists yet.
+   *
+   * Entries are keyed by (name, type), so the same name may be reused for
+   * different types.  When a new entry is created with @p drop_on_fork set,
+   * its name is remembered so that notify_pre_fork() discards it.
+   */
+  template<typename T, typename... Args>
+  T& lookup_or_create_singleton_object(std::string_view name,
+                                       bool drop_on_fork,
+                                       Args&&... args) {
+    static_assert(sizeof(T) <= largest_singleton,
+                  "Please increase largest singleton.");
+    std::lock_guard objs_guard(associated_objs_lock);
+    std::type_index tid = typeid(T);
+
+    auto pos = associated_objs.find(std::make_pair(name, tid));
+    if (pos != associated_objs.cend()) {
+      // Already present: hand back the existing instance.
+      return ceph::any_cast<T&>(pos->second);
+    }
+
+    if (drop_on_fork) {
+      associated_objs_drop_on_fork.insert(std::string(name));
+    }
+    pos = associated_objs.emplace_hint(
+      pos,
+      std::piecewise_construct,
+      std::forward_as_tuple(name, tid),
+      std::forward_as_tuple(std::in_place_type<T>,
+                            std::forward<Args>(args)...));
+    return ceph::any_cast<T&>(pos->second);
+  }
+
+  /**
+   * get a crypto handler
+   */
+  CryptoHandler *get_crypto_handler(int type);
+
+  CryptoRandom* random() const { return _crypto_random.get(); }
+
+  /// check if experimental feature is enabled, and emit appropriate warnings
+  bool check_experimental_feature_enabled(const std::string& feature);
+  bool check_experimental_feature_enabled(const std::string& feature,
+					  std::ostream *message);
+
+  PluginRegistry *get_plugin_registry() {
+    return _plugin_registry;
+  }
+
+  void set_uid_gid(uid_t u, gid_t g) {
+    _set_uid = u;
+    _set_gid = g;
+  }
+  uid_t get_set_uid() const {
+    return _set_uid;
+  }
+  gid_t get_set_gid() const {
+    return _set_gid;
+  }
+
+  void set_uid_gid_strings(const std::string &u, const std::string &g) {
+    _set_uid_string = u;
+    _set_gid_string = g;
+  }
+  std::string get_set_uid_string() const {
+    return _set_uid_string;
+  }
+  std::string get_set_gid_string() const {
+    return _set_gid_string;
+  }
+
+  class ForkWatcher {
+   public:
+    virtual ~ForkWatcher() {}
+    virtual void handle_pre_fork() = 0;
+    virtual void handle_post_fork() = 0;
+  };
+
+  void register_fork_watcher(ForkWatcher *w) {
+    std::lock_guard lg(_fork_watchers_lock);
+    _fork_watchers.push_back(w);
+  }
+
+  void notify_pre_fork();
+  void notify_post_fork();
+
+  /**
+   * update CephContext with a copy of the passed in MonMap mon addrs
+   *
+   * @param mm MonMap to extract and update mon addrs
+   */
+  void set_mon_addrs(const MonMap& mm);
+  void set_mon_addrs(const std::vector<entity_addrvec_t>& in) {
+    auto ptr = std::make_shared<std::vector<entity_addrvec_t>>(in);
+    atomic_store_explicit(&_mon_addrs, std::move(ptr), std::memory_order_relaxed);
+  }
+  /// Return the shared copy of the monitor addresses last stored by
+  /// set_mon_addrs(); the pointer is loaded atomically.
+  std::shared_ptr<std::vector<entity_addrvec_t>> get_mon_addrs() const {
+    return atomic_load_explicit(&_mon_addrs, std::memory_order_relaxed);
+  }
+
+private:
+
+
+  /* Stop and join the Ceph Context's service thread */
+  void join_service_thread();
+
+  uint32_t _module_type;
+
+  int _init_flags;
+
+  uid_t _set_uid; ///< uid to drop privs to
+  gid_t _set_gid; ///< gid to drop privs to
+  std::string _set_uid_string;
+  std::string _set_gid_string;
+
+  int _crypto_inited;
+
+  std::shared_ptr<std::vector<entity_addrvec_t>> _mon_addrs;
+
+  /* libcommon service thread.
+   * SIGHUP wakes this thread, which then reopens logfiles */
+  friend class CephContextServiceThread;
+  CephContextServiceThread *_service_thread;
+
+  using md_config_obs_t = ceph::md_config_obs_impl<ConfigProxy>;
+
+  md_config_obs_t *_log_obs;
+
+  /* The admin socket associated with this context */
+  AdminSocket *_admin_socket;
+
+  /* lock which protects service thread creation, destruction, etc. */
+  ceph::spinlock _service_thread_lock;
+
+  /* The collection of profiling loggers associated with this context */
+  PerfCountersCollection *_perf_counters_collection;
+
+  md_config_obs_t *_perf_counters_conf_obs;
+
+  CephContextHook *_admin_hook;
+
+  ceph::HeartbeatMap *_heartbeat_map;
+
+  ceph::spinlock associated_objs_lock;
+
+  // Orders (name, type) keys by name first and by type second.  It is
+  // transparent, so lookups may use a key whose name has a different type
+  // from the stored std::string (lookup_or_create_singleton_object() passes
+  // a std::string_view).
+  struct associated_objs_cmp {
+    using is_transparent = std::true_type;
+    template<typename T, typename U>
+    bool operator ()(const std::pair<T, std::type_index>& l,
+                     const std::pair<U, std::type_index>& r) const noexcept {
+      if (l.first < r.first) {
+        return true;
+      }
+      if (l.first == r.first) {
+        return l.second < r.second;
+      }
+      return false;
+    }
+  };
+
+  std::map<std::pair<std::string, std::type_index>,
+	   ceph::immobile_any<largest_singleton>,
+	   associated_objs_cmp> associated_objs;
+  std::set<std::string> associated_objs_drop_on_fork;
+
+  ceph::spinlock _fork_watchers_lock;
+  std::vector<ForkWatcher*> _fork_watchers;
+
+  // crypto
+  CryptoHandler *_crypto_none;
+  CryptoHandler *_crypto_aes;
+  std::unique_ptr<CryptoRandom> _crypto_random;
+
+  // experimental
+  CephContextObs *_cct_obs;
+  ceph::spinlock _feature_lock;
+  std::set<std::string> _experimental_features;
+
+  PluginRegistry *_plugin_registry;
+
+  md_config_obs_t *_lockdep_obs;
+
+public:
+  CrushLocation crush_location;
+private:
+
+  enum {
+    l_cct_first,
+    l_cct_total_workers,
+    l_cct_unhealthy_workers,
+    l_cct_last
+  };
+  enum {
+    l_mempool_first = 873222,
+    l_mempool_bytes,
+    l_mempool_items,
+    l_mempool_last
+  };
+  PerfCounters *_cct_perf = nullptr;
+  PerfCounters* _mempool_perf = nullptr;
+  std::vector<std::string> _mempool_perf_names, _mempool_perf_descriptions;
+
+  /**
+   * Enable the performance counters.
+   */
+  void _enable_perf_counter();
+
+  /**
+   * Disable the performance counter.
+   */
+  void _disable_perf_counter();
+
+  /**
+   * Refresh perf counter values.
+   */
+  void _refresh_perf_values();
+
+  friend class CephContextObs;
+};
+#endif	// WITH_SEASTAR
+
+#endif
diff --git a/src/common/ceph_crypto.cc b/src/common/ceph_crypto.cc
new file mode 100644
index 00000000..62fc94ea
--- /dev/null
+++ b/src/common/ceph_crypto.cc
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010-2011 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "ceph_crypto.h"
+
+#ifdef USE_NSS
+
+// for SECMOD_RestartModules()
+#include <secmod.h>
+#include <nspr.h>
+
+#endif /*USE_NSS*/
+
+#ifdef USE_OPENSSL
+#include <openssl/evp.h>
+#endif /*USE_OPENSSL*/
+
+#ifdef USE_NSS
+
+static pthread_mutex_t crypto_init_mutex = PTHREAD_MUTEX_INITIALIZER;  // held while init()/shutdown() update the state below
+static uint32_t crypto_refs = 0;  // init() calls not yet matched by shutdown()
+static NSSInitContext *crypto_context = NULL;  // created by the first init(), cleared by the last shutdown()
+static pid_t crypto_init_pid = 0;  // pid recorded by init(); reset to 0 by the last shutdown()
+
+void ceph::crypto::init(CephContext *cct)
+{
+  pid_t pid = getpid();
+  pthread_mutex_lock(&crypto_init_mutex);
+  if (crypto_init_pid != pid) {  // first call, or the pid changed since the last init()
+    if (crypto_init_pid > 0) {  // pid changed (presumably after fork()): restart NSS modules
+      SECMOD_RestartModules(PR_FALSE);
+    }
+    crypto_init_pid = pid;
+  }
+
+  if (++crypto_refs == 1) {  // first reference: create the NSS context
+    NSSInitParameters init_params;
+    memset(&init_params, 0, sizeof(init_params));
+    init_params.length = sizeof(init_params);
+
+    uint32_t flags = (NSS_INIT_READONLY | NSS_INIT_PK11RELOAD);
+    if (cct->_conf->nss_db_path.empty()) {  // no database path configured
+      flags |= (NSS_INIT_NOCERTDB | NSS_INIT_NOMODDB);
+    }
+    crypto_context = NSS_InitContext(cct->_conf->nss_db_path.c_str(), "", "",
+                                     SECMOD_DB, &init_params, flags);
+  }
+  pthread_mutex_unlock(&crypto_init_mutex);
+  ceph_assert_always(crypto_context != NULL);  // checked after the mutex is released
+}
+
+void ceph::crypto::shutdown(bool shared)
+{
+  pthread_mutex_lock(&crypto_init_mutex);
+  ceph_assert_always(crypto_refs > 0);  // every shutdown() must pair with an earlier init()
+  if (--crypto_refs == 0) {  // last reference dropped: tear the NSS context down
+    NSS_ShutdownContext(crypto_context);
+    if (!shared) {  // PR_Cleanup() is only called when the caller passed shared == false
+      PR_Cleanup();
+    }
+    crypto_context = NULL;
+    crypto_init_pid = 0;  // the next init() is treated as a first call again
+  }
+  pthread_mutex_unlock(&crypto_init_mutex);
+}
+
+ceph::crypto::nss::HMAC::~HMAC()
+{
+  PK11_DestroyContext(ctx, PR_TRUE);  // release the signing context first
+  PK11_FreeSymKey(symkey);  // then the imported key
+  PK11_FreeSlot(slot);  // and finally the slot the key was imported into
+}
+
+#else
+# error "No supported crypto implementation found."
+#endif /*USE_NSS*/
+
+#ifdef USE_OPENSSL
+
+ceph::crypto::ssl::OpenSSLDigest::OpenSSLDigest(const EVP_MD * _type)
+  : mpContext(EVP_MD_CTX_create())  // NOTE(review): the result is not checked for NULL
+  , mpType(_type) {
+  this->Restart();  // initialise the context for the given digest type
+}
+
+ceph::crypto::ssl::OpenSSLDigest::~OpenSSLDigest() {
+  EVP_MD_CTX_destroy(mpContext);  // releases the context created in the constructor
+}
+
+void ceph::crypto::ssl::OpenSSLDigest::Restart() {
+  EVP_DigestInit_ex(mpContext, mpType, NULL);  // NOTE(review): the return value is ignored
+}
+
+void ceph::crypto::ssl::OpenSSLDigest::Update(const unsigned char *input, size_t length) {
+  if (length != 0) {  // empty input is skipped entirely
+    EVP_DigestUpdate(mpContext, input, length);  // the parameter is a const void*, no cast needed
+  }
+}
+
+void ceph::crypto::ssl::OpenSSLDigest::Final(unsigned char *digest) {
+  unsigned int s;  // receives the digest length; not used afterwards
+  EVP_DigestFinal_ex(mpContext, digest, &s);  // NOTE(review): the return value is ignored
+}
+#endif /*USE_OPENSSL*/
+
+
+void ceph::crypto::zeroize_for_security(void* const s, const size_t n) {
+#ifdef USE_OPENSSL
+  // NSS has no cleaning routine of its own that is resilient to the
+  // dead-store elimination performed by modern compilers [1]. To avoid
+  // writing our own security code, always use the one from OpenSSL.
+  // [1]: "NSS [3.27.1] does not have a reliable memory scrubbing
+  //      implementation since it either calls memset or uses the macro
+  //      PORT_Memset, which expands to memset"
+  // https://klevchen.ece.illinois.edu/pubs/yjoll-usesec17.pdf, page 11.
+  OPENSSL_cleanse(s, n);
+#else
+  // OpenSSL is available even when NSS is turned on. The performance-
+  // critical Cephx signature crafting machinery already relies on this
+  // assumption and uses OpenSSL directly (see src/auth/Crypto.cc).
+  // Also, CMakeLists.txt explicitly requires both NSS and OpenSSL:
+  //
+  //  find_package(NSS REQUIRED)
+  //  find_package(NSPR REQUIRED)
+  //  find_package(OpenSSL REQUIRED)
+# error "No supported crypto implementation found."
+#endif /*USE_OPENSSL*/
+}
diff --git a/src/common/ceph_crypto.h b/src/common/ceph_crypto.h
new file mode 100644
index 00000000..03351eb4
--- /dev/null
+++ b/src/common/ceph_crypto.h
@@ -0,0 +1,365 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#ifndef CEPH_CRYPTO_H
+#define CEPH_CRYPTO_H
+
+#include "acconfig.h"
+#include <stdexcept>
+
+#define CEPH_CRYPTO_MD5_DIGESTSIZE 16
+#define CEPH_CRYPTO_HMACSHA1_DIGESTSIZE 20
+#define CEPH_CRYPTO_SHA1_DIGESTSIZE 20
+#define CEPH_CRYPTO_HMACSHA256_DIGESTSIZE 32
+#define CEPH_CRYPTO_SHA256_DIGESTSIZE 32
+
+#ifdef USE_NSS
+// you *must* use CRYPTO_CXXFLAGS in CMakeLists.txt for including this include
+# include <nss.h>
+# include <pk11pub.h>
+
+// NSS thinks a lot of fairly fundamental operations might potentially
+// fail, because it has been written to support e.g. smartcards doing all
+// the crypto operations. We don't want to contaminate too much code
+// with error checking, and just say these really should never fail.
+// This assert MUST NOT be compiled out, even on non-debug builds.
+# include "include/ceph_assert.h"
+#endif /*USE_NSS*/
+
+#ifdef USE_OPENSSL
+#include <openssl/evp.h>
+#include <openssl/ossl_typ.h>
+#include <openssl/hmac.h>
+
+extern "C" {
+  const EVP_MD *EVP_md5(void);
+  const EVP_MD *EVP_sha1(void);
+  const EVP_MD *EVP_sha256(void);
+}
+#endif /*USE_OPENSSL*/
+
+namespace ceph {
+  namespace crypto {
+    void assert_init();
+    void init(CephContext *cct);  // counterpart of shutdown()
+    void shutdown(bool shared=true);  // counterpart of init()
+
+    void zeroize_for_security(void *s, size_t n);  // wipes n bytes starting at s
+  }
+}
+
+#ifdef USE_NSS
+namespace ceph {
+  namespace crypto {
+
+    class DigestException : public std::runtime_error  // thrown by the digest/HMAC wrappers in this header
+    {
+    public:
+      DigestException(const char* what_arg) : runtime_error(what_arg)  // what_arg names the call that failed
+	{}
+    };
+
+    namespace nss {
+
+      class NSSDigest {  // digest computed through an NSS PK11 context
+      private:
+        PK11Context *ctx;  // released in the destructor
+        size_t digest_size;  // number of bytes Final() must produce
+      public:
+        NSSDigest (SECOidTag _type, size_t _digest_size)
+	  : digest_size(_digest_size) {
+	  ctx = PK11_CreateDigestContext(_type);
+	  if (! ctx) {
+	    throw DigestException("PK11_CreateDigestContext() failed");
+	  }
+	  Restart();  // NOTE(review): if this throws, ctx is not destroyed (no destructor call)
+        }
+        ~NSSDigest () {
+	  PK11_DestroyContext(ctx, PR_TRUE);
+	}
+	void Restart() {  // begin a new digest; throws DigestException on failure
+	  SECStatus s;
+	  s = PK11_DigestBegin(ctx);
+	  if (s != SECSuccess) {
+	    throw DigestException("PK11_DigestBegin() failed");
+	  }
+	}
+	void Update (const unsigned char *input, size_t length) {
+	  if (length) {  // empty input is skipped
+	    SECStatus s;
+	    s = PK11_DigestOp(ctx, input, length);
+	    if (s != SECSuccess) {
+	      throw DigestException("PK11_DigestOp() failed");
+	    }
+	  }
+	}
+	void Final (unsigned char *digest) {  // digest must hold digest_size bytes
+	  SECStatus s;
+	  unsigned int written = 0;  // bytes PK11_DigestFinal() stored in digest
+	  s = PK11_DigestFinal(ctx, digest, &written, digest_size);
+	  if (s != SECSuccess ||
+	      written != digest_size) {  // fail on an error status OR a wrong length
+	    throw DigestException("PK11_DigestFinal() failed");
+	  }
+	  Restart();  // leave the object ready for the next message
+	}
+      };
+
+      class MD5 : public NSSDigest {  // NSSDigest bound to SEC_OID_MD5
+      public:
+	MD5 () : NSSDigest(SEC_OID_MD5, CEPH_CRYPTO_MD5_DIGESTSIZE) { }
+      };
+
+      class SHA1 : public NSSDigest {  // NSSDigest bound to SEC_OID_SHA1
+      public:
+        SHA1 () : NSSDigest(SEC_OID_SHA1, CEPH_CRYPTO_SHA1_DIGESTSIZE) { }
+      };
+
+      class SHA256 : public NSSDigest {  // NSSDigest bound to SEC_OID_SHA256
+      public:
+        SHA256 () : NSSDigest(SEC_OID_SHA256, CEPH_CRYPTO_SHA256_DIGESTSIZE) { }
+      };
+    }
+  }
+}
+#endif /*USE_NSS*/
+
+#ifdef USE_OPENSSL
+namespace ceph {
+  namespace crypto {
+    namespace ssl {
+      class OpenSSLDigest {  // digest computed through an OpenSSL EVP_MD_CTX
+      private:
+	EVP_MD_CTX *mpContext;  // created in the constructor, destroyed in the destructor
+	const EVP_MD *mpType;  // digest algorithm passed to the constructor
+      public:
+	OpenSSLDigest (const EVP_MD *_type);
+	~OpenSSLDigest ();
+	void Restart();  // re-initialise the context for mpType
+	void Update (const unsigned char *input, size_t length);  // length == 0 is a no-op
+	void Final (unsigned char *digest);  // writes the digest into the caller's buffer
+      };
+
+      class MD5 : public OpenSSLDigest {  // OpenSSLDigest bound to EVP_md5()
+      public:
+	MD5 () : OpenSSLDigest(EVP_md5()) { }
+      };
+
+      class SHA1 : public OpenSSLDigest {  // OpenSSLDigest bound to EVP_sha1()
+      public:
+        SHA1 () : OpenSSLDigest(EVP_sha1()) { }
+      };
+
+      class SHA256 : public OpenSSLDigest {  // OpenSSLDigest bound to EVP_sha256()
+      public:
+        SHA256 () : OpenSSLDigest(EVP_sha256()) { }
+      };
+    }
+  }
+}
+#endif /*USE_OPENSSL*/
+
+
+#ifdef USE_NSS
+namespace ceph {
+  namespace crypto::nss {
+    class HMAC {  // keyed MAC computed through an NSS PK11 signing context
+    private:
+      PK11SlotInfo *slot;  // slot the key is imported into
+      PK11SymKey *symkey;  // key imported from the caller's bytes
+      PK11Context *ctx;  // signing context built on symkey
+      unsigned int digest_size;  // number of bytes Final() must produce
+    public:
+      HMAC (CK_MECHANISM_TYPE cktype, unsigned int digestsize, const unsigned char *key, size_t length) {
+        digest_size = digestsize;
+	slot = PK11_GetBestSlot(cktype, NULL);
+	if (! slot) {
+	  throw DigestException("PK11_GetBestSlot() failed");
+	}
+	SECItem keyItem;
+	keyItem.type = siBuffer;
+	keyItem.data = (unsigned char*)key;
+	keyItem.len = length;
+	symkey = PK11_ImportSymKey(slot, cktype, PK11_OriginUnwrap,
+				   CKA_SIGN,  &keyItem, NULL);
+	if (! symkey) {  // NOTE(review): slot is not released on this path (no destructor call)
+	  throw DigestException("PK11_ImportSymKey() failed");
+	}
+	SECItem param;
+	param.type = siBuffer;
+	param.data = NULL;
+	param.len = 0;
+	ctx = PK11_CreateContextBySymKey(cktype, CKA_SIGN, symkey, &param);
+	if (! ctx) {  // NOTE(review): slot and symkey are not released on this path
+	  throw DigestException("PK11_CreateContextBySymKey() failed");
+	}
+	Restart();
+      }
+      ~HMAC ();
+      void Restart() {  // begin a new MAC computation; throws DigestException on failure
+	SECStatus s;
+	s = PK11_DigestBegin(ctx);
+	if (s != SECSuccess) {
+	  throw DigestException("PK11_DigestBegin() failed");
+	}
+      }
+      void Update (const unsigned char *input, size_t length) {
+	SECStatus s;
+	s = PK11_DigestOp(ctx, input, length);
+	if (s != SECSuccess) {
+	  throw DigestException("PK11_DigestOp() failed");
+	}
+      }
+      void Final (unsigned char *digest) {  // digest must hold digest_size bytes
+	SECStatus s;
+	unsigned int written = 0;  // bytes PK11_DigestFinal() stored in digest
+	s = PK11_DigestFinal(ctx, digest, &written, digest_size);
+	if (s != SECSuccess ||
+	    written != digest_size) {  // fail on an error status OR a wrong length
+	  throw DigestException("PK11_DigestFinal() failed");
+	}
+	Restart();  // leave the object ready for the next message
+      }
+    };
+
+    class HMACSHA1 : public HMAC {  // HMAC bound to CKM_SHA_1_HMAC
+    public:
+      HMACSHA1 (const unsigned char *key, size_t length) : HMAC(CKM_SHA_1_HMAC, CEPH_CRYPTO_HMACSHA1_DIGESTSIZE, key, length) { }
+    };
+
+    class HMACSHA256 : public HMAC {  // HMAC bound to CKM_SHA256_HMAC
+    public:
+      HMACSHA256 (const unsigned char *key, size_t length) : HMAC(CKM_SHA256_HMAC, CEPH_CRYPTO_HMACSHA256_DIGESTSIZE, key, length) { }
+    };
+  }
+}
+#endif
+
+#ifdef USE_OPENSSL
+namespace ceph::crypto::ssl {
+# if OPENSSL_VERSION_NUMBER < 0x10100000L
+  class HMAC {  // variant compiled when OPENSSL_VERSION_NUMBER < 0x10100000L
+  private:
+    HMAC_CTX mContext;  // stored by value in this variant
+    const EVP_MD *mpType;  // digest algorithm, reused by Restart()
+
+  public:
+    HMAC (const EVP_MD *type, const unsigned char *key, size_t length)
+      : mpType(type) {
+      // Strict FIPS-style zeroization is probably not required here;
+      // it is done just in case.
+      ::ceph::crypto::zeroize_for_security(&mContext, sizeof(mContext));
+      const auto r = HMAC_Init_ex(&mContext, key, length, mpType, nullptr);
+      if (r != 1) {
+	  throw DigestException("HMAC_Init_ex() failed");
+      }
+    }
+    ~HMAC () {
+      HMAC_CTX_cleanup(&mContext);
+    }
+
+    void Restart () {  // re-initialise with a null key argument and the stored digest type
+      const auto r = HMAC_Init_ex(&mContext, nullptr, 0, mpType, nullptr);
+      if (r != 1) {
+	throw DigestException("HMAC_Init_ex() failed");
+      }
+    }
+    void Update (const unsigned char *input, size_t length) {
+      if (length) {  // empty input is skipped
+        const auto r = HMAC_Update(&mContext, input, length);
+	if (r != 1) {
+	  throw DigestException("HMAC_Update() failed");
+	}
+      }
+    }
+    void Final (unsigned char *digest) {
+      unsigned int s;  // receives the MAC length; not used afterwards
+      const auto r = HMAC_Final(&mContext, digest, &s);
+      if (r != 1) {
+	throw DigestException("HMAC_Final() failed");
+      }
+    }
+  };
+# else
+  class HMAC {  // variant compiled when OPENSSL_VERSION_NUMBER >= 0x10100000L
+  private:
+    HMAC_CTX *mpContext;  // allocated in the constructor, freed in the destructor
+
+  public:
+    HMAC (const EVP_MD *type, const unsigned char *key, size_t length)
+      : mpContext(HMAC_CTX_new()) {
+      if (!mpContext || HMAC_Init_ex(mpContext, key, length, type, nullptr) != 1) {
+	HMAC_CTX_free(mpContext);  // the destructor does not run when the constructor throws
+	throw DigestException("HMAC_Init_ex() failed");
+      }
+    }
+    ~HMAC () {
+      HMAC_CTX_free(mpContext);
+    }
+
+    void Restart () {  // re-initialise with a null key argument and the context's digest type
+      const EVP_MD * const type = HMAC_CTX_get_md(mpContext);
+      const auto r = HMAC_Init_ex(mpContext, nullptr, 0, type, nullptr);
+      if (r != 1) {
+	throw DigestException("HMAC_Init_ex() failed");
+      }
+    }
+    void Update (const unsigned char *input, size_t length) {
+      if (length) {  // empty input is skipped
+        const auto r = HMAC_Update(mpContext, input, length);
+	if (r != 1) {
+	  throw DigestException("HMAC_Update() failed");
+	}
+      }
+    }
+    void Final (unsigned char *digest) {
+      unsigned int s;  // receives the MAC length; not used afterwards
+      const auto r = HMAC_Final(mpContext, digest, &s);
+      if (r != 1) {
+	throw DigestException("HMAC_Final() failed");
+      }
+    }
+  };
+# endif // OPENSSL_VERSION_NUMBER < 0x10100000L
+
+  struct HMACSHA1 : public HMAC {  // HMAC bound to EVP_sha1()
+    HMACSHA1 (const unsigned char *key, size_t length)
+      : HMAC(EVP_sha1(), key, length) {
+    }
+  };
+
+  struct HMACSHA256 : public HMAC {  // HMAC bound to EVP_sha256()
+    HMACSHA256 (const unsigned char *key, size_t length)
+      : HMAC(EVP_sha256(), key, length) {
+    }
+  };
+}
+#endif /*USE_OPENSSL*/
+
+
+#if defined(USE_OPENSSL)
+namespace ceph {
+  namespace crypto {
+    using ceph::crypto::ssl::SHA256;
+    using ceph::crypto::ssl::MD5;
+    using ceph::crypto::ssl::SHA1;
+
+    using ceph::crypto::ssl::HMACSHA256;
+    using ceph::crypto::ssl::HMACSHA1;
+  }
+}
+#elif defined(USE_NSS)
+namespace ceph {
+  namespace crypto {
+    using ceph::crypto::nss::SHA256;
+    using ceph::crypto::nss::MD5;
+    using ceph::crypto::nss::SHA1;
+
+    using ceph::crypto::nss::HMACSHA256;
+    using ceph::crypto::nss::HMACSHA1;
+  }
+}
+#else
+// cppcheck-suppress preprocessorErrorDirective
+# error "No supported crypto implementation found."
+#endif
+
+#endif
diff --git a/src/common/ceph_crypto_cms.cc b/src/common/ceph_crypto_cms.cc
new file mode 100644
index 00000000..0216c589
--- /dev/null
+++ b/src/common/ceph_crypto_cms.cc
@@ -0,0 +1,343 @@
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is the Netscape security libraries.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1994-2000
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+
+#include "common/config.h"
+#include "common/debug.h"
+
+#ifdef USE_NSS
+#include <nspr.h>
+#include <cert.h>
+#include <nss.h>
+#include <smime.h>
+#endif
+
+#define dout_subsys ceph_subsys_crypto
+
+#ifndef USE_NSS
+int ceph_decode_cms(CephContext *cct, bufferlist& cms_bl, bufferlist& decoded_bl)
+{
+  return -ENOTSUP;  // variant compiled when USE_NSS is not defined
+}
+
+#else
+
+static int cms_verbose = 0;
+
+static SECStatus
+DigestFile(PLArenaPool *poolp, SECItem ***digests, SECItem *input,
+           SECAlgorithmID **algids)
+{
+    NSSCMSDigestContext *digcx;
+
+    digcx = NSS_CMSDigestContext_StartMultiple(algids);  // one digest context for all of algids
+    if (digcx == NULL)
+	return SECFailure;
+
+    NSS_CMSDigestContext_Update(digcx, input->data, input->len);  // feed the whole input in one call
+
+    return NSS_CMSDigestContext_FinishMultiple(digcx, poolp, digests);  // results go to *digests, using poolp
+}
+
+
+struct optionsStr {
+    SECCertUsage certUsage;  // passed to the cert import / verification calls in decode()
+    CERTCertDBHandle *certHandle;  // set from CERT_GetDefaultCertDB() in ceph_decode_cms()
+};
+
+struct decodeOptionsStr {
+    struct optionsStr *options;  // certificate usage and database handle
+    SECItem            content;  // when content.data is non-NULL, decode() treats it as detached content
+    int headerLevel;  // >= 0 enables the extra level-20 log lines in decode()
+    PRBool suppressContent;  // set by ceph_decode_cms(); not read elsewhere in this file
+    NSSCMSGetDecryptKeyCallback dkcb;  // handed to NSS_CMSDecoder_Start()
+    PK11SymKey *bulkkey;  // handed to NSS_CMSDecoder_Start()
+    PRBool      keepCerts;  // handed to NSS_CMSSignedData_ImportCerts()
+};
+
+static NSSCMSMessage *
+decode(CephContext *cct, SECItem *input, const struct decodeOptionsStr *decodeOptions, bufferlist& out)
+{
+    NSSCMSDecoderContext *dcx;
+    SECStatus rv;
+    NSSCMSMessage *cmsg;
+    int nlevels, i;
+    SECItem sitem;  // stays zeroed unless detached content is used below
+    bufferptr bp;
+    SECItem *item;
+
+    memset(&sitem, 0, sizeof(sitem));
+
+    PORT_SetError(0);
+    dcx = NSS_CMSDecoder_Start(NULL, 
+                               NULL, NULL,         /* content callback     */
+                               NULL, NULL,         /* password callback    */
+			       decodeOptions->dkcb, /* decrypt key callback */
+                               decodeOptions->bulkkey);
+    if (dcx == NULL) {
+	ldout(cct, 0) << "ERROR: failed to set up message decoder" << dendl;
+	return NULL;
+    }
+    rv = NSS_CMSDecoder_Update(dcx, (char *)input->data, input->len);
+    if (rv != SECSuccess) {
+	ldout(cct, 0) << "ERROR: failed to decode message" << dendl;
+	NSS_CMSDecoder_Cancel(dcx);
+	return NULL;
+    }
+    cmsg = NSS_CMSDecoder_Finish(dcx);
+    if (cmsg == NULL) {
+	ldout(cct, 0) << "ERROR: failed to decode message" << dendl;
+	return NULL;
+    }
+
+    if (decodeOptions->headerLevel >= 0) {
+	ldout(cct, 20) << "SMIME: " << dendl;
+    }
+
+    nlevels = NSS_CMSMessage_ContentLevelCount(cmsg);
+    for (i = 0; i < nlevels; i++) {  /* check every content level of the message */
+	NSSCMSContentInfo *cinfo;
+	SECOidTag typetag;
+
+	cinfo = NSS_CMSMessage_ContentLevel(cmsg, i);
+	typetag = NSS_CMSContentInfo_GetContentTypeTag(cinfo);
+
+	ldout(cct, 20) << "level=" << decodeOptions->headerLevel << "." << nlevels - i << dendl;
+
+	switch (typetag) {
+	case SEC_OID_PKCS7_SIGNED_DATA:
+	  {
+	    NSSCMSSignedData *sigd = NULL;
+	    SECItem **digests;
+	    int nsigners;
+	    int j;
+
+	    if (decodeOptions->headerLevel >= 0)
+		ldout(cct, 20) << "type=signedData; " << dendl;
+	    sigd = (NSSCMSSignedData *)NSS_CMSContentInfo_GetContent(cinfo);
+	    if (sigd == NULL) {
+		ldout(cct, 0) << "ERROR: signedData component missing" << dendl;
+		goto loser;
+	    }
+
+	    /* if we have a content file, but no digests for this signedData */
+	    if (decodeOptions->content.data != NULL && 
+	        !NSS_CMSSignedData_HasDigests(sigd)) {
+		PLArenaPool     *poolp;
+		SECAlgorithmID **digestalgs;
+
+		/* detached content: grab content file */
+		sitem = decodeOptions->content;
+
+		if ((poolp = PORT_NewArena(1024)) == NULL) {
+		    ldout(cct, 0) << "ERROR: Out of memory" << dendl;
+		    goto loser;
+		}
+		digestalgs = NSS_CMSSignedData_GetDigestAlgs(sigd);
+		if (DigestFile (poolp, &digests, &sitem, digestalgs) 
+		      != SECSuccess) {
+		    ldout(cct, 0) << "ERROR: problem computing message digest" << dendl;
+		    PORT_FreeArena(poolp, PR_FALSE);
+		    goto loser;
+		}
+		if (NSS_CMSSignedData_SetDigests(sigd, digestalgs, digests) 
+		    != SECSuccess) {
+		    ldout(cct, 0) << "ERROR: problem setting message digests" << dendl;
+		    PORT_FreeArena(poolp, PR_FALSE);
+		    goto loser;
+		}
+		PORT_FreeArena(poolp, PR_FALSE);
+	    }
+
+	    /* import the certificates */
+	    if (NSS_CMSSignedData_ImportCerts(sigd, 
+	                                   decodeOptions->options->certHandle, 
+	                                   decodeOptions->options->certUsage, 
+	                                   decodeOptions->keepCerts) 
+	          != SECSuccess) {
+		ldout(cct, 0) << "ERROR: cert import failed" << dendl;
+		goto loser;
+	    }
+
+	    /* find out about signers */
+	    nsigners = NSS_CMSSignedData_SignerInfoCount(sigd);
+	    if (decodeOptions->headerLevel >= 0)
+		ldout(cct, 20) << "nsigners=" << nsigners << dendl;
+	    if (nsigners == 0) {
+		/* Might be a cert transport message
+		** or might be an invalid message, such as a QA test message
+		** or a message from an attacker.
+		*/
+		SECStatus rv;
+		rv = NSS_CMSSignedData_VerifyCertsOnly(sigd, 
+		                            decodeOptions->options->certHandle, 
+		                            decodeOptions->options->certUsage);
+		if (rv != SECSuccess) {
+		    ldout(cct, 0) << "ERROR: Verify certs-only failed!" << dendl;
+		    goto loser;
+		}
+		return cmsg;  /* returns early: nothing is appended to out on this path */
+	    }
+
+	    /* still no digests? */
+	    if (!NSS_CMSSignedData_HasDigests(sigd)) {
+		ldout(cct, 0) << "ERROR: no message digests" << dendl;
+		goto loser;
+	    }
+
+	    for (j = 0; j < nsigners; j++) {
+		const char * svs;
+		NSSCMSSignerInfo *si;
+		NSSCMSVerificationStatus vs;
+		SECStatus bad;
+
+		si = NSS_CMSSignedData_GetSignerInfo(sigd, j);
+		if (decodeOptions->headerLevel >= 0) {
+		    char *signercn;
+		    static char empty[] = { "" };
+
+		    signercn = NSS_CMSSignerInfo_GetSignerCommonName(si);
+		    if (signercn == NULL)
+			signercn = empty;
+		    ldout(cct, 20) << "\t\tsigner" << j << ".id=" << signercn << dendl;
+		    if (signercn != empty)
+		        PORT_Free(signercn);
+		}
+		bad = NSS_CMSSignedData_VerifySignerInfo(sigd, j, 
+		                           decodeOptions->options->certHandle, 
+		                           decodeOptions->options->certUsage);
+		vs  = NSS_CMSSignerInfo_GetVerificationStatus(si);
+		svs = NSS_CMSUtil_VerificationStatusToString(vs);
+		if (decodeOptions->headerLevel >= 0) {
+		    ldout(cct, 20) << "signer" << j << "status=" << svs << dendl;
+		    /* goto loser ? */
+		} else if (bad) {
+		    ldout(cct, 0) << "ERROR: signer " << j << " status = " << svs << dendl;
+		    goto loser;
+		}
+	    }
+	  }
+	  break;
+	case SEC_OID_PKCS7_ENVELOPED_DATA:
+	  {
+	    NSSCMSEnvelopedData *envd;
+	    if (decodeOptions->headerLevel >= 0)
+		ldout(cct, 20) << "type=envelopedData; " << dendl;
+	    envd = (NSSCMSEnvelopedData *)NSS_CMSContentInfo_GetContent(cinfo);
+	    if (envd == NULL) {
+		ldout(cct, 0) << "ERROR: envelopedData component missing" << dendl;
+		goto loser;
+	    }
+	  }
+	  break;
+	case SEC_OID_PKCS7_ENCRYPTED_DATA:
+	  {
+	    NSSCMSEncryptedData *encd;
+	    if (decodeOptions->headerLevel >= 0)
+		ldout(cct, 20) << "type=encryptedData; " << dendl;
+	    encd = (NSSCMSEncryptedData *)NSS_CMSContentInfo_GetContent(cinfo);
+	    if (encd == NULL) {
+		ldout(cct, 0) << "ERROR: encryptedData component missing" << dendl;
+		goto loser;
+	    }
+	  }
+	  break;
+	case SEC_OID_PKCS7_DATA:
+	    if (decodeOptions->headerLevel >= 0)
+		ldout(cct, 20) << "type=data; " << dendl;
+	    break;
+	default:
+	    break;
+	}
+    }
+    item = (sitem.data ? &sitem : NSS_CMSMessage_GetContent(cmsg));  /* detached content wins when present */
+    if (item == NULL) { ldout(cct, 0) << "ERROR: message has no content" << dendl; goto loser; }
+    out.append((char *)item->data, item->len);
+    return cmsg;
+
+loser:
+    if (cmsg)
+	NSS_CMSMessage_Destroy(cmsg);
+    return NULL;
+}
+
+int ceph_decode_cms(CephContext *cct, bufferlist& cms_bl, bufferlist& decoded_bl)
+{
+    NSSCMSMessage *cmsg = NULL;
+    struct decodeOptionsStr decodeOptions = { };
+    struct optionsStr options;
+    SECItem input;
+
+    memset(&options, 0, sizeof(options));
+    memset(&input, 0, sizeof(input));
+
+    input.data = (unsigned char *)cms_bl.c_str();  // input points at the caller's buffer
+    input.len = cms_bl.length();
+
+    decodeOptions.content.data = NULL;  // no detached content is supplied
+    decodeOptions.content.len  = 0;
+    decodeOptions.suppressContent = PR_FALSE;
+    decodeOptions.headerLevel = -1;  // negative: decode() skips its headerLevel-gated log lines
+    decodeOptions.keepCerts = PR_FALSE;
+    options.certUsage = certUsageEmailSigner;
+
+    options.certHandle = CERT_GetDefaultCertDB();
+    if (!options.certHandle) {
+	ldout(cct, 0) << "ERROR: No default cert DB" << dendl;
+	return -EIO;
+    }
+    if (cms_verbose) {  // cms_verbose is a file-local flag initialised to 0
+	fprintf(stderr, "Got default certdb\n");
+    }
+
+    decodeOptions.options = &options;
+
+    int ret = 0;
+
+    cmsg = decode(cct, &input, &decodeOptions, decoded_bl);  // appends the content to decoded_bl
+    if (!cmsg) {
+        ldout(cct, 0) << "ERROR: problem decoding" << dendl;
+	ret = -EINVAL;
+    }
+
+    if (cmsg)
+	NSS_CMSMessage_Destroy(cmsg);  // the message object is not needed after decode()
+
+    SECITEM_FreeItem(&decodeOptions.content, PR_FALSE);  // content.data was set to NULL above
+
+    return ret;  // 0 on success, -EINVAL when decode() failed
+}
+#endif
diff --git a/src/common/ceph_crypto_cms.h b/src/common/ceph_crypto_cms.h
new file mode 100644
index 00000000..11fb000c
--- /dev/null
+++ b/src/common/ceph_crypto_cms.h
@@ -0,0 +1,10 @@
+#ifndef CEPH_CRYPTO_CMS_H
+#define CEPH_CRYPTO_CMS_H
+
+#include "include/buffer_fwd.h"
+
+class CephContext;
+
+int ceph_decode_cms(CephContext *cct, bufferlist& cms_bl, bufferlist& decoded_bl);
+
+#endif
diff --git a/src/common/ceph_frag.cc b/src/common/ceph_frag.cc
new file mode 100644
index 00000000..444b910c
--- /dev/null
+++ b/src/common/ceph_frag.cc
@@ -0,0 +1,21 @@
+/*
+ * Ceph 'frag' type
+ */
+#include "include/types.h"
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+	/* Order by frag value first; the bit count only breaks ties. */
+	const unsigned value_a = ceph_frag_value(a);
+	const unsigned value_b = ceph_frag_value(b);
+	if (value_a != value_b)
+		return value_a < value_b ? -1 : 1;
+
+	const unsigned bits_a = ceph_frag_bits(a);
+	const unsigned bits_b = ceph_frag_bits(b);
+	if (bits_a != bits_b)
+		return bits_a < bits_b ? -1 : 1;
+
+	/* Same value and same number of bits. */
+	return 0;
+}
diff --git a/src/common/ceph_fs.cc b/src/common/ceph_fs.cc
new file mode 100644
index 00000000..7a4b59f8
--- /dev/null
+++ b/src/common/ceph_fs.cc
@@ -0,0 +1,85 @@
+/*
+ * ceph_fs.cc - Some Ceph functions that are shared between kernel space and
+ * user space.
+ *
+ */
+
+/*
+ * Some non-inline ceph helpers
+ */
+#include "include/types.h"
+
+int ceph_flags_to_mode(int flags)
+{
+	/* CEPH_FILE_MODE_PIN is zero, so -1 is used as the error value */
+	int mode = -1;
+
+	if ((flags & CEPH_O_DIRECTORY) == CEPH_O_DIRECTORY)
+		return CEPH_FILE_MODE_PIN;	/* directories: returned before the access mode is examined */
+
+	switch (flags & O_ACCMODE) {	/* an unmatched access mode leaves mode at -1 */
+	case CEPH_O_WRONLY:
+		mode = CEPH_FILE_MODE_WR;
+		break;
+	case CEPH_O_RDONLY:
+		mode = CEPH_FILE_MODE_RD;
+		break;
+	case CEPH_O_RDWR:
+	case O_ACCMODE: /* this is what the VFS does */
+		mode = CEPH_FILE_MODE_RDWR;
+		break;
+	}
+
+	if (flags & CEPH_O_LAZY)
+		mode |= CEPH_FILE_MODE_LAZY;	/* if mode is still -1, OR-ing leaves it at -1 */
+
+	return mode;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+	/* CEPH_CAP_PIN is always included; each mode bit adds its own caps. */
+	const int rd_caps = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+			    CEPH_CAP_FILE_CACHE;
+	const int wr_caps = CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_WR |
+			    CEPH_CAP_FILE_BUFFER |
+			    CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			    CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	int result = CEPH_CAP_PIN;
+
+	result |= (mode & CEPH_FILE_MODE_RD) ? rd_caps : 0;
+	result |= (mode & CEPH_FILE_MODE_WR) ? wr_caps : 0;
+	result |= (mode & CEPH_FILE_MODE_LAZY) ? CEPH_CAP_FILE_LAZYIO : 0;
+
+	return result;
+}
+
+int ceph_flags_sys2wire(int flags)
+{
+       int wire_flags = 0;  /* result, built from CEPH_O_* values only */
+
+       switch (flags & O_ACCMODE) {  /* translate the access mode first */
+       case O_RDONLY:
+               wire_flags |= CEPH_O_RDONLY;
+               break;
+       case O_WRONLY:
+               wire_flags |= CEPH_O_WRONLY;
+               break;
+       case O_RDWR:
+               wire_flags |= CEPH_O_RDWR;
+               break;
+       }
+       flags &= ~O_ACCMODE;  /* access-mode bits are handled; clear them */
+
+#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; }
+
+       ceph_sys2wire(O_CREAT);  /* each call maps one O_* bit to its CEPH_O_* counterpart */
+       ceph_sys2wire(O_EXCL);
+       ceph_sys2wire(O_TRUNC);
+       ceph_sys2wire(O_DIRECTORY);
+       ceph_sys2wire(O_NOFOLLOW);
+
+#undef ceph_sys2wire
+
+       return wire_flags;  /* bits still set in flags are not carried over */
+}
diff --git a/src/common/ceph_hash.cc b/src/common/ceph_hash.cc
new file mode 100644
index 00000000..061926d2
--- /dev/null
+++ b/src/common/ceph_hash.cc
@@ -0,0 +1,128 @@
+
+#include "include/types.h"
+
+/*
+ * Robert Jenkins' hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c)						\
+	do {							\
+		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+	} while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+	const unsigned char *k = (const unsigned char *)str;
+	__u32 a, b, c;  /* the internal state */
+	__u32 len;      /* how many key bytes still need mixing */
+
+	/* Set up the internal state */
+	len = length;
+	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+	b = a;
+	c = 0;               /* variable initialization of internal state */
+
+	/* handle most of the key, 12 bytes (three little-endian words) at a time */
+	while (len >= 12) {
+		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+			 ((__u32)k[3] << 24));
+		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+			 ((__u32)k[7] << 24));
+		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+			 ((__u32)k[11] << 24));
+		mix(a, b, c);
+		k = k + 12;
+		len = len - 12;
+	}
+
+	/* handle the remaining bytes (at most 11) */
+	c = c + length;
+	switch (len) {            /* all the case statements fall through */
+	case 11:
+		c = c + ((__u32)k[10] << 24);
+	case 10:
+		c = c + ((__u32)k[9] << 16);
+	case 9:
+		c = c + ((__u32)k[8] << 8);
+		/* the first byte of c is reserved for the length */
+	case 8:
+		b = b + ((__u32)k[7] << 24);
+	case 7:
+		b = b + ((__u32)k[6] << 16);
+	case 6:
+		b = b + ((__u32)k[5] << 8);
+	case 5:
+		b = b + k[4];
+	case 4:
+		a = a + ((__u32)k[3] << 24);
+	case 3:
+		a = a + ((__u32)k[2] << 16);
+	case 2:
+		a = a + ((__u32)k[1] << 8);
+	case 1:
+		a = a + k[0];
+		/* case 0: nothing left to add */
+	}
+	mix(a, b, c);	/* final mix; c is the resulting hash */
+
+	return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+	/* fold every byte, shifted left and right by four bits, into acc */
+	unsigned acc = 0;
+	for (unsigned i = 0; i < length; ++i) {
+		const unsigned char byte = str[i];
+		acc = (acc + (byte << 4) + (byte >> 4)) * 11;
+	}
+	return acc;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+	/* dispatch on the requested hash type */
+	if (type == CEPH_STR_HASH_LINUX)
+		return ceph_str_hash_linux(s, len);
+	if (type == CEPH_STR_HASH_RJENKINS)
+		return ceph_str_hash_rjenkins(s, len);
+
+	/* unknown type: -1 converted to unsigned */
+	return -1;
+}
+
+const char *ceph_str_hash_name(int type)
+{
+	/* map a hash type id to its printable name */
+	if (type == CEPH_STR_HASH_LINUX)
+		return "linux";
+	if (type == CEPH_STR_HASH_RJENKINS)
+		return "rjenkins";
+
+	/* any other id has no name here */
+	return "unknown";
+}
+
+bool ceph_str_hash_valid(int type)
+{
+        /*
+         * Only the hash types implemented in this file are accepted.
+         */
+        const bool is_linux = (type == CEPH_STR_HASH_LINUX);
+        const bool is_rjenkins = (type == CEPH_STR_HASH_RJENKINS);
+
+        return is_linux || is_rjenkins;
+}
diff --git a/src/common/ceph_json.cc b/src/common/ceph_json.cc
new file mode 100644
index 00000000..c1b77c0e
--- /dev/null
+++ b/src/common/ceph_json.cc
@@ -0,0 +1,934 @@
+#include "common/ceph_json.h"
+#include "include/utime.h"
+
+// for testing DELETE ME
+#include <fstream>
+#include <include/types.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "json_spirit/json_spirit_writer_template.h"
+
+using namespace json_spirit;
+
+#define dout_subsys ceph_subsys_rgw
+
+
+static JSONFormattable default_formattable;
+
+
+// Dump v as `name`: via the string encoder if quoted, otherwise as raw unquoted text.
+void encode_json(const char *name, const JSONObj::data_val& v, Formatter *f) {
+  if (!v.quoted) {
+    f->dump_format_unquoted(name, "%s", v.str.c_str());
+    return;
+  }
+  encode_json(name, v.str, f);
+}
+
+// Trivial constructor: cur and last stay default-initialized until set().
+JSONObjIter::JSONObjIter()
+{}
+
+// Trivial destructor: nothing to release.
+JSONObjIter::~JSONObjIter()
+{}
+
+// Point the iterator at the range [_cur, _last).
+void JSONObjIter::set(const JSONObjIter::map_iter_t &_cur, const JSONObjIter::map_iter_t &_last) {
+  this->cur = _cur;
+  this->last = _last;
+}
+
+// Advance one element; a no-op once cur has reached last.
+void JSONObjIter::operator++() {
+  if (cur == last) return;
+  ++cur;
+}
+
+// Returns the JSONObj* stored at cur; no end() check is made here.
+JSONObj *JSONObjIter::operator*() {
+  return (*cur).second;
+}
+
+// Streams obj as "<name>: <val>".
+// does not work, FIXME
+ostream& operator<<(ostream &out, const JSONObj &obj) {
+  return out << obj.name << ": " << obj.val;
+}
+
+// Deletes every child pointer stored in `children`; deleting a child
+// runs its own destructor, so the whole subtree is released.
+JSONObj::~JSONObj()
+{
+  for (auto& entry : children) {
+    delete entry.second;
+  }
+}
+
+
+// Register obj under key el; duplicate keys are kept (children is a multimap).
+void JSONObj::add_child(string el, JSONObj *obj) {
+  children.emplace(el, obj);
+}
+
+// Copy attribute `name` into attr; false (attr untouched) if it is absent.
+bool JSONObj::get_attr(string name, data_val& attr) {
+  const auto found = attr_map.find(name);
+  const bool present = (found != attr_map.end());
+  if (present)
+    attr = found->second;
+  return present;
+}
+
+// Returns an iterator over every child stored under `name`; when nothing
+// matches, the iterator is returned as default-constructed.
+JSONObjIter JSONObj::find(const string& name)
+{
+  JSONObjIter iter;
+  // children is a multimap: deduce its iterator type, don't name map<>::iterator
+  auto range = children.equal_range(name);
+  if (range.first != range.second) {
+    iter.set(range.first, range.second);
+  }
+  return iter;
+}
+
+// Iterator spanning all children, from children.begin() to children.end().
+JSONObjIter JSONObj::find_first() {
+  JSONObjIter all;
+  all.set(children.begin(), children.end());
+  return all;
+}
+
+// Iterator starting at a child named `name` and running to children.end()
+// (so later keys follow); it is at end() immediately when `name` is absent.
+JSONObjIter JSONObj::find_first(const string& name)
+{
+  JSONObjIter iter;
+  iter.set(children.find(name), children.end());  // no map<>::iterator local: children is a multimap
+  return iter;
+}
+
+// Child found under `name` via find(), or NULL when there is none.
+JSONObj *JSONObj::find_obj(const string& name) {
+  JSONObjIter match = find(name);
+  if (!match.end()) {
+    return *match;
+  }
+  return NULL;
+}
+
+// Look up the child `key` and copy its data_val into *dest.
+// Returns false, leaving *dest alone, when no such child exists.
+bool JSONObj::get_data(const string& key, data_val *dest) {
+  JSONObj *child = find_obj(key);
+  if (child == NULL) {
+    return false;
+  }
+  *dest = child->get_data_val();
+  return true;
+}
+
+/* accepts a JSON Array or JSON Object contained in
+ * a JSON Spirit Value, v,  and creates a JSONObj for each
+ * child contained in v. Values of any other type create nothing.
+ * Each child is allocated with new and handed to add_child().
+ */
+void JSONObj::handle_value(Value v)
+{
+  if (v.type() == obj_type) {
+    // Walk the members by reference: copying the whole Object and every
+    // Pair first is unnecessary because init() takes its own copies.
+    const Object& members = v.get_obj();
+    for (Object::size_type i = 0; i < members.size(); i++) {
+      const Pair& member = members[i];
+      JSONObj *child = new JSONObj;
+      child->init(this, member.value_, member.name_);
+      add_child(member.name_, child);
+    }
+  } else if (v.type() == array_type) {
+    const Array& elements = v.get_array();
+    const string no_name;
+
+    // Array elements have no name of their own, so every child is
+    // initialised with, and registered under, the empty name.
+    for (unsigned j = 0; j < elements.size(); j++) {
+      JSONObj *child = new JSONObj;
+      child->init(this, elements[j], no_name);
+      add_child(child->get_name(), child);
+    }
+  }
+}
+
+// Fill this node from v: set name/parent/data, create children, cache the
+// text form in val (quoted only for strings) and mirror it into attr_map.
+void JSONObj::init(JSONObj *p, Value v, string n) {
+  name = n;
+  parent = p;
+  data = v;
+  handle_value(v);
+  if (v.type() != str_type) {
+    val.set(json_spirit::write_string(v), false);
+  } else {
+    val.set(v.get_str(), true);
+  }
+  attr_map.insert(pair<string,data_val>(name, val));
+}
+
+// Parent set by init(), or NULL as set by the constructor.
+JSONObj *JSONObj::get_parent() {
+  return this->parent;
+}
+
+// True when the stored Value has type obj_type.
+bool JSONObj::is_object() {
+  return obj_type == data.type();
+}
+
+// True when the stored Value has type array_type.
+bool JSONObj::is_array() {
+  return array_type == data.type();
+}
+
+// Serialise every element of an array node with write(..., raw_utf8) and
+// return the strings in element order. A node whose data is not an
+// array yields an empty vector.
+vector<string> JSONObj::get_array_elements()
+{
+  vector<string> out;
+  if (data.type() != array_type) {
+    return out;
+  }
+  const Array& items = data.get_array();
+  const int count = items.size();
+  for (int idx = 0; idx < count; ++idx) {
+    const Value& item = items[idx];
+    string text = write(item, raw_utf8);
+    out.push_back(text);
+  }
+
+  return out;
+}
+
+// Starts with buf_len at zero and the success flag set.
+JSONParser::JSONParser() : buf_len(0), success(true)
+{}
+
+// Nothing parser-specific to release.
+JSONParser::~JSONParser()
+{}
+
+
+
+// Append len bytes from s to the internal buffer and add len to buf_len.
+void JSONParser::handle_data(const char *s, int len) {
+  json_buffer.append(s, len); // check for problems with null termination FIXME
+  buf_len = buf_len + len;
+}
+
+// Parse len bytes of JSON from buf_ into this object. A NULL buf_ or a
+// read() failure marks the parser failed and returns false. For scalar
+// documents the text form is cached in val (quoted only for strings).
+bool JSONParser::parse(const char *buf_, int len)
+{
+  if (buf_ == NULL) {
+    set_failure();
+    return false;
+  }
+
+  string text(buf_, len);
+  success = read(text, data);
+  if (!success) {
+    set_failure();
+    return success;
+  }
+
+  handle_value(data);
+  const bool is_container = (data.type() == obj_type || data.type() == array_type);
+  if (!is_container && data.type() == str_type) {
+    val.set(data.get_str(), true);
+  } else if (!is_container) {
+    val.set(json_spirit::write_string(data), false);
+  }
+  return success;
+}
+
+// Parse only the first len bytes accumulated in json_buffer; on success
+// the children are built, otherwise the parser is marked failed.
+bool JSONParser::parse(int len) {
+  string prefix = json_buffer.substr(0, len);
+  success = read(prefix, data);
+  if (!success) {
+    set_failure();
+    return success;
+  }
+  handle_value(data);
+  return success;
+}
+
+// Parse everything accumulated in json_buffer; builds the children on
+// success and marks the parser failed otherwise.
+bool JSONParser::parse() {
+  success = read(json_buffer, data);
+  if (!success) {
+    set_failure();
+    return success;
+  }
+  handle_value(data);
+  return success;
+}
+
+// Test-only helper (DELETE ME): parse the JSON document read from the
+// file called file_name.
+bool JSONParser::parse(const char *file_name) {
+  ifstream input(file_name);
+  success = read(input, data);
+  if (!success) {
+    set_failure();
+    return success;
+  }
+  handle_value(data);
+  return success;
+}
+
+
+// Decode obj's text as a base-10 long. Throws JSONDecoder::err on a strtol()
+// error, when no digits are consumed, or on trailing non-whitespace.
+void decode_json_obj(long& val, JSONObj *obj)
+{
+  const string& s = obj->get_data();
+  const char *start = s.c_str();
+  char *p;
+
+  errno = 0;
+  val = strtol(start, &p, 10);
+
+  /* Check for various possible errors */
+  if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) ||
+      (errno != 0 && val == 0)) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+  if (p == start) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  for (; *p != '\0'; p++) {
+    // isspace() is undefined for negative char values, so go via unsigned char
+    if (!isspace(static_cast<unsigned char>(*p))) {
+      throw JSONDecoder::err("failed to parse number");
+    }
+  }
+}
+
+// Decode obj's text as a base-10 unsigned long. Throws JSONDecoder::err on a
+// strtoul() error, when no digits are consumed, or on trailing non-whitespace.
+void decode_json_obj(unsigned long& val, JSONObj *obj)
+{
+  const string& s = obj->get_data();
+  const char *start = s.c_str();
+  char *p;
+
+  errno = 0;
+  val = strtoul(start, &p, 10);
+
+  /* Check for various possible errors */
+  if ((errno == ERANGE && val == ULONG_MAX) ||
+      (errno != 0 && val == 0)) {
+    throw JSONDecoder::err("failed to parse number");  // was the garbled "failed to number"
+  }
+  if (p == start) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  for (; *p != '\0'; p++) {
+    // isspace() is undefined for negative char values, so go via unsigned char
+    if (!isspace(static_cast<unsigned char>(*p))) {
+      throw JSONDecoder::err("failed to parse number");
+    }
+  }
+}
+
+// Decode obj's text as a base-10 long long. Throws JSONDecoder::err on a
+// strtoll() error, when no digits are consumed, or on trailing non-whitespace.
+void decode_json_obj(long long& val, JSONObj *obj)
+{
+  const string& s = obj->get_data();
+  const char *start = s.c_str();
+  char *p;
+
+  errno = 0;
+  val = strtoll(start, &p, 10);
+
+  /* Check for various possible errors */
+  if ((errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) ||
+      (errno != 0 && val == 0)) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+  if (p == start) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  for (; *p != '\0'; p++) {
+    // isspace() is undefined for negative char values, so go via unsigned char
+    if (!isspace(static_cast<unsigned char>(*p))) {
+      throw JSONDecoder::err("failed to parse number");
+    }
+  }
+}
+
+// Decode obj's text as a base-10 unsigned long long. Throws JSONDecoder::err on
+// a strtoull() error, when no digits are consumed, or on trailing non-whitespace.
+void decode_json_obj(unsigned long long& val, JSONObj *obj)
+{
+  const string& s = obj->get_data();
+  const char *start = s.c_str();
+  char *p;
+
+  errno = 0;
+  val = strtoull(start, &p, 10);
+
+  /* Check for various possible errors */
+  if ((errno == ERANGE && val == ULLONG_MAX) ||
+      (errno != 0 && val == 0)) {
+    throw JSONDecoder::err("failed to parse number");  // was the garbled "failed to number"
+  }
+  if (p == start) {
+    throw JSONDecoder::err("failed to parse number");
+  }
+
+  for (; *p != '\0'; p++) {
+    // isspace() is undefined for negative char values, so go via unsigned char
+    if (!isspace(static_cast<unsigned char>(*p))) {
+      throw JSONDecoder::err("failed to parse number");
+    }
+  }
+}
+
+// Decode as long first, then narrow to int; throws JSONDecoder::err if
+// the value does not fit (only checked where long is wider than int).
+void decode_json_obj(int& val, JSONObj *obj) {
+  long wide;
+  decode_json_obj(wide, obj);
+#if LONG_MAX > INT_MAX
+  if (wide < INT_MIN || wide > INT_MAX) {
+    throw JSONDecoder::err("integer out of range");
+  }
+#endif
+  val = static_cast<int>(wide);
+}
+
+// Decode as unsigned long first, then narrow; throws JSONDecoder::err if the
+// value does not fit (only checked where unsigned long is the wider type).
+void decode_json_obj(unsigned& val, JSONObj *obj) {
+  unsigned long wide;
+  decode_json_obj(wide, obj);
+#if ULONG_MAX > UINT_MAX
+  if (wide > UINT_MAX) {
+    throw JSONDecoder::err("unsigned integer out of range");
+  }
+#endif
+  val = static_cast<unsigned>(wide);
+}
+
+// Accepts "true"/"false" in any letter case; anything else is decoded
+// as an int (which may throw) and converted, so non-zero means true.
+void decode_json_obj(bool& val, JSONObj *obj) {
+  string text = obj->get_data();
+  const bool is_true = (strcasecmp(text.c_str(), "true") == 0);
+  const bool is_false = !is_true && (strcasecmp(text.c_str(), "false") == 0);
+  if (is_true || is_false) {
+    val = is_true;
+    return;
+  }
+
+  int number;
+  decode_json_obj(number, obj);
+  val = (number != 0);
+}
+
+// Treat obj's text as base64 and decode it into val; a buffer::error
+// from decode_base64() is rethrown as JSONDecoder::err.
+void decode_json_obj(bufferlist& val, JSONObj *obj) {
+  string encoded = obj->get_data();
+  bufferlist src;
+  src.append(encoded.c_str(), encoded.size());
+  try {
+    val.decode_base64(src);
+  } catch (buffer::error& e) {
+    throw JSONDecoder::err("failed to decode base64");
+  }
+}
+
+// Parse obj's text with utime_t::parse_date(); a non-zero result is
+// reported as JSONDecoder::err, otherwise val gets (epoch, nsec).
+void decode_json_obj(utime_t& val, JSONObj *obj) {
+  string text = obj->get_data();
+  uint64_t epoch;
+  uint64_t nsec;
+  if (utime_t::parse_date(text, &epoch, &nsec) != 0) {
+    throw JSONDecoder::err("failed to decode utime_t");
+  }
+
+  val = utime_t(epoch, nsec);
+}
+
+// Strings are emitted with dump_string().
+void encode_json(const char *name, const string& val, Formatter *f) {
+  f->dump_string(name, val);
+}
+
+// C strings are emitted with dump_string() as well.
+void encode_json(const char *name, const char *val, Formatter *f) {
+  f->dump_string(name, val);
+}
+
+// Booleans are written through dump_string() as the text "true" or
+// "false".
+void encode_json(const char *name, bool val, Formatter *f)
+{
+  string text = "false";
+  if (val) {
+    text = "true";
+  }
+  f->dump_string(name, text);
+}
+
+// int goes through dump_int().
+void encode_json(const char *name, int val, Formatter *f) {
+  f->dump_int(name, val);
+}
+
+// long goes through dump_int().
+void encode_json(const char *name, long val, Formatter *f) {
+  f->dump_int(name, val);
+}
+
+// unsigned goes through dump_unsigned().
+void encode_json(const char *name, unsigned val, Formatter *f) {
+  f->dump_unsigned(name, val);
+}
+
+// unsigned long goes through dump_unsigned().
+void encode_json(const char *name, unsigned long val, Formatter *f) {
+  f->dump_unsigned(name, val);
+}
+
+// unsigned long long goes through dump_unsigned().
+void encode_json(const char *name, unsigned long long val, Formatter *f) {
+  f->dump_unsigned(name, val);
+}
+
+// long long goes through dump_int().
+void encode_json(const char *name, long long val, Formatter *f) {
+  f->dump_int(name, val);
+}
+
+// Times are written by val.gmtime() into the stream from dump_stream(name).
+void encode_json(const char *name, const utime_t& val, Formatter *f) {
+  val.gmtime(f->dump_stream(name));
+}
+
+// Base64-encode the contents of bl and emit the result as a string
+// under `name`.
+void encode_json(const char *name, const bufferlist& bl, Formatter *f) {
+  /* need to copy data from bl, as it is const bufferlist */
+  bufferlist copy(bl);
+  bufferlist encoded;
+  copy.encode_base64(encoded);
+
+  string text(encoded.c_str(), encoded.length());
+
+  encode_json(name, text, f);
+}
+
+
+
+/* JSONFormattable */
+
+// Read-only member lookup; a missing key yields the shared default_formattable.
+const JSONFormattable& JSONFormattable::operator[](const string& name) const {
+  const auto entry = obj.find(name);
+  if (entry != obj.end()) {
+    return entry->second;
+  }
+  return default_formattable;
+}
+
+// Read-only element lookup; out-of-range indexes yield default_formattable.
+const JSONFormattable& JSONFormattable::operator[](size_t index) const {
+  if (index < arr.size()) {
+    return arr[index];
+  }
+  return default_formattable;
+}
+
+// Mutable member lookup; a missing key returns the file-static default_formattable.
+JSONFormattable& JSONFormattable::operator[](const string& name) {
+  const auto entry = obj.find(name);
+  if (entry != obj.end()) {
+    return entry->second;
+  }
+  return default_formattable;
+}
+
+// Mutable element lookup; out-of-range indexes return default_formattable.
+JSONFormattable& JSONFormattable::operator[](size_t index) {
+  if (index < arr.size()) {
+    return arr[index];
+  }
+  return default_formattable;
+}
+
+// True when obj holds an entry for `name`.
+bool JSONFormattable::exists(const string& name) const {
+  const bool missing = (obj.find(name) == obj.end());
+  return !missing;
+}
+
+// True when `index` addresses an existing element of arr.
+bool JSONFormattable::exists(size_t index) const {
+  return !(index >= arr.size());
+}
+
+// Copy the val() of member `name` into *val; false (and *val untouched)
+// when there is no such member.
+bool JSONFormattable::find(const string& name, string *val) const {
+  const auto entry = obj.find(name);
+  const bool found = (entry != obj.end());
+  if (found)
+    *val = entry->second.val();
+  return found;
+}
+
+// atoi() of the value text; non-numeric
+// text is not reported as an error here.
+int JSONFormattable::val_int() const { return atoi(value.str.c_str()); }
+
+// atol() of the value text, with the
+// same lack of error reporting.
+long JSONFormattable::val_long() const { return atol(value.str.c_str()); }
+
+// atoll() of the value text, with the
+// same lack of error reporting.
+long long JSONFormattable::val_long_long() const { return atoll(value.str.c_str()); }
+
+// True when the value text equals "true", "on", "yes" or "1", ignoring case.
+bool JSONFormattable::val_bool() const {
+  const string& text = value.str;
+  if (boost::iequals(text, "true") || boost::iequals(text, "on")) return true;
+  return boost::iequals(text, "yes") || boost::iequals(text, "1");
+}
+
+// val() of this node, or def_val when the node is unset (FMT_NONE).
+string JSONFormattable::def(const string& def_val) const {
+  if (type != FMT_NONE)
+    return val();
+  return def_val;
+}
+
+// val_int() of this node, or def_val when the node is unset (FMT_NONE).
+int JSONFormattable::def(int def_val) const {
+  if (type != FMT_NONE)
+    return val_int();
+  return def_val;
+}
+
+// val_bool() of this node, or def_val when the node is unset (FMT_NONE).
+bool JSONFormattable::def(bool def_val) const {
+  if (type != FMT_NONE)
+    return val_bool();
+  return def_val;
+}
+
+// Member `name` as a string, or def_val when that member is unset.
+string JSONFormattable::get(const string& name, const string& def_val) const {
+  return operator[](name).def(def_val);
+}
+
+// Member `name` as an int, or def_val when that member is unset.
+int JSONFormattable::get_int(const string& name, int def_val) const {
+  return operator[](name).def(def_val);
+}
+
+// Member `name` as a bool, or def_val when that member is unset.
+bool JSONFormattable::get_bool(const string& name, bool def_val) const {
+  return operator[](name).def(def_val);
+}
+
+struct field_entity { /* one path component, as produced by parse_entity() */
+  bool is_obj{false}; /* true: obj field (uses name); false: array entity (uses index/append) */
+  string name; /* field name, if obj */
+  int index{0}; /* element index, if array; comes from atoi() so it may be negative */
+  bool append{false}; /* set by parse_entity() for an empty "[]" */
+
+  field_entity() {} /* array entity, index 0, not appending */
+  explicit field_entity(const string& n) : is_obj(true), name(n) {} /* obj field named n */
+  explicit field_entity(int i) : is_obj(false), index(i) {} /* array entity at index i */
+};
+
+// Split one path token such as "name[2][]" into entities appended to
+// *result: field names and bracketed indexes in order of appearance
+// ("[]" means append). Returns 0 on success and -EINVAL for an
+// unterminated '[' or for text left over after the last ']'.
+static int parse_entity(const string& s, vector<field_entity> *result)
+{
+  size_t pos = 0;
+  while (pos < s.size()) {
+    const size_t lbracket = s.find('[', pos);
+    if (lbracket == string::npos) {
+      // no (further) bracket: only valid if nothing was consumed yet
+      if (pos != 0) {
+        return -EINVAL;
+      }
+      result->push_back(field_entity(s));
+      return 0;
+    }
+    if (lbracket > pos) {
+      // text in front of the bracket is a field name
+      result->push_back(field_entity(s.substr(pos, lbracket - pos)));
+    }
+    const size_t rbracket = s.find(']', lbracket + 1);
+    if (rbracket == string::npos) {
+      return -EINVAL;
+    }
+    pos = rbracket + 1;
+    const string index_text = s.substr(lbracket + 1, rbracket - lbracket - 1);
+    if (index_text.empty()) {
+      field_entity appender;
+      appender.append = true;
+      result->push_back(appender);
+    } else {
+      result->push_back(field_entity(atoi(index_text.c_str())));
+    }
+  }
+  return 0;
+}
+
+// True when boost::lexical_cast<double> accepts val.
+static bool is_numeric(const string& val) {
+  try {
+    (void)boost::lexical_cast<double>(val);
+    return true;
+  } catch (const boost::bad_lexical_cast&) {
+    return false;
+  }
+}
+
+int JSONFormattable::set(const string& name, const string& val) /* store val at the path `name`, creating nodes on the way; returns 0 or a negative errno */
+{
+  boost::escaped_list_separator<char> els('\\', '.', '"'); /* '.' splits path components; '\\' escapes, '"' quotes */
+  boost::tokenizer<boost::escaped_list_separator<char> > tok(name, els);
+  /* f walks down from this node to the node that will receive val */
+  JSONFormattable *f = this;
+
+  JSONParser jp;
+  /* val is tried as JSON first; the result decides how it is stored at the end */
+  bool is_valid_json = jp.parse(val.c_str(), val.size());
+
+  for (const auto& i : tok) {
+    vector<field_entity> v;
+    int ret = parse_entity(i, &v); /* one component may yield a field plus several [index] entities */
+    if (ret < 0) {
+      return ret;
+    }
+    for (const auto& vi : v) {
+      if (f->type == FMT_NONE) { /* untyped node: adopt the kind this entity needs */
+        if (vi.is_obj) {
+          f->type = FMT_OBJ;
+        } else {
+          f->type = FMT_ARRAY;
+        }
+      }
+      /* a node of any other type (e.g. FMT_VALUE) matches neither branch below, so f stays put */
+      if (f->type == FMT_OBJ) {
+        if (!vi.is_obj) {
+          return -EINVAL; /* index applied to an object */
+        }
+        f = &f->obj[vi.name]; /* creates the member if it does not exist */
+      } else if (f->type == FMT_ARRAY) {
+        if (vi.is_obj) {
+          return -EINVAL; /* field name applied to an array */
+        }
+        int index = vi.index;
+        if (vi.append) {
+          index = f->arr.size(); /* "[]": one past the current last element */
+        } else if (index < 0) {
+          index = f->arr.size() + index; /* negative index counts back from the end */
+          if (index < 0) {
+            return -EINVAL; /* out of bounds */
+          }
+        }
+        if ((size_t)index >= f->arr.size()) {
+          f->arr.resize(index + 1); /* grow so that arr[index] exists */
+        }
+        f = &f->arr[index];
+      }
+    }
+  }
+
+  if (is_valid_json) {
+    f->decode_json(&jp); /* val parsed as JSON: decode the parsed document into the target */
+  } else {
+    f->type = FMT_VALUE; /* otherwise store the raw text, quoted unless is_numeric() accepts it */
+    f->value.set(val, !is_numeric(val));
+  }
+
+  return 0;
+}
+
+int JSONFormattable::erase(const string& name) /* remove the node at the path `name`; returns 0 (also when nothing matched) or a negative errno */
+{
+  boost::escaped_list_separator<char> els('\\', '.', '"'); /* '.' splits path components; '\\' escapes, '"' quotes */
+  boost::tokenizer<boost::escaped_list_separator<char> > tok(name, els);
+  /* f is the node reached so far; parent is the node it was reached from */
+  JSONFormattable *f = this;
+  JSONFormattable *parent = nullptr;
+  field_entity last_entity; /* the final entity, i.e. what to remove from parent */
+
+  for (auto& i : tok) {
+    vector<field_entity> v;
+    int ret = parse_entity(i, &v);
+    if (ret < 0) {
+      return ret;
+    }
+    for (const auto& vi : v) {
+      if (f->type == FMT_NONE ||
+          f->type == FMT_VALUE) { /* NOTE(review): this retypes an unset/value node even though erase may then find nothing - confirm intended */
+        if (vi.is_obj) {
+          f->type = FMT_OBJ;
+        } else {
+          f->type = FMT_ARRAY;
+        }
+      }
+
+      parent = f;
+
+      if (f->type == FMT_OBJ) {
+        if (!vi.is_obj) {
+          return -EINVAL; /* index applied to an object */
+        }
+        auto iter = f->obj.find(vi.name);
+        if (iter == f->obj.end()) {
+          return 0; /* nothing to erase */
+        }
+        f = &iter->second;
+      } else if (f->type == FMT_ARRAY) {
+        if (vi.is_obj) {
+          return -EINVAL; /* field name applied to an array */
+        }
+        int index = vi.index;
+        if (index < 0) {
+          index = f->arr.size() + index; /* negative index counts back from the end */
+          if (index < 0) { /* out of bounds, nothing to remove */
+            return 0;
+          }
+        }
+        if ((size_t)index >= f->arr.size()) {
+          return 0; /* index beyond array boundaries */
+        }
+        f = &f->arr[index];
+      }
+      last_entity = vi;
+    }
+  }
+
+  if (!parent) { /* no entity was processed at all */
+    *this = JSONFormattable(); /* erase everything */
+  } else {
+    if (last_entity.is_obj) {
+      parent->obj.erase(last_entity.name);
+    } else {
+      int index = (last_entity.index >= 0 ? last_entity.index : parent->arr.size() + last_entity.index); /* same negative-index rule as above */
+      if (index < 0 || (size_t)index >= parent->arr.size()) {
+        return 0;
+      }
+      parent->arr.erase(parent->arr.begin() + index);
+    }
+  }
+
+  return 0;
+}
+
+// Copy over each member of parent.obj whose key this object lacks;
+// members that already exist here are left as they are.
+void JSONFormattable::derive_from(const JSONFormattable& parent) {
+  for (const auto& inherited : parent.obj) {
+    if (obj.find(inherited.first) != obj.end()) continue;
+    obj[inherited.first] = inherited.second;
+  }
+}
+
+// Free-function adapter that forwards to v.encode_json(name, f).
+void encode_json(const char *name, const JSONFormattable& v, Formatter *f) {
+  v.encode_json(name, f);
+}
+
+// Emit this node under `name` according to its type; FMT_NONE emits nothing.
+void JSONFormattable::encode_json(const char *name, Formatter *f) const {
+  switch (type) {
+    case JSONFormattable::FMT_VALUE:
+      ::encode_json(name, value, f);
+      break;
+    case JSONFormattable::FMT_ARRAY:
+      ::encode_json(name, arr, f);
+      break;
+    case JSONFormattable::FMT_OBJ:
+      f->open_object_section(name);
+      // bind by const reference: `auto iter` copied every member subtree
+      for (const auto& member : obj)
+        ::encode_json(member.first.c_str(), member.second, f);
+      f->close_section();
+      break;
+    case JSONFormattable::FMT_NONE:
+      break;
+  }
+}
+
+// Record scalar s on cur_enc: appended if it is an array, else stored as member `name`; returns false.
+bool JSONFormattable::handle_value(const char *name, std::string_view s, bool quoted) {
+  JSONFormattable *slot = nullptr;
+  if (!cur_enc->is_array()) {
+    cur_enc->set_type(JSONFormattable::FMT_OBJ);
+    slot = &cur_enc->obj[name];
+  } else {
+    cur_enc->arr.push_back(JSONFormattable());
+    slot = &cur_enc->arr.back();
+  }
+  slot->set_type(JSONFormattable::FMT_VALUE);
+  slot->value.set(s, quoted);
+  return false;
+}
+// Descend into a new section: a fresh element when the current node is an
+// array, member `name` when already nested below the root, otherwise the
+// root itself. The chosen node is pushed on enc_stack and typed as array
+// or object. `ns` is unused. Returns false to continue processing.
+bool JSONFormattable::handle_open_section(const char *name, const char *ns, bool section_is_array) {
+  const bool nested = (enc_stack.size() > 1);
+  if (cur_enc->is_array()) {
+    cur_enc->arr.push_back(JSONFormattable());
+    cur_enc = &cur_enc->arr.back();
+  } else if (nested) {
+    /* only open a new section if already nested,
+     * otherwise root is the container */
+    cur_enc = &cur_enc->obj[name];
+  }
+  enc_stack.push_back(cur_enc);
+
+  cur_enc->set_type(section_is_array ? JSONFormattable::FMT_ARRAY
+                                     : JSONFormattable::FMT_OBJ);
+  return false; /* continue processing */
+}
+
+// Leave the current section by popping enc_stack; the bottom entry is
+// never popped. Always returns false to continue processing.
+bool JSONFormattable::handle_close_section() {
+  if (enc_stack.size() > 1) {
+    enc_stack.pop_back();
+    cur_enc = enc_stack.back();
+  }
+  return false; /* continue processing */
+}
+
diff --git a/src/common/ceph_json.h b/src/common/ceph_json.h
new file mode 100644
index 00000000..c77dddbe
--- /dev/null
+++ b/src/common/ceph_json.h
@@ -0,0 +1,725 @@
+#ifndef CEPH_JSON_H
+#define CEPH_JSON_H
+
+#include <include/types.h>
+#include <boost/container/flat_map.hpp>
+
+#ifdef _ASSERT_H
+#define NEED_ASSERT_H
+#pragma push_macro("_ASSERT_H")
+#endif
+
+#include "json_spirit/json_spirit.h"
+#undef _ASSERT_H
+
+#ifdef NEED_ASSERT_H
+#pragma pop_macro("_ASSERT_H")
+#endif
+
+#include "Formatter.h"
+
+using namespace json_spirit;
+
+
+class JSONObj;
+
+// Forward iterator over a range of a JSONObj's children; end() becomes
+// true once cur reaches last.
+class JSONObjIter {
+  // JSONObj::children is a multimap, so use that container's iterator type
+  typedef multimap<string, JSONObj *>::iterator map_iter_t;
+  map_iter_t cur;
+  map_iter_t last;
+
+public:
+  JSONObjIter();
+  ~JSONObjIter();
+  void set(const JSONObjIter::map_iter_t &_cur, const JSONObjIter::map_iter_t &_end);
+  void operator++();
+  JSONObj *operator*();
+
+  bool end() const { return (cur == last); }
+};
+
+class JSONObj /* one node of a parsed JSON document: its value plus named child nodes */
+{
+  JSONObj *parent; /* NULL from the constructor until init() supplies one */
+public:
+  struct data_val { /* a value's text and whether it is to be quoted */
+    string str;
+    bool quoted{false};
+    /* replace both fields at once */
+    void set(std::string_view s, bool q) {
+      str = s;
+      quoted = q;
+    }
+  };
+protected:
+  string name; // corresponds to obj_type in XMLObj
+  Value data; /* the json_spirit value held by this node */
+  struct data_val val; /* text form of data */
+  bool data_quoted{false}; /* not referenced by the code visible here */
+  multimap<string, JSONObj *> children; /* child nodes keyed by name; duplicate names allowed */
+  map<string, data_val> attr_map;
+  void handle_value(Value v); /* creates children when v is an object or array */
+
+public:
+
+  JSONObj() : parent(NULL){}
+
+  virtual ~JSONObj();
+
+  void init(JSONObj *p, Value v, string n); /* set parent, value and name of this node */
+
+  string& get_name() { return name; }
+  data_val& get_data_val() { return val; }
+  const string& get_data() { return val.str; } /* text of this node's own value */
+  bool get_data(const string& key, data_val *dest); /* value of the child called key */
+  JSONObj *get_parent();
+  void add_child(string el, JSONObj *child);
+  bool get_attr(string name, data_val& attr);
+  JSONObjIter find(const string& name); /* children called name */
+  JSONObjIter find_first(); /* all children */
+  JSONObjIter find_first(const string& name); /* from a child called name up to the end of children */
+  JSONObj *find_obj(const string& name); /* a child called name, or NULL */
+
+  friend ostream& operator<<(ostream &out,
+			     const JSONObj &obj); // does not work, FIXME
+
+  bool is_array();
+  bool is_object();
+  vector<string> get_array_elements(); /* array elements serialised to strings */
+};
+
+// Streams dv.str, wrapped in double quotes when dv.quoted is set.
+static inline ostream& operator<<(ostream &out, const JSONObj::data_val& dv) {
+  const char *const quote = dv.quoted ? "\"" : "";
+  return out << quote << dv.str << quote;
+}
+
+class JSONParser : public JSONObj /* root JSONObj that can parse JSON text into itself */
+{
+  int buf_len; /* running total of the lengths passed to handle_data() */
+  string json_buffer; /* text accumulated by handle_data() */
+  bool success; /* result of the most recent parse; cleared by set_failure() */
+public:
+  JSONParser();
+  ~JSONParser() override;
+  void handle_data(const char *s, int len); /* append len bytes of s to json_buffer */
+
+  bool parse(const char *buf_, int len); /* parse a caller-supplied buffer */
+  bool parse(int len); /* parse the first len bytes of json_buffer */
+  bool parse(); /* parse all of json_buffer */
+  bool parse(const char *file_name); /* parse the contents of a file */
+
+  const char *get_json() { return json_buffer.c_str(); }
+  void set_failure() { success = false; }
+};
+
+void encode_json(const char *name, const JSONObj::data_val& v, Formatter *f);
+
+class JSONDecoder { /* parses a bufferlist as JSON and offers static per-field decode helpers */
+public:
+  struct err { /* exception type thrown by the decode helpers */
+    string message;
+
+    err(const string& m) : message(m) {}
+  };
+
+  JSONParser parser; /* holds the parsed document */
+
+  JSONDecoder(bufferlist& bl) { /* throws err when bl does not parse */
+    if (!parser.parse(bl.c_str(), bl.length())) {
+      cout << "JSONDecoder::err()" << std::endl; /* NOTE(review): prints to stdout before throwing - looks like leftover debugging, confirm */
+      throw JSONDecoder::err("failed to parse JSON input");
+    }
+  }
+  /* decode field `name` of obj into val; returns whether the field was present */
+  template<class T>
+  static bool decode_json(const char *name, T& val, JSONObj *obj, bool mandatory = false);
+  /* as above, but the children of the field are handed to cb */
+  template<class C>
+  static bool decode_json(const char *name, C& container, void (*cb)(C&, JSONObj *obj), JSONObj *obj, bool mandatory = false);
+  /* as above, but an absent field yields default_val */
+  template<class T>
+  static void decode_json(const char *name, T& val, const T& default_val, JSONObj *obj);
+  /* as above, but an absent field yields boost::none */
+  template<class T>
+  static bool decode_json(const char *name, boost::optional<T>& val, JSONObj *obj, bool mandatory = false);
+
+};
+
+// Generic fallback: let the type decode itself via val.decode_json(obj).
+template<class T>
+void decode_json_obj(T& val, JSONObj *obj) {
+  val.decode_json(obj);
+}
+
+// Strings take the node's value text as-is.
+static inline void decode_json_obj(string& val, JSONObj *obj) {
+  val = obj->get_data();
+}
+
+// data_val takes the node's text together with its quoted flag.
+static inline void decode_json_obj(JSONObj::data_val& val, JSONObj *obj) {
+  val = obj->get_data_val();
+}
+
+void decode_json_obj(unsigned long long& val, JSONObj *obj);
+void decode_json_obj(long long& val, JSONObj *obj);
+void decode_json_obj(unsigned long& val, JSONObj *obj);
+void decode_json_obj(long& val, JSONObj *obj);
+void decode_json_obj(unsigned& val, JSONObj *obj);
+void decode_json_obj(int& val, JSONObj *obj);
+void decode_json_obj(bool& val, JSONObj *obj);
+void decode_json_obj(bufferlist& val, JSONObj *obj);
+class utime_t;
+void decode_json_obj(utime_t& val, JSONObj *obj);
+
+// Replace l with one decoded T per child of obj, in iteration order.
+template<class T>
+void decode_json_obj(list<T>& l, JSONObj *obj)
+{
+  l.clear();
+
+  JSONObjIter child = obj->find_first();
+  while (!child.end()) {
+    T item;
+    decode_json_obj(item, *child);
+    l.push_back(item);
+    ++child;
+  }
+}
+
+// Replace l with one decoded T per child of obj, in iteration order.
+template<class T>
+void decode_json_obj(deque<T>& l, JSONObj *obj)
+{
+  l.clear();
+
+  JSONObjIter child = obj->find_first();
+  while (!child.end()) {
+    T item;
+    decode_json_obj(item, *child);
+    l.push_back(item);
+    ++child;
+  }
+}
+
+// Replace l with the set of Ts decoded from the children of obj.
+template<class T>
+void decode_json_obj(set<T>& l, JSONObj *obj)
+{
+  l.clear();
+
+  JSONObjIter child = obj->find_first();
+  while (!child.end()) {
+    T item;
+    decode_json_obj(item, *child);
+    l.insert(item);
+    ++child;
+  }
+}
+
+// Replace l with one decoded T per child of obj, in iteration order.
+template<class T>
+void decode_json_obj(vector<T>& l, JSONObj *obj)
+{
+  l.clear();
+
+  JSONObjIter child = obj->find_first();
+  while (!child.end()) {
+    T item;
+    decode_json_obj(item, *child);
+    l.push_back(item);
+    ++child;
+  }
+}
+
+// Rebuild m from obj's children, reading each child's "key" and "val"
+// fields; a later duplicate key overwrites the earlier value.
+template<class K, class V, class C = std::less<K> >
+void decode_json_obj(map<K, V, C>& m, JSONObj *obj)
+{
+  m.clear();
+  JSONObjIter entry = obj->find_first();
+  while (!entry.end()) {
+    K key;
+    V val;
+    JSONDecoder::decode_json("key", key, *entry);
+    JSONDecoder::decode_json("val", val, *entry);
+    m[key] = val;
+    ++entry;
+  }
+}
+
+// Rebuild m from obj's children, reading each child's "key" and "val"
+// fields; every child adds one entry, duplicate keys included.
+template<class K, class V>
+void decode_json_obj(multimap<K, V>& m, JSONObj *obj)
+{
+  m.clear();
+  JSONObjIter entry = obj->find_first();
+  while (!entry.end()) {
+    K key;
+    V val;
+    JSONDecoder::decode_json("key", key, *entry);
+    JSONDecoder::decode_json("val", val, *entry);
+    m.insert(make_pair(key, val));
+    ++entry;
+  }
+}
+
+// Rebuild m from obj's children, reading each child's "key" and "val"
+// fields; a later duplicate key overwrites the earlier value.
+template<class K, class V>
+void decode_json_obj(boost::container::flat_map<K, V>& m, JSONObj *obj)
+{
+  m.clear();
+  JSONObjIter entry = obj->find_first();
+  while (!entry.end()) {
+    K key;
+    V val;
+    JSONDecoder::decode_json("key", key, *entry);
+    JSONDecoder::decode_json("val", val, *entry);
+    m[key] = val;
+    ++entry;
+  }
+}
+// Clear container, then pass each child of obj to cb together with
+// the container.
+template<class C>
+void decode_json_obj(C& container, void (*cb)(C&, JSONObj *obj), JSONObj *obj)
+{
+  container.clear();
+  JSONObjIter child = obj->find_first();
+  while (!child.end()) {
+    cb(container, *child);
+    ++child;
+  }
+}
+
+// Decode field `name` of obj into val. A missing field throws err when
+// mandatory; otherwise val is reset to T() and false is returned. Decode
+// errors are rethrown with "<name>: " prefixed to the message.
+template<class T>
+bool JSONDecoder::decode_json(const char *name, T& val, JSONObj *obj, bool mandatory)
+{
+  JSONObjIter field = obj->find_first(name);
+  if (field.end() && mandatory) {
+    throw err("missing mandatory field " + string(name));
+  }
+  if (field.end()) {
+    val = T();
+    return false;
+  }
+
+  try {
+    decode_json_obj(val, *field);
+  } catch (err& e) {
+    string prefixed = string(name) + ": ";
+    throw err(prefixed + e.message);
+  }
+  return true;
+}
+
+// Callback variant: container is cleared, then the children of field
+// `name` are passed to cb. A missing field throws err when mandatory and
+// returns false otherwise. Errors are rethrown with "<name>: " prefixed.
+template<class C>
+bool JSONDecoder::decode_json(const char *name, C& container, void (*cb)(C&, JSONObj *), JSONObj *obj, bool mandatory)
+{
+  container.clear();
+
+  JSONObjIter field = obj->find_first(name);
+  if (field.end() && mandatory) {
+    throw err("missing mandatory field " + string(name));
+  }
+  if (field.end()) {
+    return false;
+  }
+
+  try {
+    decode_json_obj(container, cb, *field);
+  } catch (err& e) {
+    string prefixed = string(name) + ": ";
+    throw err(prefixed + e.message);
+  }
+  return true;
+}
+
+// Decode field `name` into val, using default_val when it is absent. On a
+// decode error val gets default_val and the error is rethrown with "<name>: ".
+template<class T>
+void JSONDecoder::decode_json(const char *name, T& val, const T& default_val, JSONObj *obj)
+{
+  JSONObjIter field = obj->find_first(name);
+  if (field.end()) {
+    val = default_val;
+    return;
+  }
+  try {
+    decode_json_obj(val, *field);
+  } catch (err& e) {
+    val = default_val;
+    string prefixed = string(name) + ": ";
+    throw err(prefixed + e.message);
+  }
+}
+
+// Optional variant: a missing field throws err when mandatory, else
+// clears val and returns false. A present field is decoded into a fresh
+// T(); on a decode error val is cleared and the error rethrown with
+// "<name>: " prefixed.
+template<class T>
+bool JSONDecoder::decode_json(const char *name, boost::optional<T>& val, JSONObj *obj, bool mandatory)
+{
+  JSONObjIter field = obj->find_first(name);
+  if (field.end() && mandatory) {
+    throw err("missing mandatory field " + string(name));
+  }
+  if (field.end()) {
+    val = boost::none;
+    return false;
+  }
+  try {
+    val.reset(T());
+    decode_json_obj(val.get(), *field);
+  } catch (err& e) {
+    val.reset();
+    string prefixed = string(name) + ": ";
+    throw err(prefixed + e.message);
+  }
+  return true;
+}
+
+// Generic fallback: an object section `name` filled by val.dump(f).
+template<class T>
+static void encode_json(const char *name, const T& val, ceph::Formatter *f) {
+  f->open_object_section(name);
+  val.dump(f);
+  f->close_section();
+}
+
+class utime_t;
+
+void encode_json(const char *name, const string& val, ceph::Formatter *f);
+void encode_json(const char *name, const char *val, ceph::Formatter *f);
+void encode_json(const char *name, bool val, ceph::Formatter *f);
+void encode_json(const char *name, int val, ceph::Formatter *f);
+void encode_json(const char *name, unsigned val, ceph::Formatter *f);
+void encode_json(const char *name, long val, ceph::Formatter *f);
+void encode_json(const char *name, unsigned long val, ceph::Formatter *f);
+void encode_json(const char *name, long long val, ceph::Formatter *f);
+void encode_json(const char *name, const utime_t& val, ceph::Formatter *f);
+void encode_json(const char *name, const bufferlist& bl, ceph::Formatter *f);
+void encode_json(const char *name, long long unsigned val, ceph::Formatter *f);
+
+// Array section `name` holding one "obj" entry per list element.
+template<class T>
+static void encode_json(const char *name, const std::list<T>& l, ceph::Formatter *f) {
+  f->open_array_section(name);
+  for (const auto& item : l) {
+    encode_json("obj", item, f);
+  }
+  f->close_section();
+}
+
+// Array section `name` holding one "obj" entry per deque element.
+template<class T>
+static void encode_json(const char *name, const std::deque<T>& l, ceph::Formatter *f) {
+  f->open_array_section(name);
+  for (const auto& item : l) {
+    encode_json("obj", item, f);
+  }
+  f->close_section();
+}
+
+// Array section `name` holding one "obj" entry per set element.
+template<class T, class Compare = std::less<T> >
+static void encode_json(const char *name, const std::set<T, Compare>& l, ceph::Formatter *f) {
+  f->open_array_section(name);
+  for (const auto& item : l) {
+    encode_json("obj", item, f);
+  }
+  f->close_section();
+}
+
+// Array section `name` holding one "obj" entry per vector element.
+template<class T>
+static void encode_json(const char *name, const std::vector<T>& l, ceph::Formatter *f) {
+  f->open_array_section(name);
+  for (const auto& item : l) {
+    encode_json("obj", item, f);
+  }
+  f->close_section();
+}
+// Emit std::map `m` as array `name` of "entry" objects holding "key" and "val".
+template<class K, class V, class C = std::less<K>>
+static void encode_json(const char *name, const std::map<K, V, C>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& kv : m) {
+    f->open_object_section("entry");
+    encode_json("key", kv.first, f);
+    encode_json("val", kv.second, f);
+    f->close_section();
+  }
+  f->close_section();
+}
+// Emit std::multimap `m` as array `name` of "entry" objects ("key" + "val").
+template<class K, class V>
+static void encode_json(const char *name, const std::multimap<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& kv : m) {
+    f->open_object_section("entry");
+    encode_json("key", kv.first, f);
+    encode_json("val", kv.second, f);
+    f->close_section();
+  }
+  f->close_section();
+}
+// Emit a boost flat_map `m` as array `name` of "entry" objects ("key" + "val").
+template<class K, class V>
+static void encode_json(const char *name, const boost::container::flat_map<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& kv : m) {
+    f->open_object_section("entry");
+    encode_json("key", kv.first, f);
+    encode_json("val", kv.second, f);
+    f->close_section();
+  }
+  f->close_section();
+}
+// Emit only the mapped values of `m` as array section `name`.
+template<class K, class V>
+void encode_json_map(const char *name, const map<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& kv : m) {
+    // the key is not written; each value goes out under "obj"
+    encode_json("obj", kv.second, f);
+  }
+  f->close_section();
+}
+
+// Emit `m` as array `name`; index_name/object_name/cb tune each entry (see below).
+template<class K, class V>
+void encode_json_map(const char *name, const char *index_name,
+                     const char *object_name, const char *value_name,
+                     void (*cb)(const char *, const V&, ceph::Formatter *, void *), void *parent,
+                     const map<K, V>& m, ceph::Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& entry : m) {
+    // index_name: wrap the entry in "key_value" and write its key first
+    if (index_name) {
+      f->open_object_section("key_value");
+      f->dump_string(index_name, entry.first);
+    }
+    // object_name: one more object section around the value
+    if (object_name) {
+      f->open_object_section(object_name);
+    }
+    // cb, when supplied, encodes the value instead of encode_json()
+    if (cb) {
+      cb(value_name, entry.second, f, parent);
+    } else {
+      encode_json(value_name, entry.second, f);
+    }
+    // close whatever was opened, innermost first
+    if (object_name) {
+      f->close_section();
+    }
+    if (index_name) {
+      f->close_section();
+    }
+  }
+  f->close_section();
+}
+// Same as the callback form, with no callback and no parent pointer.
+template<class K, class V>
+void encode_json_map(const char *name, const char *index_name,
+                     const char *object_name, const char *value_name,
+                     const map<K, V>& m, ceph::Formatter *f)
+{
+  encode_json_map<K, V>(name, index_name, object_name, value_name, nullptr, nullptr, m, f);
+}
+// Same as the callback form, with no object wrapper, callback or parent pointer.
+template<class K, class V>
+void encode_json_map(const char *name, const char *index_name, const char *value_name,
+                     const map<K, V>& m, ceph::Formatter *f)
+{
+  encode_json_map<K, V>(name, index_name, nullptr, value_name, nullptr, nullptr, m, f);
+}
+// JSONFormatter subclass holding a scalar (value), an array (arr) or an object (obj), tagged by `type`.
+class JSONFormattable : public ceph::JSONFormatter {
+  JSONObj::data_val value;
+  vector<JSONFormattable> arr;
+  map<std::string, JSONFormattable> obj;
+  // encoder bookkeeping; the constructor points both at this object
+  vector<JSONFormattable *> enc_stack;
+  JSONFormattable *cur_enc;
+
+protected:
+  bool handle_value(const char *name, std::string_view s, bool quoted) override;
+  bool handle_open_section(const char *name, const char *ns, bool section_is_array) override;
+  bool handle_close_section() override;
+
+public:
+  JSONFormattable(bool p = false) : JSONFormatter(p) {
+    cur_enc = this;
+    enc_stack.push_back(cur_enc);
+  }
+  // tag recording which JSON node kind decode_json() saw; FMT_NONE until then
+  enum Type {
+    FMT_NONE,
+    FMT_VALUE,
+    FMT_ARRAY,
+    FMT_OBJ,
+  } type{FMT_NONE};
+
+  void set_type(Type t) {
+    type = t;
+  }
+  // Fill from a parsed JSON node: array -> arr, object -> obj (by field name), else -> value.
+  void decode_json(JSONObj *jo) {
+    if (jo->is_array()) {
+      set_type(JSONFormattable::FMT_ARRAY);
+      decode_json_obj(arr, jo);
+    } else if (jo->is_object()) {
+      set_type(JSONFormattable::FMT_OBJ);
+      auto iter = jo->find_first();
+      for (;!iter.end(); ++iter) {
+        JSONObj *field = *iter;
+        decode_json_obj(obj[field->get_name()], field);
+      }
+    } else {
+      set_type(JSONFormattable::FMT_VALUE);
+      decode_json_obj(value, jo);
+    }
+  }
+  // Binary encoding, version 2 (compat 1): type, value.str, arr, obj, then value.quoted.
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode((uint8_t)type, bl);
+    encode(value.str, bl);
+    encode(arr, bl);
+    encode(obj, bl);
+    encode(value.quoted, bl);
+    ENCODE_FINISH(bl);
+  }
+  // Reads the fields in encode() order; encodings older than v2 carry no `quoted`, so it becomes true.
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    uint8_t t;
+    decode(t, bl);
+    type = (Type)t;
+    decode(value.str, bl);
+    decode(arr, bl);
+    decode(obj, bl);
+    if (struct_v >= 2) {
+      decode(value.quoted, bl);
+    } else {
+      value.quoted = true;
+    }
+    DECODE_FINISH(bl);
+  }
+
+  const std::string& val() const {
+    return value.str;
+  }
+
+  int val_int() const;
+  long val_long() const;
+  long long val_long_long() const;
+  bool val_bool() const;
+  // NOTE(review): returns the whole map by value (a copy), unlike array() below — confirm intent.
+  const map<std::string, JSONFormattable> object() const {
+    return obj;
+  }
+
+  const vector<JSONFormattable>& array() const {
+    return arr;
+  }
+
+  const JSONFormattable& operator[](const std::string& name) const;
+  const JSONFormattable& operator[](size_t index) const;
+
+  JSONFormattable& operator[](const std::string& name);
+  JSONFormattable& operator[](size_t index);
+
+  operator std::string() const {
+    return value.str;
+  }
+
+  explicit operator int() const {
+    return val_int();
+  }
+
+  explicit operator long() const {
+    return val_long();
+  }
+
+  explicit operator long long() const {
+    return val_long_long();
+  }
+
+  explicit operator bool() const {
+    return val_bool();
+  }
+
+  template<class T>
+  T operator[](const std::string& name) const {
+    return this->operator[](name)(T());
+  }
+
+  template<class T>
+  T operator[](const std::string& name) {
+    return this->operator[](name)(T());
+  }
+
+  string operator ()(const char *def_val) const {
+    return def(string(def_val));
+  }
+
+  int operator()(int def_val) const {
+    return def(def_val);
+  }
+
+  bool operator()(bool def_val) const {
+    return def(def_val);
+  }
+
+  bool exists(const string& name) const;
+  bool exists(size_t index) const;
+
+  std::string def(const std::string& def_val) const;
+  int def(int def_val) const;
+  bool def(bool def_val) const;
+
+  bool find(const std::string& name, std::string *val) const;
+
+  std::string get(const std::string& name, const std::string& def_val) const;
+
+  int get_int(const std::string& name, int def_val) const;
+  bool get_bool(const std::string& name, bool def_val) const;
+
+  int set(const string& name, const string& val);
+  int erase(const string& name);
+
+  void derive_from(const JSONFormattable& jf);
+
+  void encode_json(const char *name, Formatter *f) const;
+
+  bool is_array() const {
+    return (type == FMT_ARRAY);
+  }
+};
+WRITE_CLASS_ENCODER(JSONFormattable)
+
+void encode_json(const char *name, const JSONFormattable& v, Formatter *f);
+
+#endif
diff --git a/src/common/ceph_mutex.h b/src/common/ceph_mutex.h
new file mode 100644
index 00000000..fbb69be6
--- /dev/null
+++ b/src/common/ceph_mutex.h
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+// What and why
+// ============
+//
+// For general code making use of mutexes, use these ceph:: types.
+// The key requirement is that you make use of the ceph::make_mutex()
+// and make_recursive_mutex() factory methods, which take a string
+// naming the mutex for the purposes of the lockdep debug variant.
+
+#ifdef WITH_SEASTAR
+
+namespace ceph {
+  // an empty class satisfying the mutex concept: every operation is a no-op
+  struct dummy_mutex {
+    void lock() {}
+    bool try_lock() {
+      return true;
+    }
+    void unlock() {}
+    void lock_shared() {}
+    void unlock_shared() {}
+  };
+
+  using mutex = dummy_mutex;
+  using recursive_mutex = dummy_mutex;
+  // in seastar, we should use a different interface for enforcing the
+  // semantics of condition_variable
+
+  template <typename ...Args>
+  dummy_mutex make_mutex(Args&& ...args) {
+    return {};
+  }
+
+  template <typename ...Args>
+  recursive_mutex make_recursive_mutex(Args&& ...args) {
+    return {};
+  }
+
+  #define ceph_mutex_is_locked(m) true
+  #define ceph_mutex_is_locked_by_me(m) true
+}
+
+#else  // WITH_SEASTAR
+//
+// For legacy Mutex users that passed recursive=true, use
+// ceph::make_recursive_mutex.  For legacy Mutex users that passed
+// lockdep=false, use std::mutex directly.
+
+#ifdef CEPH_DEBUG_MUTEX
+
+// ============================================================================
+// debug (lockdep-capable, various sanity checks and asserts)
+// ============================================================================
+
+#include "common/condition_variable_debug.h"
+#include "common/mutex_debug.h"
+#include "common/shared_mutex_debug.h"
+
+namespace ceph {
+  using mutex = ceph::mutex_debug;
+  using recursive_mutex = ceph::mutex_recursive_debug;
+  using condition_variable = ceph::condition_variable_debug;
+  using shared_mutex = ceph::shared_mutex_debug;
+
+  // forward every argument to the mutex_debug constructor
+  template <typename ...Args>
+  mutex make_mutex(Args&& ...args) {
+    return {std::forward<Args>(args)...};
+  }
+
+  // forward every argument to the mutex_recursive_debug constructor
+  template <typename ...Args>
+  recursive_mutex make_recursive_mutex(Args&& ...args) {
+    return {std::forward<Args>(args)...};
+  }
+
+  // forward every argument to the shared_mutex_debug constructor
+  template <typename ...Args>
+  shared_mutex make_shared_mutex(Args&& ...args) {
+    return {std::forward<Args>(args)...};
+  }
+
+  // lock-state predicates, answered by the debug mutex itself
+  #define ceph_mutex_is_locked(m) ((m).is_locked())
+  #define ceph_mutex_is_not_locked(m) (!(m).is_locked())
+  #define ceph_mutex_is_rlocked(m) ((m).is_rlocked())
+  #define ceph_mutex_is_wlocked(m) ((m).is_wlocked())
+  #define ceph_mutex_is_locked_by_me(m) ((m).is_locked_by_me())
+  #define ceph_mutex_is_not_locked_by_me(m) (!(m).is_locked_by_me())
+}
+
+#else
+
+// ============================================================================
+// release (fast and minimal)
+// ============================================================================
+
+#include <condition_variable>
+#include <mutex>
+#include <shared_mutex>
+
+
+namespace ceph {
+
+  using mutex = std::mutex;
+  using recursive_mutex = std::recursive_mutex;
+  using condition_variable = std::condition_variable;
+  using shared_mutex = std::shared_mutex;
+
+  // the factory arguments only serve the debug variant; they are dropped here
+  template <typename ...Args>
+  std::mutex make_mutex(Args&& ...) {
+    return {};
+  }
+  template <typename ...Args>
+  std::recursive_mutex make_recursive_mutex(Args&& ...) {
+    return {};
+  }
+  template <typename ...Args>
+  std::shared_mutex make_shared_mutex(Args&& ...) {
+    return {};
+  }
+
+  // lock-state predicates.  They are constant true in this build, which is
+  // sound only because callers are expected to do nothing with them except
+  // assert that they hold.
+  #define ceph_mutex_is_locked(m) true
+  #define ceph_mutex_is_not_locked(m) true
+  #define ceph_mutex_is_rlocked(m) true
+  #define ceph_mutex_is_wlocked(m) true
+  #define ceph_mutex_is_locked_by_me(m) true
+  #define ceph_mutex_is_not_locked_by_me(m) true
+
+}
+
+#endif	// CEPH_DEBUG_MUTEX
+
+#endif	// WITH_SEASTAR
diff --git a/src/common/ceph_strings.cc b/src/common/ceph_strings.cc
new file mode 100644
index 00000000..c7d1b787
--- /dev/null
+++ b/src/common/ceph_strings.cc
@@ -0,0 +1,427 @@
+/*
+ * Ceph string constants
+ */
+#include "include/types.h"
+#include "include/ceph_features.h"
+/* Short name for a CEPH_ENTITY_TYPE_* value; "unknown" for anything not listed. */
+const char *ceph_entity_type_name(int type)
+{
+	switch (type) {
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_MGR: return "mgr";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_AUTH: return "auth";
+	default: return "unknown";
+	}
+}
+/* Name of a CEPH_CON_MODE_* value; "???" for anything not listed. */
+const char *ceph_con_mode_name(int con_mode)
+{
+	switch (con_mode) {
+	case CEPH_CON_MODE_UNKNOWN: return "unknown";
+	case CEPH_CON_MODE_CRC: return "crc";
+	case CEPH_CON_MODE_SECURE: return "secure";
+	default: return "???";
+	}
+}
+/* Name of a CEPH_OSD_OP_* code; one case per entry of __CEPH_FORALL_OSD_OPS, else "???". */
+const char *ceph_osd_op_name(int op)
+{
+	switch (op) {
+#define GENERATE_CASE(op, opcode, str)	case CEPH_OSD_OP_##op: return (str);
+__CEPH_FORALL_OSD_OPS(GENERATE_CASE)
+#undef GENERATE_CASE
+	default:
+		return "???";
+	}
+}
+/* Name of a single CEPH_OSD_* state value; "???" if s equals none of the listed constants. */
+const char *ceph_osd_state_name(int s)
+{
+	switch (s) {
+	case CEPH_OSD_EXISTS:
+		return "exists";
+	case CEPH_OSD_UP:
+		return "up";
+	case CEPH_OSD_AUTOOUT:
+		return "autoout";
+	case CEPH_OSD_NEW:
+		return "new";
+	case CEPH_OSD_FULL:
+		return "full";
+	case CEPH_OSD_NEARFULL:
+		return "nearfull";
+	case CEPH_OSD_BACKFILLFULL:
+		return "backfillfull";
+        case CEPH_OSD_DESTROYED:
+                return "destroyed";
+        case CEPH_OSD_NOUP:
+                return "noup";
+        case CEPH_OSD_NODOWN:
+                return "nodown";
+        case CEPH_OSD_NOIN:
+                return "noin";
+        case CEPH_OSD_NOOUT:
+                return "noout";
+	default:
+		return "???";
+	}
+}
+/* Name of a CEPH_RELEASE_* value; "unspecified" for negative r, "unknown" for other unlisted values. */
+const char *ceph_release_name(int r)
+{
+	switch (r) {
+	case CEPH_RELEASE_ARGONAUT:
+		return "argonaut";
+	case CEPH_RELEASE_BOBTAIL:
+		return "bobtail";
+	case CEPH_RELEASE_CUTTLEFISH:
+		return "cuttlefish";
+	case CEPH_RELEASE_DUMPLING:
+		return "dumpling";
+	case CEPH_RELEASE_EMPEROR:
+		return "emperor";
+	case CEPH_RELEASE_FIREFLY:
+		return "firefly";
+	case CEPH_RELEASE_GIANT:
+		return "giant";
+	case CEPH_RELEASE_HAMMER:
+		return "hammer";
+	case CEPH_RELEASE_INFERNALIS:
+		return "infernalis";
+	case CEPH_RELEASE_JEWEL:
+		return "jewel";
+	case CEPH_RELEASE_KRAKEN:
+		return "kraken";
+	case CEPH_RELEASE_LUMINOUS:
+		return "luminous";
+	case CEPH_RELEASE_MIMIC:
+		return "mimic";
+	case CEPH_RELEASE_NAUTILUS:
+		return "nautilus";
+	default:
+		if (r < 0)
+			return "unspecified";
+		return "unknown";
+	}
+}
+/* Map a lower-case release name to its CEPH_RELEASE_* value; -1 for NULL or an unlisted name. */
+int ceph_release_from_name(const char *s)
+{
+	if (s == NULL) {
+		return -1;
+	}
+	if (!strcmp(s, "nautilus")) {
+		return CEPH_RELEASE_NAUTILUS;
+	}
+	if (!strcmp(s, "mimic")) {
+		return CEPH_RELEASE_MIMIC;
+	}
+	if (!strcmp(s, "luminous")) {
+		return CEPH_RELEASE_LUMINOUS;
+	}
+	if (!strcmp(s, "kraken")) {
+		return CEPH_RELEASE_KRAKEN;
+	}
+	if (!strcmp(s, "jewel")) {
+		return CEPH_RELEASE_JEWEL;
+	}
+	if (!strcmp(s, "infernalis")) {
+		return CEPH_RELEASE_INFERNALIS;
+	}
+	if (!strcmp(s, "hammer")) {
+		return CEPH_RELEASE_HAMMER;
+	}
+	if (!strcmp(s, "giant")) {
+		return CEPH_RELEASE_GIANT;
+	}
+	if (!strcmp(s, "firefly")) {
+		return CEPH_RELEASE_FIREFLY;
+	}
+	if (!strcmp(s, "emperor")) {
+		return CEPH_RELEASE_EMPEROR;
+	}
+	if (!strcmp(s, "dumpling")) {
+		return CEPH_RELEASE_DUMPLING;
+	}
+	if (!strcmp(s, "cuttlefish")) {
+		return CEPH_RELEASE_CUTTLEFISH;
+	}
+	if (!strcmp(s, "bobtail")) {
+		return CEPH_RELEASE_BOBTAIL;
+	}
+	if (!strcmp(s, "argonaut")) {
+		return CEPH_RELEASE_ARGONAUT;
+	}
+	return -1;
+}
+/* Feature bits required as of release r; bits accumulate, so each tier includes all earlier ones. */
+uint64_t ceph_release_features(int r)
+{
+	uint64_t req = 0;
+
+	req |= CEPH_FEATURE_CRUSH_TUNABLES;
+	if (r <= CEPH_RELEASE_CUTTLEFISH)
+		return req;
+
+	req |= CEPH_FEATURE_CRUSH_TUNABLES2 |
+		CEPH_FEATURE_OSDHASHPSPOOL;
+	if (r <= CEPH_RELEASE_EMPEROR)
+		return req;
+
+	req |= CEPH_FEATURE_CRUSH_TUNABLES3 |
+		CEPH_FEATURE_OSD_PRIMARY_AFFINITY |
+		CEPH_FEATURE_OSD_CACHEPOOL;
+	if (r <= CEPH_RELEASE_GIANT)
+		return req;
+
+	req |= CEPH_FEATURE_CRUSH_V4;
+	if (r <= CEPH_RELEASE_INFERNALIS)
+		return req;
+
+	req |= CEPH_FEATURE_CRUSH_TUNABLES5;
+	if (r <= CEPH_RELEASE_JEWEL)
+		return req;
+
+	req |= CEPH_FEATURE_MSG_ADDR2;
+	if (r <= CEPH_RELEASE_KRAKEN)
+		return req;
+
+	req |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS; // and overlaps
+	if (r <= CEPH_RELEASE_LUMINOUS)  /* same result either way: nothing is added after this tier */
+		return req;
+
+	return req;
+}
+
+/* return oldest/first release whose required feature set is covered by `features` */
+int ceph_release_from_features(uint64_t features)
+{
+	int r = 1;
+	while (true) {
+		uint64_t need = ceph_release_features(r);
+		if ((need & features) != need ||  /* r requires a bit that is missing */
+		    r == CEPH_RELEASE_MAX) {
+			r--;  /* step back one release (to 0 if even r == 1 did not fit) */
+			need = ceph_release_features(r);
+			/* we want the first release that looks like this */
+			while (r > 1 && ceph_release_features(r - 1) == need) {
+				r--;
+			}
+			break;
+		}
+		++r;
+	}
+	return r;
+}
+/* Name of a CEPH_OSD_WATCH_OP_* value; "???" for anything not listed. */
+const char *ceph_osd_watch_op_name(int o)
+{
+	switch (o) {
+	case CEPH_OSD_WATCH_OP_UNWATCH:
+		return "unwatch";
+	case CEPH_OSD_WATCH_OP_WATCH:
+		return "watch";
+	case CEPH_OSD_WATCH_OP_RECONNECT:
+		return "reconnect";
+	case CEPH_OSD_WATCH_OP_PING:
+		return "ping";
+	default:
+		return "???";
+	}
+}
+/* Name of one CEPH_OSD_ALLOC_HINT_FLAG_* value; "???" if f equals none of the listed constants. */
+const char *ceph_osd_alloc_hint_flag_name(int f)
+{
+	switch (f) {
+	case CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE:
+		return "sequential_write";
+	case CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE:
+		return "random_write";
+	case CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ:
+		return "sequential_read";
+	case CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ:
+		return "random_read";
+	case CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY:
+		return "append_only";
+	case CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE:
+		return "immutable";
+	case CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED:
+		return "shortlived";
+	case CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED:
+		return "longlived";
+	case CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE:
+		return "compressible";
+	case CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE:
+		return "incompressible";
+	default:
+		return "???";
+	}
+}
+/* Display name ("up:..."/"down:..."/"null") of a CEPH_MDS_STATE_* value; "???" if not listed. */
+const char *ceph_mds_state_name(int s)
+{
+	switch (s) {
+		/* down and out */
+	case CEPH_MDS_STATE_DNE:        return "down:dne";
+	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+	case CEPH_MDS_STATE_DAMAGED:   return "down:damaged";
+		/* up and out */
+	case CEPH_MDS_STATE_BOOT:       return "up:boot";
+	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
+	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
+	case CEPH_MDS_STATE_CREATING:   return "up:creating";
+	case CEPH_MDS_STATE_STARTING:   return "up:starting";
+		/* up and in */
+	case CEPH_MDS_STATE_REPLAY:     return "up:replay";
+	case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
+	case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
+	case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
+	case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+	case CEPH_MDS_STATE_ACTIVE:     return "up:active";
+	case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
+               /* misc */
+	case CEPH_MDS_STATE_NULL:       return "null";
+	}
+	return "???";
+}
+/* Name of a CEPH_SESSION_* operation; "???" for anything not listed. */
+const char *ceph_session_op_name(int op)
+{
+	switch (op) {
+	case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+	case CEPH_SESSION_OPEN: return "open";
+	case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+	case CEPH_SESSION_CLOSE: return "close";
+	case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+	case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+	case CEPH_SESSION_STALE: return "stale";
+	case CEPH_SESSION_RECALL_STATE: return "recall_state";
+	case CEPH_SESSION_FLUSHMSG: return "flushmsg";
+	case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
+	case CEPH_SESSION_FORCE_RO: return "force_ro";
+	case CEPH_SESSION_REJECT: return "reject";
+	case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "request_flushmdlog";
+	}
+	return "???";
+}
+/* Name of a CEPH_MDS_OP_* request code; "???" for anything not listed. */
+const char *ceph_mds_op_name(int op)
+{
+	switch (op) {
+	case CEPH_MDS_OP_LOOKUP:  return "lookup";
+	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
+	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
+	case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
+	case CEPH_MDS_OP_GETATTR:  return "getattr";
+	case CEPH_MDS_OP_SETXATTR: return "setxattr";
+	case CEPH_MDS_OP_SETATTR: return "setattr";
+	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_SETLAYOUT: return "setlayout";  /* was truncated to "setlayou" */
+	case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
+	case CEPH_MDS_OP_READDIR: return "readdir";
+	case CEPH_MDS_OP_MKNOD: return "mknod";
+	case CEPH_MDS_OP_LINK: return "link";
+	case CEPH_MDS_OP_UNLINK: return "unlink";
+	case CEPH_MDS_OP_RENAME: return "rename";
+	case CEPH_MDS_OP_MKDIR: return "mkdir";
+	case CEPH_MDS_OP_RMDIR: return "rmdir";
+	case CEPH_MDS_OP_SYMLINK: return "symlink";
+	case CEPH_MDS_OP_CREATE: return "create";
+	case CEPH_MDS_OP_OPEN: return "open";
+	case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+	case CEPH_MDS_OP_LSSNAP: return "lssnap";
+	case CEPH_MDS_OP_MKSNAP: return "mksnap";
+	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+	case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
+	case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
+	case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
+	case CEPH_MDS_OP_FRAGMENTDIR: return "fragmentdir";
+	case CEPH_MDS_OP_EXPORTDIR: return "exportdir";
+	case CEPH_MDS_OP_FLUSH: return "flush_path";
+	case CEPH_MDS_OP_ENQUEUE_SCRUB: return "enqueue_scrub";
+	case CEPH_MDS_OP_REPAIR_FRAGSTATS: return "repair_fragstats";
+	case CEPH_MDS_OP_REPAIR_INODESTATS: return "repair_inodestats";
+	}
+	return "???";
+}
+/* Name of a CEPH_CAP_OP_* value; "???" for anything not listed. */
+const char *ceph_cap_op_name(int op)
+{
+	switch (op) {
+	case CEPH_CAP_OP_GRANT: return "grant";
+	case CEPH_CAP_OP_REVOKE: return "revoke";
+	case CEPH_CAP_OP_TRUNC: return "trunc";
+	case CEPH_CAP_OP_EXPORT: return "export";
+	case CEPH_CAP_OP_IMPORT: return "import";
+	case CEPH_CAP_OP_UPDATE: return "update";
+	case CEPH_CAP_OP_DROP: return "drop";
+	case CEPH_CAP_OP_FLUSH: return "flush";
+	case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+	case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+	case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+	case CEPH_CAP_OP_RELEASE: return "release";
+	case CEPH_CAP_OP_RENEW: return "renew";
+	}
+	return "???";
+}
+/* Name of a CEPH_MDS_LEASE_* value; "???" for anything not listed. */
+const char *ceph_lease_op_name(int o)
+{
+	switch (o) {
+	case CEPH_MDS_LEASE_REVOKE: return "revoke";
+	case CEPH_MDS_LEASE_RELEASE: return "release";
+	case CEPH_MDS_LEASE_RENEW: return "renew";
+	case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+	}
+	return "???";
+}
+/* Name of a CEPH_SNAP_OP_* value; "???" for anything not listed. */
+const char *ceph_snap_op_name(int o)
+{
+	switch (o) {
+	case CEPH_SNAP_OP_UPDATE: return "update";
+	case CEPH_SNAP_OP_CREATE: return "create";
+	case CEPH_SNAP_OP_DESTROY: return "destroy";
+	case CEPH_SNAP_OP_SPLIT: return "split";
+	}
+	return "???";
+}
+/* Name of a CEPH_WATCH_EVENT_* value; "???" for anything not listed. */
+const char *ceph_watch_event_name(int e)
+{
+	switch (e) {
+	case CEPH_WATCH_EVENT_NOTIFY: return "notify";
+	case CEPH_WATCH_EVENT_NOTIFY_COMPLETE: return "notify_complete";
+	case CEPH_WATCH_EVENT_DISCONNECT: return "disconnect";
+	}
+	return "???";
+}
+/* Human-readable name of a POOL_OP_* value; "???" for anything not listed. */
+const char *ceph_pool_op_name(int op)
+{
+	switch (op) {
+	case POOL_OP_CREATE: return "create";
+	case POOL_OP_DELETE: return "delete";
+	case POOL_OP_AUID_CHANGE: return "auid change";  // (obsolete)
+	case POOL_OP_CREATE_SNAP: return "create snap";
+	case POOL_OP_DELETE_SNAP: return "delete snap";
+	case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+	case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+	}
+	return "???";
+}
+/* Name of a CEPH_OSD_BACKOFF_OP_* value; "???" for anything not listed. */
+const char *ceph_osd_backoff_op_name(int op)
+{
+	switch (op) {
+	case CEPH_OSD_BACKOFF_OP_BLOCK: return "block";
+	case CEPH_OSD_BACKOFF_OP_ACK_BLOCK: return "ack-block";
+	case CEPH_OSD_BACKOFF_OP_UNBLOCK: return "unblock";
+	}
+	return "???";
+}
diff --git a/src/common/ceph_time.cc b/src/common/ceph_time.cc
new file mode 100644
index 00000000..2fd5aa1a
--- /dev/null
+++ b/src/common/ceph_time.cc
@@ -0,0 +1,317 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+// For ceph_timespec
+#include "ceph_time.h"
+#include "log/LogClock.h"
+#include "config.h"
+#include "strtol.h"
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+// std::ostringstream is declared in <sstream>; there is no <ostringstream> header
+#include <sstream>
+
+#ifndef NSEC_PER_SEC
+#define NSEC_PER_SEC 1000000000ULL
+#endif
+// clock_gettime() stand-in: CLOCK_REALTIME via gettimeofday(), any other id via mach time.
+int clock_gettime(int clk_id, struct timespec *tp)
+{
+  if (clk_id == CLOCK_REALTIME) {
+    // gettimeofday is much faster than clock_get_time
+    struct timeval now;
+    int ret = gettimeofday(&now, NULL);
+    if (ret)
+      return ret;
+    tp->tv_sec = now.tv_sec;
+    tp->tv_nsec = now.tv_usec * 1000L;  // microseconds -> nanoseconds
+  } else {
+    uint64_t t = mach_absolute_time();
+    static mach_timebase_info_data_t timebase_info;
+    if (timebase_info.denom == 0) {  // zero-initialised static: not fetched yet
+      (void)mach_timebase_info(&timebase_info);
+    }
+    auto nanos = t * timebase_info.numer / timebase_info.denom;  // ticks -> ns
+    tp->tv_sec = nanos / NSEC_PER_SEC;
+    tp->tv_nsec = nanos - (tp->tv_sec * NSEC_PER_SEC);
+  }
+  return 0;
+}
+#endif
+
+namespace ceph {
+  namespace time_detail {
+    void real_clock::to_ceph_timespec(const time_point& t,
+				      struct ceph_timespec& ts) {
+      ts.tv_sec = to_time_t(t);  // whole seconds
+      ts.tv_nsec = (t.time_since_epoch() % seconds(1)).count();  // remainder below one second
+    }
+    struct ceph_timespec real_clock::to_ceph_timespec(const time_point& t) {  // by-value form of the above
+      struct ceph_timespec ts;
+      to_ceph_timespec(t, ts);
+      return ts;
+    }
+    real_clock::time_point real_clock::from_ceph_timespec(
+      const struct ceph_timespec& ts) {
+      return time_point(seconds(ts.tv_sec) + nanoseconds(ts.tv_nsec));  // inverse: sec + nsec since epoch
+    }
+    // coarse_real_clock conversions: same split into whole seconds and sub-second remainder.
+    void coarse_real_clock::to_ceph_timespec(const time_point& t,
+					     struct ceph_timespec& ts) {
+      ts.tv_sec = to_time_t(t);
+      ts.tv_nsec = (t.time_since_epoch() % seconds(1)).count();
+    }
+    struct ceph_timespec coarse_real_clock::to_ceph_timespec(
+      const time_point& t) {
+      struct ceph_timespec ts;
+      to_ceph_timespec(t, ts);
+      return ts;
+    }
+    coarse_real_clock::time_point coarse_real_clock::from_ceph_timespec(
+      const struct ceph_timespec& ts) {
+      return time_point(seconds(ts.tv_sec) + nanoseconds(ts.tv_nsec));
+    }
+
+  }
+
+  using std::chrono::duration_cast;
+  using std::chrono::seconds;
+  using std::chrono::microseconds;
+  // Steady clocks: print fractional seconds since the clock's epoch, suffixed "s".
+  template<typename Clock,
+	   typename std::enable_if<Clock::is_steady>::type*>
+  std::ostream& operator<<(std::ostream& m,
+			   const std::chrono::time_point<Clock>& t) {
+    return m << std::fixed << std::chrono::duration<double>(
+		t.time_since_epoch()).count()
+	     << "s";
+  }
+  // Print a timespan as fractional seconds, suffixed "s".
+  std::ostream& operator<<(std::ostream& m, const timespan& t) {
+    return m << std::chrono::duration<double>(t).count() << "s";
+  }
+  // Wall clocks: print local time as "YYYY-MM-DD HH:MM:SS.uuuuuu", zero padded.
+  template<typename Clock,
+	   typename std::enable_if<!Clock::is_steady>::type*>
+  std::ostream& operator<<(std::ostream& m,
+			   const std::chrono::time_point<Clock>& t) {
+    m.setf(std::ios::right);
+    char oldfill = m.fill();
+    m.fill('0');
+    // localtime.  this looks like an absolute time.
+    //  aim for http://en.wikipedia.org/wiki/ISO_8601
+    struct tm bdt;
+    time_t tt = Clock::to_time_t(t);
+    localtime_r(&tt, &bdt);
+    m << std::setw(4) << (bdt.tm_year+1900)  // four-digit year
+      << '-' << std::setw(2) << (bdt.tm_mon+1)
+      << '-' << std::setw(2) << bdt.tm_mday
+      << ' '
+      << std::setw(2) << bdt.tm_hour
+      << ':' << std::setw(2) << bdt.tm_min
+      << ':' << std::setw(2) << bdt.tm_sec
+      << "." << std::setw(6) << duration_cast<microseconds>(
+	t.time_since_epoch() % seconds(1)).count();  // bare count; a duration would print as "0.5s"
+    m.fill(oldfill);
+    m.unsetf(std::ios::right);
+    return m;
+  }
+
+  template std::ostream&
+  operator<< <mono_clock>(std::ostream& m, const mono_time& t);
+  template std::ostream&
+  operator<< <real_clock>(std::ostream& m, const real_time& t);
+  template std::ostream&
+  operator<< <coarse_mono_clock>(std::ostream& m, const coarse_mono_time& t);
+  template std::ostream&
+  operator<< <coarse_real_clock>(std::ostream& m, const coarse_real_time& t);
+  // Approximate rendering of t using the single largest fitting unit (s, m, h, d, w, M, y).
+  std::string timespan_str(timespan t)
+  {
+    // FIXME: somebody pretty please make a version of this function
+    // that isn't as lame as this one!
+    uint64_t total_ns = std::chrono::nanoseconds(t).count();
+    ostringstream out;
+    if (total_ns < 2000000000) {
+      out << ((float)total_ns / 1000000000) << "s";
+      return out.str();
+    }
+    uint64_t secs = total_ns / 1000000000;
+    if (secs < 120) {
+      out << secs << "s";
+      return out.str();
+    }
+    uint64_t mins = secs / 60;
+    if (mins < 120) {
+      out << mins << "m";
+      return out.str();
+    }
+    uint64_t hours = mins / 60;
+    if (hours < 48) {
+      out << hours << "h";
+      return out.str();
+    }
+    uint64_t days = hours / 24;
+    if (days < 14) {
+      out << days << "d";
+      return out.str();
+    }
+    uint64_t weeks = days / 7;
+    if (weeks < 12) {
+      out << weeks << "w";
+      return out.str();
+    }
+    uint64_t months = days / 30;
+    if (months < 24) {
+      out << months << "M";
+      return out.str();
+    }
+    uint64_t years = days / 365;
+    out << years << "y";
+    return out.str();
+  }
+  // Render t as concatenated y/mo/w/d/h/m/s parts; a unit below its threshold carries down.
+  std::string exact_timespan_str(timespan t)
+  {
+    uint64_t nsec = std::chrono::nanoseconds(t).count();
+    uint64_t sec = nsec / 1000000000;
+    nsec %= 1000000000;
+    uint64_t yr = sec / (60 * 60 * 24 * 365);
+    ostringstream ss;
+    if (yr) {
+      ss << yr << "y";
+      sec -= yr * (60 * 60 * 24 * 365);
+    }
+    uint64_t mn = sec / (60 * 60 * 24 * 30);
+    if (mn >= 3) {
+      ss << mn << "mo";
+      sec -= mn * (60 * 60 * 24 * 30);
+    }
+    uint64_t wk = sec / (60 * 60 * 24 * 7);
+    if (wk >= 2) {
+      ss << wk << "w";
+      sec -= wk * (60 * 60 * 24 * 7);
+    }
+    uint64_t day = sec / (60 * 60 * 24);
+    if (day >= 2) {
+      ss << day << "d";
+      sec -= day * (60 * 60 * 24);
+    }
+    uint64_t hr = sec / (60 * 60);
+    if (hr >= 2) {
+      ss << hr << "h";
+      sec -= hr * (60 * 60);
+    }
+    uint64_t min = sec / 60;
+    if (min >= 2) {
+      ss << min << "m";
+      sec -= min * 60;
+    }
+    if (sec || nsec) {
+      // seconds and fraction as ONE number (was "1" then "0.5" -> "10.5s")
+      if (nsec) {
+	ss << (((float)nsec / 1000000000) + sec);
+      } else {
+	ss << sec;
+      }
+      ss << "s";
+    }
+    return ss.str();
+  }
+  // Sum "<digits>[unit]" terms (e.g. "1h 30m") into seconds; throws invalid_argument on bad input.
+  std::chrono::seconds parse_timespan(const std::string& s)
+  {
+    static const std::map<string,int> units = {
+      { "s", 1 },
+      { "sec", 1 },
+      { "second", 1 },
+      { "seconds", 1 },
+      { "m", 60 },
+      { "min", 60 },
+      { "minute", 60 },
+      { "minutes", 60 },
+      { "h", 60*60 },
+      { "hr", 60*60 },
+      { "hour", 60*60 },
+      { "hours", 60*60 },
+      { "d", 24*60*60 },
+      { "day", 24*60*60 },
+      { "days", 24*60*60 },
+      { "w", 7*24*60*60 },
+      { "wk", 7*24*60*60 },
+      { "week", 7*24*60*60 },
+      { "weeks", 7*24*60*60 },
+      { "mo", 30*24*60*60 },
+      { "month", 30*24*60*60 },
+      { "months", 30*24*60*60 },
+      { "y", 365*24*60*60 },
+      { "yr", 365*24*60*60 },
+      { "year", 365*24*60*60 },
+      { "years", 365*24*60*60 },
+    };
+
+    auto r = 0s;
+    auto pos = 0u;
+    while (pos < s.size()) {
+      // skip whitespace (s[s.size()] is '\0', which ends every scan below)
+      while (std::isspace(static_cast<unsigned char>(s[pos]))) {
+	++pos;
+      }
+      if (pos >= s.size()) {
+	break;
+      }
+
+      // consume any digits; <cctype> classifiers need an unsigned char value
+      auto val_start = pos;
+      while (std::isdigit(static_cast<unsigned char>(s[pos]))) {
+	++pos;
+      }
+      if (val_start == pos) {
+	throw invalid_argument("expected digit");
+      }
+      string n = s.substr(val_start, pos - val_start);
+      string err;
+      auto val = strict_strtoll(n.c_str(), 10, &err);
+      if (err.size()) {
+	throw invalid_argument(err);
+      }
+
+      // skip whitespace
+      while (std::isspace(static_cast<unsigned char>(s[pos]))) {
+	++pos;
+      }
+
+      // consume unit; a bare number counts as seconds and must end the input
+      auto unit_start = pos;
+      while (std::isalpha(static_cast<unsigned char>(s[pos]))) {
+	++pos;
+      }
+      if (unit_start != pos) {
+	string unit = s.substr(unit_start, pos - unit_start);
+	auto p = units.find(unit);
+	if (p == units.end()) {
+	  throw invalid_argument("unrecognized unit '"s + unit + "'");
+	}
+	val *= p->second;
+      } else if (pos < s.size()) {
+	throw invalid_argument("unexpected trailing '"s + s.substr(pos) + "'");
+      }
+      r += chrono::seconds(val);
+    }
+    return r;
+  }
+
+}
diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h
new file mode 100644
index 00000000..a357a3bc
--- /dev/null
+++ b/src/common/ceph_time.h
@@ -0,0 +1,520 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef COMMON_CEPH_TIME_H
+#define COMMON_CEPH_TIME_H
+
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <sys/time.h>
+
+#include "include/ceph_assert.h"
+
+#if defined(__APPLE__)
+#include <sys/_types/_timespec.h>
+
+#define CLOCK_REALTIME_COARSE CLOCK_REALTIME
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+
+int clock_gettime(int clk_id, struct timespec *tp);
+#endif
+
+struct ceph_timespec;
+
+namespace ceph {
+  namespace time_detail {
+    using std::chrono::duration_cast;
+    using std::chrono::seconds;
+    using std::chrono::microseconds;
+    using std::chrono::nanoseconds;
+    // Currently we use a 64-bit count of nanoseconds.
+
+    // We could, if we wished, use a struct holding a uint64_t count
+    // of seconds and a uint32_t count of nanoseconds.
+
+    // At least this way we can change it to something else if we
+    // want.
+    typedef uint64_t rep;
+
+    // A concrete duration, unsigned. The timespan Ceph thinks in.
+    typedef std::chrono::duration<rep, std::nano> timespan;
+
+
+    // Like the above but signed.
+    typedef int64_t signed_rep;
+
+    typedef std::chrono::duration<signed_rep, std::nano> signedspan;
+
+    // We define our own clocks so we can have our choice of all time
+    // sources supported by the operating system. With the standard
+    // library the resolution and cost are unspecified. (For example,
+    // the libc++ system_clock class gives only microsecond
+    // resolution.)
+
+    // One potential issue is that we should accept system_clock
+    // timepoints in user-facing APIs alongside (or instead of)
+    // ceph::real_clock times.
+    class real_clock { // wall-clock time read via clock_gettime(CLOCK_REALTIME)
+    public:
+      typedef timespan duration;
+      typedef duration::rep rep;
+      typedef duration::period period;
+      // The second template parameter of time_point defaults to the
+      // clock's duration type, so only the clock is named here.
+      typedef std::chrono::time_point<real_clock> time_point;
+      static constexpr const bool is_steady = false;
+
+      static time_point now() noexcept {
+	struct timespec ts;
+	clock_gettime(CLOCK_REALTIME, &ts); // return value is not checked
+	return from_timespec(ts);
+      }
+
+      static bool is_zero(const time_point& t) { // true iff t == zero()
+	return (t == time_point::min());
+      }
+
+      static time_point zero() { // the "zero" time is time_point::min()
+        return time_point::min();
+      }
+
+      // Allow conversion to/from any clock with the same interface as
+      // std::chrono::system_clock. This overload converts INTO real_clock.
+      template<typename Clock, typename Duration>
+      static time_point to_system_time_point(
+	const std::chrono::time_point<Clock, Duration>& t) {
+	return time_point(seconds(Clock::to_time_t(t)) + // whole seconds
+			  duration_cast<duration>(t.time_since_epoch() %
+						  seconds(1))); // sub-second rest
+      }
+      template<typename Clock, typename Duration> // real_clock -> Clock
+      static std::chrono::time_point<Clock, Duration> to_system_time_point(
+	const time_point& t) {
+	return (Clock::from_time_t(to_time_t(t)) + // whole seconds
+		duration_cast<Duration>(t.time_since_epoch() % seconds(1)));
+      }
+
+      static time_t to_time_t(const time_point& t) noexcept { // whole seconds
+	return duration_cast<seconds>(t.time_since_epoch()).count();
+      }
+      static time_point from_time_t(const time_t& t) noexcept {
+	return time_point(seconds(t));
+      }
+
+      static void to_timespec(const time_point& t, struct timespec& ts) {
+	ts.tv_sec = to_time_t(t);
+	ts.tv_nsec = (t.time_since_epoch() % seconds(1)).count(); // ns rest
+      }
+      static struct timespec to_timespec(const time_point& t) { // by value
+	struct timespec ts;
+	to_timespec(t, ts);
+	return ts;
+      }
+      static time_point from_timespec(const struct timespec& ts) {
+	return time_point(seconds(ts.tv_sec) + nanoseconds(ts.tv_nsec));
+      }
+
+      static void to_ceph_timespec(const time_point& t, // defined elsewhere
+				   struct ceph_timespec& ts);
+      static struct ceph_timespec to_ceph_timespec(const time_point& t);
+      static time_point from_ceph_timespec(const struct ceph_timespec& ts);
+
+      static void to_timeval(const time_point& t, struct timeval& tv) {
+	tv.tv_sec = to_time_t(t);
+	tv.tv_usec = duration_cast<microseconds>(t.time_since_epoch() %
+						 seconds(1)).count(); // us rest
+      }
+      static struct timeval to_timeval(const time_point& t) { // by value
+	struct timeval tv;
+	to_timeval(t, tv);
+	return tv;
+      }
+      static time_point from_timeval(const struct timeval& tv) {
+	return time_point(seconds(tv.tv_sec) + microseconds(tv.tv_usec));
+      }
+
+      static double to_double(const time_point& t) { // seconds since epoch
+	return std::chrono::duration<double>(t.time_since_epoch()).count();
+      }
+      static time_point from_double(const double d) { // d is in seconds
+	return time_point(duration_cast<duration>(
+			    std::chrono::duration<double>(d)));
+      }
+    };
+
+    class coarse_real_clock { // wall-clock time from the _COARSE/_FAST clock id
+    public:
+      typedef timespan duration;
+      typedef duration::rep rep;
+      typedef duration::period period;
+      // The second template parameter of time_point defaults to the
+      // clock's duration type, so only the clock is named here.
+      typedef std::chrono::time_point<coarse_real_clock> time_point;
+      static constexpr const bool is_steady = false;
+
+      static time_point now() noexcept { // clock id is picked at compile time
+	struct timespec ts;
+#if defined(CLOCK_REALTIME_COARSE)
+	// Linux systems have _COARSE clocks.
+	clock_gettime(CLOCK_REALTIME_COARSE, &ts);
+#elif defined(CLOCK_REALTIME_FAST)
+	// BSD systems have _FAST clocks.
+	clock_gettime(CLOCK_REALTIME_FAST, &ts);
+#else
+	// And if we find neither, you may wish to consult your system's
+	// documentation.
+#warning Falling back to CLOCK_REALTIME, may be slow.
+	clock_gettime(CLOCK_REALTIME, &ts);
+#endif
+	return from_timespec(ts);
+      }
+
+      static bool is_zero(const time_point& t) { // true iff t == zero()
+	return (t == time_point::min());
+      }
+
+      static time_point zero() { // the "zero" time is time_point::min()
+	return time_point::min();
+      }
+
+      static time_t to_time_t(const time_point& t) noexcept { // whole seconds
+	return duration_cast<seconds>(t.time_since_epoch()).count();
+      }
+      static time_point from_time_t(const time_t t) noexcept {
+	return time_point(seconds(t));
+      }
+
+      static void to_timespec(const time_point& t, struct timespec& ts) {
+	ts.tv_sec = to_time_t(t);
+	ts.tv_nsec = (t.time_since_epoch() % seconds(1)).count(); // ns rest
+      }
+      static struct timespec to_timespec(const time_point& t) { // by value
+	struct timespec ts;
+	to_timespec(t, ts);
+	return ts;
+      }
+      static time_point from_timespec(const struct timespec& ts) {
+	return time_point(seconds(ts.tv_sec) + nanoseconds(ts.tv_nsec));
+      }
+
+      static void to_ceph_timespec(const time_point& t, // defined elsewhere
+				   struct ceph_timespec& ts);
+      static struct ceph_timespec to_ceph_timespec(const time_point& t);
+      static time_point from_ceph_timespec(const struct ceph_timespec& ts);
+
+      static void to_timeval(const time_point& t, struct timeval& tv) {
+	tv.tv_sec = to_time_t(t);
+	tv.tv_usec = duration_cast<microseconds>(t.time_since_epoch() %
+						 seconds(1)).count(); // us rest
+      }
+      static struct timeval to_timeval(const time_point& t) { // by value
+	struct timeval tv;
+	to_timeval(t, tv);
+	return tv;
+      }
+      static time_point from_timeval(const struct timeval& tv) {
+	return time_point(seconds(tv.tv_sec) + microseconds(tv.tv_usec));
+      }
+
+      static double to_double(const time_point& t) { // seconds since epoch
+	return std::chrono::duration<double>(t.time_since_epoch()).count();
+      }
+      static time_point from_double(const double d) { // d is in seconds
+	return time_point(duration_cast<duration>(
+			    std::chrono::duration<double>(d)));
+      }
+    };
+
+    class mono_clock { // steady time read via clock_gettime(CLOCK_MONOTONIC)
+    public:
+      typedef timespan duration;
+      typedef duration::rep rep;
+      typedef duration::period period;
+      typedef std::chrono::time_point<mono_clock> time_point;
+      static constexpr const bool is_steady = true;
+
+      static time_point now() noexcept {
+	struct timespec ts;
+	clock_gettime(CLOCK_MONOTONIC, &ts); // return value is not checked
+	return time_point(seconds(ts.tv_sec) + nanoseconds(ts.tv_nsec));
+      }
+
+      static bool is_zero(const time_point& t) { // true iff t == zero()
+        return (t == time_point::min());
+      }
+
+      static time_point zero() { // the "zero" time is time_point::min()
+        return time_point::min();
+      }
+
+      // A monotonic clock's timepoints are only meaningful to the
+      // computer on which they were generated. Thus having an
+      // optional skew is meaningless.
+    };
+
+    class coarse_mono_clock { // steady time from the _COARSE/_FAST clock id
+    public:
+      typedef timespan duration;
+      typedef duration::rep rep;
+      typedef duration::period period;
+      typedef std::chrono::time_point<coarse_mono_clock> time_point;
+      static constexpr const bool is_steady = true;
+
+      static time_point now() noexcept { // clock id is picked at compile time
+	struct timespec ts;
+#if defined(CLOCK_MONOTONIC_COARSE)
+	// Linux systems have _COARSE clocks.
+	clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
+#elif defined(CLOCK_MONOTONIC_FAST)
+	// BSD systems have _FAST clocks.
+	clock_gettime(CLOCK_MONOTONIC_FAST, &ts);
+#else
+	// And if we find neither, you may wish to consult your system's
+	// documentation.
+#warning Falling back to CLOCK_MONOTONIC, may be slow.
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+#endif
+	return time_point(seconds(ts.tv_sec) + nanoseconds(ts.tv_nsec));
+      }
+
+      static bool is_zero(const time_point& t) { // true iff t == zero()
+        return (t == time_point::min());
+      }
+
+      static time_point zero() { // the "zero" time is time_point::min()
+        return time_point::min();
+      }
+    };
+
+    // Subtraction helpers that compute in signed arithmetic, so that a
+    // smaller minuend yields a negative span instead of wrapping around.
+    namespace {
+      template<typename Rep1, typename Period1, typename Rep2,
+	       typename Period2>
+      inline auto difference(std::chrono::duration<Rep1, Period1> minuend,
+			     std::chrono::duration<Rep2, Period2> subtrahend)
+	-> typename std::common_type<
+	  std::chrono::duration<typename std::make_signed<Rep1>::type,
+				Period1>,
+	  std::chrono::duration<typename std::make_signed<Rep2>::type,
+				Period2> >::type {
+	// srep is the signed common *duration* type of both operands.
+	using srep =
+	  typename std::common_type<
+	    std::chrono::duration<typename std::make_signed<Rep1>::type,
+				  Period1>,
+	    std::chrono::duration<typename std::make_signed<Rep2>::type,
+				  Period2> >::type;
+	return srep(srep(minuend).count() - srep(subtrahend).count());
+      }
+
+      template<typename Clock, typename Duration1, typename Duration2>
+      inline auto difference( // time_point form: subtracts the epoch offsets
+	typename std::chrono::time_point<Clock, Duration1> minuend,
+	typename std::chrono::time_point<Clock, Duration2> subtrahend)
+	-> typename std::common_type<
+	  std::chrono::duration<typename std::make_signed<
+				  typename Duration1::rep>::type,
+				typename Duration1::period>,
+	  std::chrono::duration<typename std::make_signed<
+				  typename Duration2::rep>::type,
+				typename Duration2::period> >::type {
+	return difference(minuend.time_since_epoch(),
+			  subtrahend.time_since_epoch());
+      }
+    }
+  } // namespace time_detail
+
+  // timespan is the concrete time representation for our code in the
+  // case that we are only interested in durations between now and the
+  // future. Using it means we don't have to have EVERY function that
+  // deals with a duration be a template. We can do so for user-facing
+  // APIs, however.
+  using time_detail::timespan;
+
+  // Similar to the above but for durations that can specify
+  // differences between now and a time point in the past.
+  using time_detail::signedspan;
+
+  // High-resolution real-time clock
+  using time_detail::real_clock;
+
+  // Low-resolution but presumably faster real-time clock
+  using time_detail::coarse_real_clock;
+
+
+  // High-resolution monotonic clock
+  using time_detail::mono_clock;
+
+  // Low-resolution but, I would hope or there's no point, faster
+  // monotonic clock
+  using time_detail::coarse_mono_clock;
+
+  // Please note that the coarse clocks are disjoint. You cannot
+  // subtract a real_clock timepoint from a coarse_real_clock
+  // timepoint as, from C++'s perspective, they are disjoint types.
+
+  // This is not necessarily bad. If I sample a mono_clock and then a
+  // coarse_mono_clock, the coarse_mono_clock's time could potentially
+  // be previous to the mono_clock's time (just due to differing
+  // resolution) which would be Incorrect.
+
+  // This is not horrible, though, since you can use an idiom like
+  // mono_clock::time_point(coarsepoint.time_since_epoch()) to unwrap
+  // and rewrap if you know what you're doing.
+
+
+  // Actual wall-clock times
+  typedef real_clock::time_point real_time;
+  typedef coarse_real_clock::time_point coarse_real_time;
+
+  // Monotonic times should never be serialized or communicated
+  // between machines, since they are incomparable. Thus we also don't
+  // make any provision for converting between
+  // std::chrono::steady_clock time and ceph::mono_clock time.
+  typedef mono_clock::time_point mono_time;
+  typedef coarse_mono_clock::time_point coarse_mono_time;
+
+  template<typename Rep1, typename Ratio1, typename Rep2, typename Ratio2>
+  auto floor(const std::chrono::duration<Rep1, Ratio1>& duration,
+	     const std::chrono::duration<Rep2, Ratio2>& precision) ->
+    typename std::common_type<std::chrono::duration<Rep1, Ratio1>,
+			      std::chrono::duration<Rep2, Ratio2> >::type {
+    // Drops the remainder; NOTE(review): rounds toward zero if negative.
+    return duration - (duration % precision);
+  }
+
+  template<typename Rep1, typename Ratio1, typename Rep2, typename Ratio2>
+  auto ceil(const std::chrono::duration<Rep1, Ratio1>& duration,
+	    const std::chrono::duration<Rep2, Ratio2>& precision) ->
+    typename std::common_type<std::chrono::duration<Rep1, Ratio1>,
+			      std::chrono::duration<Rep2, Ratio2> >::type {
+    auto tmod = duration % precision; // a positive leftover rounds up
+    return duration - tmod + (tmod > tmod.zero() ? 1 : 0) * precision;
+  }
+
+  template<typename Clock, typename Duration, typename Rep, typename Ratio>
+  auto floor(const std::chrono::time_point<Clock, Duration>& timepoint,
+	     const std::chrono::duration<Rep, Ratio>& precision) ->
+    std::chrono::time_point<Clock,
+			    typename std::common_type<
+			      Duration, std::chrono::duration<Rep, Ratio>
+			      >::type> {
+    return std::chrono::time_point< // same Clock, common duration type
+      Clock, typename std::common_type<
+	Duration, std::chrono::duration<Rep, Ratio> >::type>(
+	  floor(timepoint.time_since_epoch(), precision)); // duration floor
+  }
+  template<typename Clock, typename Duration, typename Rep, typename Ratio>
+  auto ceil(const std::chrono::time_point<Clock, Duration>& timepoint,
+	    const std::chrono::duration<Rep, Ratio>& precision) ->
+    std::chrono::time_point<Clock,
+			    typename std::common_type<
+			      Duration,
+			      std::chrono::duration<Rep, Ratio> >::type> {
+    return std::chrono::time_point< // same Clock, common duration type
+      Clock, typename std::common_type<
+	Duration, std::chrono::duration<Rep, Ratio> >::type>(
+	  ceil(timepoint.time_since_epoch(), precision)); // duration ceil
+  }
+
+  namespace {
+    inline timespan make_timespan(const double d) { // d is in seconds
+      const std::chrono::duration<double> secs(d);
+      return std::chrono::duration_cast<timespan>(secs);
+    }
+  }
+
+  std::ostream& operator<<(std::ostream& m, const timespan& t);
+  template<typename Clock, // chosen when Clock::is_steady is false
+	   typename std::enable_if<!Clock::is_steady>::type* = nullptr>
+  std::ostream& operator<<(std::ostream& m,
+			   const std::chrono::time_point<Clock>& t);
+  template<typename Clock, // chosen when Clock::is_steady is true
+	   typename std::enable_if<Clock::is_steady>::type* = nullptr>
+  std::ostream& operator<<(std::ostream& m,
+			   const std::chrono::time_point<Clock>& t);
+
+  // The way std::chrono handles the return type of subtraction is not
+  // wonderful. The difference of two unsigned types SHOULD be signed.
+
+  namespace { // one overload per clock; each returns a signedspan
+    inline signedspan operator -(real_time minuend,
+				 real_time subtrahend) {
+      return time_detail::difference(minuend, subtrahend);
+    }
+
+    inline signedspan operator -(coarse_real_time minuend,
+				 coarse_real_time subtrahend) {
+      return time_detail::difference(minuend, subtrahend);
+    }
+
+    inline signedspan operator -(mono_time minuend,
+				 mono_time subtrahend) {
+      return time_detail::difference(minuend, subtrahend);
+    }
+
+    inline signedspan operator -(coarse_mono_time minuend,
+				 coarse_mono_time subtrahend) {
+      return time_detail::difference(minuend, subtrahend);
+    }
+  }
+
+  // We could add specializations of time_point - duration and
+  // time_point + duration to assert on overflow, but I don't think we
+  // should.
+
+
+inline timespan abs(signedspan z) { // magnitude of z as an unsigned span
+  if (z > signedspan::zero())
+    return std::chrono::duration_cast<timespan>(z);
+  return timespan(-z.count()); // zero or negative: negate the raw count
+}
+inline timespan to_timespan(signedspan z) {
+  // A negative span is clamped to zero rather than asserted on
+  // (ceph_assert(z >= signedspan::zero()) used to live here): there
+  // is a kernel bug that seems to be producing negative spans.  We've
+  // seen it in:
+  //   centos 8.1: 4.18.0-147.el8.x86_64
+  //   debian 10.3: 4.19.0-8-amd64
+  //   debian 10.1: 4.19.67-2+deb10u1
+  //   ubuntu 18.04
+  // see bugs:
+  //   https://tracker.ceph.com/issues/43365
+  //   https://tracker.ceph.com/issues/44078
+  const signedspan nonneg =
+    z < signedspan::zero() ? signedspan::zero() : z;
+  return std::chrono::duration_cast<timespan>(nonneg);
+}
+
+std::string timespan_str(timespan t);
+std::string exact_timespan_str(timespan t);
+std::chrono::seconds parse_timespan(const std::string& s);
+
+// detects presence of Clock::to_timespec() and from_timespec()
+template <typename Clock, typename = std::void_t<>>
+struct converts_to_timespec : std::false_type {}; // fallback: not detected
+
+template <typename Clock> // chosen only if the round-trip below compiles
+struct converts_to_timespec<Clock, std::void_t<decltype(
+    Clock::from_timespec(Clock::to_timespec(
+        std::declval<typename Clock::time_point>()))
+  )>> : std::true_type {};
+
+template <typename Clock> // convenience bool for the trait above
+constexpr bool converts_to_timespec_v = converts_to_timespec<Clock>::value;
+
+} // namespace ceph
+
+#endif // COMMON_CEPH_TIME_H
diff --git a/src/common/ceph_timer.h b/src/common/ceph_timer.h
new file mode 100644
index 00000000..b29fac9c
--- /dev/null
+++ b/src/common/ceph_timer.h
@@ -0,0 +1,334 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef COMMON_CEPH_TIMER_H
+#define COMMON_CEPH_TIMER_H
+
+#include <condition_variable>
+#include <thread>
+#include <boost/intrusive/set.hpp>
+
+namespace ceph {
+
+  /// Tag type and tag value that select the timer constructor which
+  /// leaves the newly constructed timer suspended.
+
+  struct construct_suspended_t { };
+  constexpr construct_suspended_t construct_suspended { };
+
+  namespace timer_detail {
+    using boost::intrusive::member_hook;
+    using boost::intrusive::set_member_hook;
+    using boost::intrusive::link_mode;
+    using boost::intrusive::normal_link;
+    using boost::intrusive::set;
+    using boost::intrusive::constant_time_size;
+    using boost::intrusive::compare;
+
+    // Compared to the SafeTimer this does fewer allocations (you
+    // don't have to allocate a new Context every time you
+    // want to cue the next tick.)
+    //
+    // It also does not share a lock with the caller. If you call
+    // cancel event, it either cancels the event (and returns true) or
+    // you missed it. If this does not work for you, you can set up a
+    // flag and mutex of your own.
+    //
+    // You get to pick your clock. I like mono_clock, since I usually
+    // want to wait FOR a given duration. real_clock is worthwhile if
+    // you want to wait UNTIL a specific moment of wallclock time.  If
+    // you want you can set up a timer that executes a function after
+    // you use up ten seconds of CPU time.
+
+    template <class TC>
+    class timer { // TC: clock type providing time_point, duration and now()
+      using sh = set_member_hook<link_mode<normal_link> >;
+
+      struct event {
+	typename TC::time_point t; // when the callback becomes due
+	uint64_t id; // handle handed back to the caller
+	std::function<void()> f; // the callback
+
+	sh schedule_link; // membership in the time-ordered `schedule`
+	sh event_link; // membership in the id-ordered `events`
+
+	event() : t(TC::time_point::min()), id(0) {}
+	event(uint64_t _id) : t(TC::time_point::min()), id(_id) {} // lookup key
+	event(typename TC::time_point _t, uint64_t _id,
+	      std::function<void()>&& _f) : t(_t), id(_id), f(std::move(_f)) {}
+	event(typename TC::time_point _t, uint64_t _id,
+	      const std::function<void()>& _f) : t(_t), id(_id), f(_f) {}
+	bool operator <(const event& e) const {
+	  return t == e.t ? id < e.id : t < e.t;
+	}
+      };
+      struct SchedCompare { // orders by due time, ties broken by id
+	bool operator()(const event& e1, const event& e2) const {
+	  return e1.t == e2.t ? e1.id < e2.id : e1.t < e2.t;
+	}
+      };
+      struct EventCompare { // orders by id only
+	bool operator()(const event& e1, const event& e2) const {
+	  return e1.id < e2.id;
+	}
+      };
+
+      using schedule_type = set<event,
+				member_hook<event, sh, &event::schedule_link>,
+				constant_time_size<false>,
+				compare<SchedCompare> >;
+
+      schedule_type schedule; // pending events, earliest first
+
+      using event_set_type = set<event,
+				 member_hook<event, sh, &event::event_link>,
+				 constant_time_size<false>,
+				 compare<EventCompare> >;
+
+      event_set_type events; // the same pending events, keyed by id
+
+      std::mutex lock; // guards every member below and both sets
+      using lock_guard = std::lock_guard<std::mutex>;
+      using unique_lock = std::unique_lock<std::mutex>;
+      std::condition_variable cond; // wakes timer_thread
+
+      event* running{ nullptr }; // event whose callback is executing
+      uint64_t next_id{ 0 };
+
+      bool suspended;
+      std::thread thread;
+
+      void timer_thread() { // body of the worker thread
+	unique_lock l(lock);
+	while (!suspended) {
+	  typename TC::time_point now = TC::now();
+
+	  while (!schedule.empty()) {
+	    auto p = schedule.begin();
+	    // Should we wait for the future?
+	    if (p->t > now)
+	      break;
+
+	    event& e = *p;
+	    schedule.erase(e);
+	    events.erase(e);
+
+	    // Since we have only one thread it is impossible to have more
+	    // than one running event
+	    running = &e;
+
+	    l.unlock(); // the callback runs without the lock held
+	    e.f();
+	    l.lock();
+
+	    if (running) {
+	      running = nullptr;
+	      delete &e;
+	    } // Otherwise the event requeued itself
+	  }
+
+	  if (suspended)
+	    break;
+	  if (schedule.empty())
+	    cond.wait(l);
+	  else
+	    cond.wait_until(l, schedule.begin()->t);
+	}
+      }
+
+    public:
+      timer() { // starts the worker thread immediately
+	lock_guard l(lock);
+	suspended = false;
+	thread = std::thread(&timer::timer_thread, this);
+      }
+
+      // Create a suspended timer, jobs will be executed in order when
+      // it is resumed.
+      timer(construct_suspended_t) {
+	lock_guard l(lock);
+	suspended = true;
+      }
+
+      timer(const timer &) = delete;
+      timer& operator=(const timer &) = delete;
+
+      ~timer() { // stops the thread, then frees whatever is still queued
+	suspend();
+	cancel_all_events();
+      }
+
+      // Suspend operation of the timer (and let its thread die).
+      void suspend() {
+	unique_lock l(lock);
+	if (suspended)
+	  return;
+
+	suspended = true;
+	cond.notify_one();
+	l.unlock(); // the thread needs the lock to observe `suspended`
+	thread.join();
+      }
+
+
+      // Resume operation of the timer. (Must have been previously
+      // suspended.)
+      void resume() {
+	unique_lock l(lock);
+	if (!suspended)
+	  return;
+
+	suspended = false;
+	ceph_assert(!thread.joinable());
+	thread = std::thread(&timer::timer_thread, this);
+      }
+
+      // Schedule an event in the relative future
+      template<typename Callable, typename... Args>
+      uint64_t add_event(typename TC::duration duration,
+			 Callable&& f, Args&&... args) {
+	typename TC::time_point when = TC::now();
+	when += duration;
+	return add_event(when,
+			 std::forward<Callable>(f),
+			 std::forward<Args>(args)...);
+      }
+
+      // Schedule an event in the absolute future
+      template<typename Callable, typename... Args>
+      uint64_t add_event(typename TC::time_point when,
+			 Callable&& f, Args&&... args) {
+	std::lock_guard l(lock);
+	event& e = *(new event(
+		       when, ++next_id,
+		       std::forward<std::function<void()> >(
+			 std::bind(std::forward<Callable>(f),
+				   std::forward<Args>(args)...))));
+	auto i = schedule.insert(e);
+	events.insert(e);
+
+	/* If the event we have just inserted comes before everything
+	 * else, we need to adjust our timeout. */
+	if (i.first == schedule.begin())
+	  cond.notify_one();
+
+	// Previously each event was a context, identified by a
+	// pointer, and each context to be called only once. Since you
+	// can queue the same function pointer, member function,
+	// lambda, or functor up multiple times, identifying things by
+	// function for the purposes of cancellation is no longer
+	// suitable. Thus:
+	return e.id;
+      }
+
+      // Adjust the timeout of a currently-scheduled event (relative)
+      bool adjust_event(uint64_t id, typename TC::duration duration) {
+	return adjust_event(id, TC::now() + duration);
+      }
+
+      // Adjust the timeout of a currently-scheduled event (absolute)
+      bool adjust_event(uint64_t id, typename TC::time_point when) {
+	std::lock_guard l(lock);
+	event key(id);
+	auto it = events.find(key);
+	if (it == events.end())
+	  return false; // unknown id, already fired, or currently running
+
+	event& e = *it;
+	schedule.erase(e); // re-insert so the set stays ordered by time
+	e.t = when;
+	auto i = schedule.insert(e);
+	// If the event now comes before everything else, the timer thread
+	// is sleeping on a stale deadline and must recompute its timeout.
+	if (i.first == schedule.begin())
+	  cond.notify_one();
+	return true;
+      }
+
+      // Cancel an event. If the event has already come and gone (or you
+      // never submitted it) you will receive false. Otherwise you will
+      // receive true and it is guaranteed the event will not execute.
+      bool cancel_event(const uint64_t id) {
+	std::lock_guard l(lock);
+	event dummy(id);
+	auto p = events.find(dummy);
+	if (p == events.end()) {
+	  return false;
+	}
+
+	event& e = *p;
+	events.erase(e);
+	schedule.erase(e);
+	delete &e;
+
+	return true;
+      }
+
+      // Reschedules a currently running event in the relative
+      // future. Must be called only from an event executed by this
+      // timer. If you have a function that can be called either from
+      // this timer or some other way, it is your responsibility to make
+      // sure it can tell the difference and does not call
+      // reschedule_me in the non-timer case.
+      //
+      // Returns an event id. If you had an event_id from the first
+      // scheduling, replace it with this return value.
+      uint64_t reschedule_me(typename TC::duration duration) {
+	return reschedule_me(TC::now() + duration);
+      }
+
+      // Reschedules a currently running event in the absolute
+      // future. Must be called only from an event executed by this
+      // timer. If you have a function that can be called either from
+      // this timer or some other way, it is your responsibility to make
+      // sure it can tell the difference and does not call
+      // reschedule_me in the non-timer case.
+      //
+      // Returns an event id. If you had an event_id from the first
+      // scheduling, replace it with this return value.
+      uint64_t reschedule_me(typename TC::time_point when) {
+	if (std::this_thread::get_id() != thread.get_id()) // wrong thread
+	  throw std::make_error_condition(std::errc::operation_not_permitted);
+	std::lock_guard l(lock);
+	running->t = when;
+	uint64_t id = ++next_id;
+	running->id = id;
+	schedule.insert(*running);
+	events.insert(*running);
+
+	// Hacky, but keeps us from being deleted
+	running = nullptr;
+
+	// Same function, but you get a new ID.
+	return id;
+      }
+
+      // Remove all events from the queue.
+      void cancel_all_events() {
+	std::lock_guard l(lock);
+	while (!events.empty()) {
+	  auto p = events.begin();
+	  event& e = *p;
+	  schedule.erase(e);
+	  events.erase(e);
+	  delete &e;
+	}
+      }
+    }; // timer
+  }; // timer_detail
+
+  using timer_detail::timer;
+}; // ceph
+
+#endif
diff --git a/src/common/cmdparse.cc b/src/common/cmdparse.cc
new file mode 100644
index 00000000..5945c0ba
--- /dev/null
+++ b/src/common/cmdparse.cc
@@ -0,0 +1,665 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/cmdparse.h"
+#include "common/Formatter.h"
+#include "common/debug.h"
+#include "common/strtol.h"
+#include "json_spirit/json_spirit.h"
+
+/**
+ * Extract the literal leading words of a command descriptor: for
+ * "foo baz name=bar,type=CephString" the result is "foo baz".
+ */
+std::string cmddesc_get_prefix(const std::string &cmddesc)
+{
+  std::string prefix;
+  std::string token;
+  std::stringstream input(cmddesc);
+  for (bool have_word = false;
+       std::getline(input, token, ' ');
+       have_word = true) {
+    // the first token carrying ',' or '=' starts the argument descriptors
+    if (token.find_first_of(",=") != std::string::npos) {
+      break;
+    }
+    if (have_word) {
+      prefix += " ";
+    }
+    prefix += token;
+  }
+
+  return prefix;
+}
+
+using arg_desc_t = std::map<std::string_view, std::string_view>;
+
+// Collect the key=value pairs of a comma-separated descriptor into a dict.
+template<class String>
+arg_desc_t cmddesc_get_args(const String& cmddesc)
+{
+  arg_desc_t parsed;
+  for_each_substr(cmddesc, ",", [&parsed](auto item) {
+      // only "key=value" items are recorded (name="name" means arg dict
+      // will be titled 'name'); an item with no '=' should be the
+      // command word and is left out
+      const auto eq_pos = item.find('=');
+      if (eq_pos != item.npos) {
+	// a repeated key keeps the value seen last
+	const auto name = item.substr(0, eq_pos);
+	const auto text = item.substr(eq_pos + 1);
+	parsed[name] = text;
+      }
+    });
+  return parsed;
+}
+
+std::string cmddesc_get_prenautilus_compat(const std::string &cmddesc) // rewrites CephBool args as CephChoices
+{
+  std::vector<std::string> out;
+  stringstream ss(cmddesc);
+  std::string word;
+  bool changed = false;
+  while (std::getline(ss, word, ' ')) {
+    // if no , or =, must be a plain word to put out
+    if (word.find_first_of(",=") == string::npos) {
+      out.push_back(word);
+      continue;
+    }
+    auto desckv = cmddesc_get_args(word); // string_view map; presumably views into 'word' - confirm
+    auto j = desckv.find("type");
+    if (j != desckv.end() && j->second == "CephBool") {
+      // Instruct legacy clients or mons to send --foo-bar string in place
+      // of a 'true'/'false' value
+      std::ostringstream oss;
+      oss << std::string("--") << desckv["name"];
+      std::string val = oss.str();
+      std::replace(val.begin(), val.end(), '_', '-');
+      desckv["type"] = "CephChoices";
+      desckv["strings"] = val; // a view of 'val', which is still in scope while fss is built
+      std::ostringstream fss;
+      for (auto k = desckv.begin(); k != desckv.end(); ++k) { // keys come out in the map's sorted order
+	if (k != desckv.begin()) {
+	  fss << ",";
+	}
+	fss << k->first << "=" << k->second;
+      }
+      out.push_back(fss.str());
+      changed = true;
+    } else {
+      out.push_back(word);
+    }
+  }
+  if (!changed) {
+    return cmddesc; // nothing was rewritten: return the input untouched
+  }
+  std::string o;
+  for (auto i = out.begin(); i != out.end(); ++i) { // re-join the words with single spaces
+    if (i != out.begin()) {
+      o += " ";
+    }
+    o += *i;
+  }
+  return o;
+}
+
+/**
+ * Read a command description list out of cmd, and dump it to f.
+ * A signature description is a set of space-separated words;
+ * see MonCommands.h for more info.  When 'features' lacks
+ * SERVER_NAUTILUS, CephBool arguments are dumped as CephChoices. */
+
+void
+dump_cmd_to_json(Formatter *f, uint64_t features, const string& cmd)
+{
+  // put whole command signature in an already-opened container
+  // elements are: "name", meaning "the typeless name that means a literal"
+  // an object {} with key:value pairs representing an argument
+
+  stringstream ss(cmd);
+  std::string word;
+
+  while (std::getline(ss, word, ' ')) {
+    // if no , or =, must be a plain word to put out
+    if (word.find_first_of(",=") == string::npos) {
+      f->dump_string("arg", word);
+      continue;
+    }
+    // accumulate descriptor keywords in desckv
+    auto desckv = cmddesc_get_args(word);
+    // name the individual desc object based on the name key
+    f->open_object_section(string(desckv["name"]).c_str());
+
+    // Compatibility for pre-nautilus clients that don't know about CephBool
+    std::string val; // declared out here because desckv["strings"] may end up viewing it
+    if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+      auto i = desckv.find("type");
+      if (i != desckv.end() && i->second == "CephBool") {
+        // Instruct legacy clients to send --foo-bar string in place
+        // of a 'true'/'false' value
+        std::ostringstream oss;
+        oss << std::string("--") << desckv["name"];
+        val = oss.str();
+        std::replace(val.begin(), val.end(), '_', '-');
+
+        desckv["type"] = "CephChoices";
+        desckv["strings"] = val;
+      }
+    }
+
+    // dump all the keys including name into the array
+    for (auto [key, value] : desckv) {
+      f->dump_string(string(key).c_str(), string(value));
+    }
+    f->close_section(); // attribute object for individual desc
+  }
+}
+
+void
+dump_cmd_and_help_to_json(Formatter *jf,
+			  uint64_t features,
+			  const string& secname,
+			  const string& cmdsig,
+			  const string& helptext)
+{
+      jf->open_object_section(secname.c_str()); // one object per command, named secname
+      jf->open_array_section("sig");
+      dump_cmd_to_json(jf, features, cmdsig); // signature words and argument objects
+      jf->close_section(); // sig array
+      jf->dump_string("help", helptext.c_str());
+      jf->close_section(); // cmd
+}
+
+void
+dump_cmddesc_to_json(Formatter *jf,
+		     uint64_t features,
+		     const string& secname,
+		     const string& cmdsig,
+		     const string& helptext,
+		     const string& module,
+		     const string& perm,
+		     uint64_t flags)
+{
+      jf->open_object_section(secname.c_str()); // one object per command, named secname
+      jf->open_array_section("sig");
+      dump_cmd_to_json(jf, features, cmdsig); // signature words and argument objects
+      jf->close_section(); // sig array
+      jf->dump_string("help", helptext.c_str());
+      jf->dump_string("module", module.c_str());
+      jf->dump_string("perm", perm.c_str());
+      jf->dump_int("flags", flags); // emitted through dump_int although flags is a uint64_t
+      jf->close_section(); // cmd
+}
+
+void cmdmap_dump(const cmdmap_t &cmdmap, Formatter *f) // dump every cmdmap entry into f
+{
+  ceph_assert(f != nullptr);
+
+  class dump_visitor : public boost::static_visitor<void>
+  {
+    Formatter *f;
+    std::string const &key; // refers to the map key; the visitor is only used as a temporary
+    public:
+    dump_visitor(Formatter *f_, std::string const &key_)
+      : f(f_), key(key_)
+    {
+    }
+
+    void operator()(const std::string &operand) const
+    {
+      f->dump_string(key.c_str(), operand);
+    }
+
+    void operator()(const bool &operand) const
+    {
+      f->dump_bool(key.c_str(), operand);
+    }
+
+    void operator()(const int64_t &operand) const
+    {
+      f->dump_int(key.c_str(), operand);
+    }
+
+    void operator()(const double &operand) const
+    {
+      f->dump_float(key.c_str(), operand);
+    }
+
+    void operator()(const std::vector<std::string> &operand) const
+    {
+      f->open_array_section(key.c_str());
+      for (const auto &i : operand) { // by reference: avoids copying each string
+        f->dump_string("item", i);
+      }
+      f->close_section();
+    }
+
+    void operator()(const std::vector<int64_t> &operand) const
+    {
+      f->open_array_section(key.c_str());
+      for (const auto i : operand) {
+        f->dump_int("item", i);
+      }
+      f->close_section();
+    }
+
+    void operator()(const std::vector<double> &operand) const
+    {
+      f->open_array_section(key.c_str());
+      for (const auto i : operand) {
+        f->dump_float("item", i);
+      }
+      f->close_section();
+    }
+  };
+
+  //f->open_object_section("cmdmap");
+  for (const auto &i : cmdmap) { // scalars become one field, vectors an array of "item"s
+    boost::apply_visitor(dump_visitor(f, i.first), i.second);
+  }
+  //f->close_section();
+}
+
+
+/** Parse the JSON object formed by concatenating the strings in 'cmd'
+ * into *mapp (field name -> cmd_vartype); nested objects are rejected.
+ * 'cmd' should not disappear over lifetime of map
+ * 'mapp' points to the caller's map; entries stored before a failure
+ * remain in it.  'ss' receives the error text; it is meaningful only
+ * when the function returns false */
+
+bool
+cmdmap_from_json(vector<string> cmd, cmdmap_t *mapp, stringstream &ss)
+{
+  json_spirit::mValue v;
+
+  string fullcmd;
+  // First, join all cmd strings
+  for (vector<string>::iterator it = cmd.begin();
+       it != cmd.end(); ++it)
+    fullcmd += *it;
+
+  try {
+    if (!json_spirit::read(fullcmd, v))
+      throw runtime_error("unparseable JSON " + fullcmd);
+    if (v.type() != json_spirit::obj_type)
+      throw(runtime_error("not JSON object " + fullcmd));
+
+    // walk a copy of the top-level object; each member must be a
+    // scalar or an array of strings, ints or doubles
+    json_spirit::mObject o = v.get_obj();
+    for (map<string, json_spirit::mValue>::iterator it = o.begin();
+	 it != o.end(); ++it) {
+
+      // ok, marshal it into our string->cmd_vartype map, or throw an
+      // exception if it's not a simple datatype.  This is kind of
+      // annoying, since json_spirit has a boost::variant inside it
+      // already, but it's not public.  Oh well.
+
+      switch (it->second.type()) {
+
+      case json_spirit::obj_type:
+      default:
+	throw(runtime_error("JSON array/object not allowed " + fullcmd));
+        break;
+
+      case json_spirit::array_type:
+	{
+	  // array is a vector of values.  Unpack it to a vector
+	  // of strings, doubles, or int64_t, the only types we handle.
+	  const vector<json_spirit::mValue>& spvals = it->second.get_array();
+	  if (spvals.empty()) {
+	    // if an empty array is acceptable, the caller should always check for
+	    // vector<string> if the expected value of "vector<int64_t>" in the
+	    // cmdmap is missing.
+	    (*mapp)[it->first] = vector<string>();
+	  } else if (spvals.front().type() == json_spirit::str_type) {
+	    vector<string> outv;
+	    for (const auto& sv : spvals) {
+	      if (sv.type() != json_spirit::str_type) {
+		throw(runtime_error("Can't handle arrays of multiple types"));
+	      }
+	      outv.push_back(sv.get_str());
+	    }
+	    (*mapp)[it->first] = std::move(outv);
+	  } else if (spvals.front().type() == json_spirit::int_type) {
+	    vector<int64_t> outv;
+	    for (const auto& sv : spvals) {
+	      if (sv.type() != json_spirit::int_type) { // check each element, not just the first
+		throw(runtime_error("Can't handle arrays of multiple types"));
+	      }
+	      outv.push_back(sv.get_int64());
+	    }
+	    (*mapp)[it->first] = std::move(outv);
+	  } else if (spvals.front().type() == json_spirit::real_type) {
+	    vector<double> outv;
+	    for (const auto& sv : spvals) {
+	      if (spvals.front().type() != json_spirit::real_type) { // NOTE(review): tests front(), not sv; left as-is since get_real() may accept ints - confirm
+		throw(runtime_error("Can't handle arrays of multiple types"));
+	      }
+	      outv.push_back(sv.get_real());
+	    }
+	    (*mapp)[it->first] = std::move(outv);
+	  } else {
+	    throw(runtime_error("Can't handle arrays of types other than "
+				"int, string, or double"));
+	  }
+	}
+	break;
+      case json_spirit::str_type:
+	(*mapp)[it->first] = it->second.get_str();
+	break;
+
+      case json_spirit::bool_type:
+	(*mapp)[it->first] = it->second.get_bool();
+	break;
+
+      case json_spirit::int_type:
+	(*mapp)[it->first] = it->second.get_int64();
+	break;
+
+      case json_spirit::real_type:
+	(*mapp)[it->first] = it->second.get_real();
+	break;
+      }
+    }
+    return true;
+  } catch (runtime_error &e) { // covers the throws above and any runtime_error raised by the JSON accessors
+    ss << e.what();
+    return false;
+  }
+}
+
+// Renders the alternative held by a cmd_vartype through its operator<<.
+struct stringify_visitor : boost::static_visitor<string>
+{
+  template <typename T>
+  string operator()(T &operand) const
+  {
+    ostringstream rendered;
+    rendered << operand;
+    return rendered.str();
+  }
+};
+
+// Text form of a command argument value, whatever type it carries.
+string cmd_vartype_stringify(const cmd_vartype &v)
+{
+  return boost::apply_visitor(stringify_visitor(), v);
+}
+
+
+void
+handle_bad_get(CephContext *cct, const string& k, const char *tname)
+{
+  // try to demangle tname; on success the buffer must be free()d below
+  int rc = 0;
+  char *demangled = abi::__cxa_demangle(tname, 0, 0, &rc);
+  const bool owned = (rc == 0);
+  const char *shown = owned ? demangled : tname;
+  ostringstream msg;
+  msg << "bad boost::get: key " << k << " is not type " << shown;
+  lderr(cct) << msg.str() << dendl;
+
+  ostringstream trace;
+  trace << BackTrace(1);
+  lderr(cct) << trace.str() << dendl;
+  if (owned)
+    free(demangled);
+}
+
+long parse_pos_long(const char *s, std::ostream *pss)
+{
+  if (*s == '+' || *s == '-') { // an explicit sign is refused outright
+    if (pss)
+      *pss << "expected numerical value, got: " << s;
+    return -EINVAL;
+  }
+
+  string parse_error;
+  const long parsed = strict_strtol(s, 10, &parse_error);
+  if (!parse_error.empty() && parsed == 0) { // the parser reported a problem
+    if (pss)
+      *pss << parse_error;
+    return -1;
+  }
+  if (parsed < 0) {
+    if (pss)
+      *pss << "unable to parse positive integer '" << s << "'";
+    return -1;
+  }
+  return parsed; // success: the parsed value, never negative
+}
+
+int parse_osd_id(const char *s, std::ostream *pss)
+{
+  // osd.NNN?  (an optional "osd." prefix is skipped)
+  if (strncmp(s, "osd.", 4) == 0) {
+    s += 4;
+  }
+  // Returns the id, or a negative value on error; pss may be null.
+  // NNN?
+  ostringstream ss;
+  long id = parse_pos_long(s, &ss);
+  if (id < 0) {
+    if (pss) *pss << ss.str();
+    return id;
+  }
+  if (id > 0xffff) {
+    if (pss) *pss << "osd id " << id << " is too large";
+    return -ERANGE;
+  }
+  return id;
+}
+
+namespace {
+template <typename Func>
+bool find_first_in(std::string_view s, const char *delims, Func&& f)
+{
+  // feed each delimiter-separated token of s to f until f accepts one
+  for (auto start = s.find_first_not_of(delims); start != s.npos; ) {
+    s.remove_prefix(start);
+    const auto stop = s.find_first_of(delims);
+    if (f(s.substr(0, stop))) {
+      return true;
+    }
+    start = s.find_first_not_of(delims, stop);
+  }
+  return false;
+}
+
+template<typename T>
+T str_to_num(const std::string& s)
+{
+  if constexpr (std::is_same_v<T, double>) {
+    return std::stod(s);
+  } else if constexpr (std::is_same_v<T, long long>) {
+    return std::stoll(s);
+  } else if constexpr (std::is_same_v<T, long>) {
+    return std::stol(s);
+  } else if constexpr (std::is_same_v<T, int>) {
+    return std::stoi(s);
+  } // any other T has no branch here
+}
+
+template<typename T>
+bool arg_in_range(T value, const arg_desc_t& desc, std::ostream& os) {
+  const auto range = desc.find("range");
+  if (range == desc.end()) {
+    return true;   // no "range" key: nothing to enforce
+  }
+  // the range is "min" or "min|max"; with no explicit max only the
+  // numeric limit of T applies
+  const auto bounds = get_str_list(string(range->second), "|");
+  const T lower = str_to_num<T>(bounds.front());
+  const T upper = bounds.size() > 1 ? str_to_num<T>(bounds.back())
+				    : numeric_limits<T>::max();
+  const bool outside = (value < lower || value > upper);
+  if (outside) {
+    os << "'" << value << "' out of range: " << bounds;
+  }
+  return !outside;
+}
+
+bool validate_str_arg(std::string_view value,
+		      std::string_view type,
+		      const arg_desc_t& desc,
+		      std::ostream& os)
+{
+  if (type == "CephIPAddr") {
+    entity_addr_t addr;
+    if (!addr.parse(string(value).c_str())) {
+      os << "failed to parse addr '" << value << "', should be ip:[port]";
+      return false;
+    }
+    return true;
+  }
+  if (type == "CephChoices") {
+    // the descriptor's "strings" entry lists the accepted values,
+    // separated by '|'; value must equal one of them exactly
+    const auto choices = desc.find("strings");
+    ceph_assert(choices != end(desc));
+    const auto strings = choices->second;
+    const bool known = find_first_in(strings, "|", [=](auto choice) {
+	return (value == choice);
+      });
+    if (!known) {
+      os << "'" << value << "' not belong to '" << strings << "'";
+    }
+    return known;
+  }
+  // CephString or other types like CephPgid
+  return true;
+}
+
+template<bool is_vector,
+	 typename T,
+	 typename Value = conditional_t<is_vector,
+					vector<T>,
+					T>>
+bool validate_arg(CephContext* cct,
+		  const cmdmap_t& cmdmap,
+		  const arg_desc_t& desc,
+		  const std::string_view name,
+		  const std::string_view type,
+		  std::ostream& os)
+{
+  Value v; // vector<T> when is_vector, otherwise a single T
+  try {
+    if (!cmd_getval(cct, cmdmap, string(name), v)) {
+      if constexpr (is_vector) {
+	  // an empty list is acceptable.
+	  return true;
+	} else {
+	if (auto req = desc.find("req");
+	    req != end(desc) && req->second == "false") {
+	  return true; // absent, but the descriptor marks it req=false
+	} else {
+	  os << "missing required parameter: '" << name << "'";
+	  return false;
+	}
+      }
+    }
+  } catch (const bad_cmd_get& e) {
+    return false; // wrong stored type; note that nothing is written to os here
+  }
+  auto validate = [&](const T& value) {
+    if constexpr (is_same_v<std::string, T>) {
+      return validate_str_arg(value, type, desc, os);
+    } else if constexpr (is_same_v<int64_t, T> ||
+			 is_same_v<double, T>) {
+      return arg_in_range(value, desc, os);
+    }
+  };
+  if constexpr(is_vector) {
+    return find_if_not(begin(v), end(v), validate) == end(v); // stops at the first invalid element
+  } else {
+    return validate(v);
+  }
+}
+} // anonymous namespace
+
+bool validate_cmd(CephContext* cct,
+		  const std::string& desc,
+		  const cmdmap_t& cmdmap,
+		  std::ostream& os)
+{
+  return !find_first_in(desc, " ", [&](auto desc) { // the callback returns true for the first INVALID argument
+    auto arg_desc = cmddesc_get_args(desc);
+    if (arg_desc.empty()) {
+      return false; // a word with no key=value pairs: nothing to validate
+    }
+    ceph_assert(arg_desc.count("name"));
+    ceph_assert(arg_desc.count("type"));
+    auto name = arg_desc["name"];
+    auto type = arg_desc["type"];
+    if (arg_desc.count("n")) { // an "n" key selects the vector form of the argument
+      if (type == "CephInt") {
+	return !validate_arg<true, int64_t>(cct, cmdmap, arg_desc,
+					    name, type, os);
+      } else if (type == "CephFloat") {
+	return !validate_arg<true, double>(cct, cmdmap, arg_desc,
+					    name, type, os);
+      } else {
+	return !validate_arg<true, string>(cct, cmdmap, arg_desc,
+					   name, type, os);
+      }
+    } else {
+      if (type == "CephInt") {
+	return !validate_arg<false, int64_t>(cct, cmdmap, arg_desc,
+					    name, type, os);
+      } else if (type == "CephFloat") {
+	return !validate_arg<false, double>(cct, cmdmap, arg_desc,
+					    name, type, os);
+      } else { // every other type is validated as a string
+	return !validate_arg<false, string>(cct, cmdmap, arg_desc,
+					    name, type, os);
+      }
+    }
+  });
+}
+
+bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
+		const std::string& k, bool& val)
+{
+  /*
+   * Specialized getval for booleans.  CephBool didn't exist before Nautilus,
+   * so earlier clients are sent a CephChoices argdesc instead, and will
+   * send us a "--foo-bar" value string for boolean arguments.
+   */
+  if (cmdmap.count(k)) {
+    try {
+      val = boost::get<bool>(cmdmap.find(k)->second);
+      return true;
+    } catch (boost::bad_get&) { // not stored as a bool: try the legacy string form
+      try {
+        std::string expected = "--" + k;
+        std::replace(expected.begin(), expected.end(), '_', '-');
+
+        std::string v_str = boost::get<std::string>(cmdmap.find(k)->second);
+        if (v_str == expected) {
+          val = true; // the legacy form can only ever yield true
+          return true;
+        } else {
+          throw bad_cmd_get(k, cmdmap);
+        }
+      } catch (boost::bad_get&) { // neither a bool nor a string
+        throw bad_cmd_get(k, cmdmap);
+      }
+    }
+  }
+  return false; // key absent: val is left untouched
+}
+
+
diff --git a/src/common/cmdparse.h b/src/common/cmdparse.h
new file mode 100644
index 00000000..2c59567d
--- /dev/null
+++ b/src/common/cmdparse.h
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_COMMON_CMDPARSE_H
+#define CEPH_COMMON_CMDPARSE_H
+
+#include <vector>
+#include <stdexcept>
+#include <ostream>
+#include <boost/variant.hpp>
+#include "include/ceph_assert.h"	// boost clobbers this
+#include "common/Formatter.h"
+#include "common/BackTrace.h"
+
+class CephContext;
+
+typedef boost::variant<std::string,
+		       bool,
+		       int64_t,
+		       double,
+		       std::vector<std::string>,
+		       std::vector<int64_t>,
+		       std::vector<double>>  cmd_vartype;
+typedef std::map<std::string, cmd_vartype, std::less<>> cmdmap_t;
+
+std::string cmddesc_get_prefix(const std::string &cmddesc);
+std::string cmddesc_get_prenautilus_compat(const std::string &cmddesc);
+void dump_cmd_to_json(ceph::Formatter *f, uint64_t features,
+                      const std::string& cmd);
+void dump_cmd_and_help_to_json(ceph::Formatter *f,
+			       uint64_t features,
+			       const std::string& secname,
+			       const std::string& cmd,
+			       const std::string& helptext);
+void dump_cmddesc_to_json(ceph::Formatter *jf,
+		          uint64_t features,
+		          const std::string& secname,
+		          const std::string& cmdsig,
+		          const std::string& helptext,
+		          const std::string& module,
+		          const std::string& perm,
+		          uint64_t flags);
+bool cmdmap_from_json(std::vector<std::string> cmd, cmdmap_t *mapp,
+		      std::stringstream &ss);
+void cmdmap_dump(const cmdmap_t &cmdmap, ceph::Formatter *f);
+void handle_bad_get(CephContext *cct, const std::string& k, const char *name);
+
+std::string cmd_vartype_stringify(const cmd_vartype& v);
+
+// Thrown by cmd_getval() on a type mismatch; 'cmdmap' is currently unused.
+struct bad_cmd_get : public std::exception {
+  std::string desc;
+  bad_cmd_get(const std::string& f, const cmdmap_t& cmdmap)
+    : desc("bad or missing field '" + f + "'") {}
+  const char *what() const throw() override {
+    return desc.c_str();
+  }
+};
+
+bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
+		const std::string& k, bool& val);
+
+template <typename T>
+bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
+		const std::string& k, T& val)
+{
+  const auto found = cmdmap.find(k);
+  if (found != cmdmap.end()) {
+    try {
+      val = boost::get<T>(found->second);
+      return true;
+    } catch (boost::bad_get&) { // present, but not stored as a T
+      throw bad_cmd_get(k, cmdmap);
+    }
+  }
+  return false; // key absent: val is left untouched
+}
+
+// with default
+
+template <typename T>
+bool cmd_getval(
+  CephContext *cct, const cmdmap_t& cmdmap, const std::string& k,
+  T& val, const T& defval)
+{
+  const auto found = cmdmap.find(k);
+  if (found == cmdmap.end()) {
+    val = defval; // absent key: fall back to the caller's default
+    return true;
+  }
+  try {
+    val = boost::get<T>(found->second);
+  } catch (boost::bad_get&) { // present, but not stored as a T
+    throw bad_cmd_get(k, cmdmap);
+  }
+  return true;
+}
+
+template <typename T>
+void
+cmd_putval(CephContext *cct, cmdmap_t& cmdmap, const std::string& k, const T& val)
+{
+  cmdmap[k] = val; // inserts k or overwrites its existing value; cct is not used
+}
+
+bool validate_cmd(CephContext* cct,
+		  const std::string& desc,
+		  const cmdmap_t& cmdmap,
+		  std::ostream& os);
+extern int parse_osd_id(const char *s, std::ostream *pss);
+extern long parse_pos_long(const char *s, std::ostream *pss = NULL);
+
+#endif
diff --git a/src/common/code_environment.cc b/src/common/code_environment.cc
new file mode 100644
index 00000000..414e2b62
--- /dev/null
+++ b/src/common/code_environment.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/code_environment.h"
+
+#include <iostream>
+
+#include "acconfig.h"
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#include <string.h>
+
+code_environment_t g_code_env = CODE_ENVIRONMENT_UTILITY;
+
+extern "C" const char *code_environment_to_str(enum code_environment_t e)
+{
+  // any value not listed below maps to NULL
+  const char *name = NULL;
+  if (e == CODE_ENVIRONMENT_UTILITY) {
+    name = "CODE_ENVIRONMENT_UTILITY";
+  } else if (e == CODE_ENVIRONMENT_DAEMON) {
+    name = "CODE_ENVIRONMENT_DAEMON";
+  } else if (e == CODE_ENVIRONMENT_LIBRARY) {
+    name = "CODE_ENVIRONMENT_LIBRARY";
+  }
+  return name;
+}
+
+std::ostream &operator<<(std::ostream &oss, const enum code_environment_t e)
+{
+  const char *name = code_environment_to_str(e); // NULL for values it does not list
+  return oss << (name ? name : "(unknown code environment)"); // never stream a null pointer
+}
+
+#if defined(HAVE_SYS_PRCTL_H) && defined(PR_GET_NAME) /* Since 2.6.11 */
+
+int get_process_name(char *buf, int len) // prctl(PR_GET_NAME) variant
+{
+  if (len <= 16) {
+    /* The man page discourages using this prctl with a buffer shorter
+     * than 16 bytes. With a 16-byte buffer, it might not be
+     * null-terminated. */
+    return -ENAMETOOLONG;
+  }
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(buf, 0, len);
+  return prctl(PR_GET_NAME, buf); // prctl's own result is returned as-is
+}
+
+#elif defined(HAVE_GETPROGNAME)
+
+int get_process_name(char *buf, int len) // getprogname() variant
+{
+  if (len <= 0) {
+    return -EINVAL;
+  }
+
+  const char *progname = getprogname();
+  if (progname == nullptr || *progname == '\0') {
+    return -ENOSYS;
+  }
+
+  strncpy(buf, progname, len - 1); // copy at most len-1 bytes, then terminate explicitly
+  buf[len - 1] = '\0';
+  return 0;
+}
+
+#else
+
+int get_process_name(char *buf, int len) // fallback: no way to obtain the name
+{
+  return -ENOSYS;
+}
+
+#endif
+
+std::string get_process_name_cpp()
+{
+  // a non-zero status from get_process_name() means no name is available
+  char name[32];
+  const int status = get_process_name(name, sizeof(name));
+  const bool have_name = (status == 0);
+  return have_name ? std::string(name) : std::string("(unknown)");
+}
diff --git a/src/common/code_environment.h b/src/common/code_environment.h
new file mode 100644
index 00000000..b94ba52c
--- /dev/null
+++ b/src/common/code_environment.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_CODE_ENVIRONMENT_H
+#define CEPH_COMMON_CODE_ENVIRONMENT_H
+
+enum code_environment_t {
+  CODE_ENVIRONMENT_UTILITY = 0,
+  CODE_ENVIRONMENT_DAEMON = 1,
+  CODE_ENVIRONMENT_LIBRARY = 2,
+  CODE_ENVIRONMENT_UTILITY_NODOUT = 3, /* code_environment_to_str() has no name for this value */
+};
+
+#ifdef __cplusplus
+#include <iosfwd>
+#include <string>
+
+extern "C" code_environment_t g_code_env;
+extern "C" const char *code_environment_to_str(enum code_environment_t e);
+std::ostream &operator<<(std::ostream &oss, const enum code_environment_t e);
+extern "C" int get_process_name(char *buf, int len);
+std::string get_process_name_cpp();
+
+#else
+/* C has no implicit typedef for enum tags, so "enum" must be spelled out */
+extern enum code_environment_t g_code_env;
+const char *code_environment_to_str(const enum code_environment_t e);
+extern int get_process_name(char *buf, int len);
+
+#endif
+
+#endif
diff --git a/src/common/cohort_lru.h b/src/common/cohort_lru.h
new file mode 100644
index 00000000..2383fc95
--- /dev/null
+++ b/src/common/cohort_lru.h
@@ -0,0 +1,501 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copyright (C) 2015 CohortFS, LLC.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.	See file COPYING.
+ *
+ */
+
+#ifndef COHORT_LRU_H
+#define COHORT_LRU_H
+
+#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/slist.hpp>
+
+#include "common/likely.h"
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64 /* XXX arch-specific define */
+#endif
+#define CACHE_PAD(_n) char __pad ## _n [CACHE_LINE_SIZE]
+
+namespace cohort {
+
+  namespace lru {
+
+    namespace bi = boost::intrusive;
+
+    /* public flag values */
+    constexpr uint32_t FLAG_NONE = 0x0000;
+    constexpr uint32_t FLAG_INITIAL = 0x0001;
+    constexpr uint32_t FLAG_RECYCLE = 0x0002;
+
+    enum class Edge : std::uint8_t
+    {
+      MRU = 0,
+      LRU
+    };
+
+    typedef bi::link_mode<bi::safe_link> link_mode;
+
+    class ObjectFactory; // Forward declaration
+
+    class Object
+    {
+    private:
+      uint32_t lru_flags;
+      std::atomic<uint32_t> lru_refcnt;
+      std::atomic<uint32_t> lru_adj; // counter LRU::ref() uses to decide when to move the object to MRU
+      bi::list_member_hook< link_mode > lru_hook;
+
+      typedef bi::list<Object,
+		       bi::member_hook<
+			 Object, bi::list_member_hook< link_mode >,
+			 &Object::lru_hook >,
+		       bi::constant_time_size<true>> Queue;
+
+      bi::slist_member_hook< link_mode > q2_hook;
+
+      typedef bi::slist<Object,
+			bi::member_hook<
+			  Object, bi::slist_member_hook< link_mode >,
+			  &Object::q2_hook >,
+			bi::constant_time_size<true>> Queue2;
+
+    public:
+
+      Object() : lru_flags(FLAG_NONE), lru_refcnt(0), lru_adj(0) {}
+
+      uint32_t get_refcnt() const { return lru_refcnt; }
+      /* called by LRU::evict_block(); returning true lets the LRU unlink the object for re-use */
+      virtual bool reclaim(const ObjectFactory* newobj_fac) = 0;
+
+      virtual ~Object() {}
+
+    private:
+      template <typename LK>
+      friend class LRU; // LRU manipulates the private flags, counters and hooks directly
+
+      template <typename T, typename TTree, typename CLT, typename CEQ,
+	      typename K, typename LK>
+      friend class TreeX;
+    };
+
+    /* allocator & recycler interface (create or re-use LRU objects) */
+    class ObjectFactory
+    {
+    public:
+      virtual Object* alloc(void) = 0; // LRU::insert() calls this when nothing could be evicted
+      virtual void recycle(Object*) = 0; // LRU::insert() calls this on an evicted object before re-use
+      virtual ~ObjectFactory() {};
+    };
+
+    template <typename LK>
+    class LRU
+    {
+    private:
+
+      struct Lane {
+	LK lock;
+	Object::Queue q;
+	// Object::Queue pinned; /* placeholder for possible expansion */
+	CACHE_PAD(0);
+	Lane() {}
+      };
+
+      Lane *qlane;
+      int n_lanes;
+      std::atomic<uint32_t> evict_lane;
+      const uint32_t lane_hiwat;
+
+      static constexpr uint32_t lru_adj_modulus = 5;
+
+      static constexpr uint32_t SENTINEL_REFCNT = 1;
+
+      /* internal flag values */
+      static constexpr uint32_t FLAG_INLRU = 0x0001;
+      static constexpr uint32_t FLAG_PINNED  = 0x0002; // possible future use
+      static constexpr uint32_t FLAG_EVICTING = 0x0004;
+
+      Lane& lane_of(void* addr) {
+	return qlane[(uint64_t)(addr) % n_lanes];
+      }
+
+      uint32_t next_evict_lane() {
+	return (evict_lane++ % n_lanes);
+      }
+
+      bool can_reclaim(Object* o) {
+	return ((o->lru_refcnt == SENTINEL_REFCNT) &&
+		(!(o->lru_flags & FLAG_EVICTING)));
+      }
+
+      Object* evict_block(const ObjectFactory* newobj_fac) {
+	uint32_t lane_ix = next_evict_lane();
+	for (int ix = 0; ix < n_lanes; ++ix,
+	       lane_ix = next_evict_lane()) {
+	  Lane& lane = qlane[lane_ix];
+	  lane.lock.lock();
+	  /* if object at LRU has refcnt==1, it may be reclaimable */
+	  Object* o = &(lane.q.back());
+	  if (can_reclaim(o)) {
+	    ++(o->lru_refcnt);
+	    o->lru_flags |= FLAG_EVICTING;
+	    lane.lock.unlock();
+	    if (o->reclaim(newobj_fac)) {
+	      lane.lock.lock();
+	      --(o->lru_refcnt);
+	      /* assertions that o state has not changed across
+	       * relock */
+	      ceph_assert(o->lru_refcnt == SENTINEL_REFCNT);
+	      ceph_assert(o->lru_flags & FLAG_INLRU);
+	      Object::Queue::iterator it =
+		Object::Queue::s_iterator_to(*o);
+	      lane.q.erase(it);
+	      lane.lock.unlock();
+	      return o;
+	    } else {
+	      // XXX can't make unreachable (means what?)
+	      --(o->lru_refcnt);
+	      o->lru_flags &= ~FLAG_EVICTING;
+	      /* unlock in next block */
+	    }
+	  } /* can_reclaim(o) */
+	  lane.lock.unlock();
+	} /* each lane */
+	return nullptr;
+      } /* evict_block */
+
+    public:
+
+      LRU(int lanes, uint32_t _hiwat)
+	: n_lanes(lanes), evict_lane(0), lane_hiwat(_hiwat)
+	  {
+	    ceph_assert(n_lanes > 0);
+	    qlane = new Lane[n_lanes];
+	  }
+
+      ~LRU() { delete[] qlane; }
+
+      bool ref(Object* o, uint32_t flags) {
+	++(o->lru_refcnt);
+	if (flags & FLAG_INITIAL) {
+	  if ((++(o->lru_adj) % lru_adj_modulus) == 0) {
+	    Lane& lane = lane_of(o);
+	    lane.lock.lock();
+	    /* move to MRU */
+	    Object::Queue::iterator it =
+	      Object::Queue::s_iterator_to(*o);
+	    lane.q.erase(it);
+	    lane.q.push_front(*o);
+	    lane.lock.unlock();
+	  } /* adj */
+	} /* initial ref */
+	return true;
+      } /* ref */
+
+      void unref(Object* o, uint32_t flags) { /* drop one reference; may unlink and delete o (flags is unused here) */
+	uint32_t refcnt = --(o->lru_refcnt);
+	Object* tdo = nullptr; /* object to delete once the lane lock is dropped */
+	if (unlikely(refcnt == 0)) {
+	  Lane& lane = lane_of(o);
+	  lane.lock.lock();
+	  refcnt = o->lru_refcnt.load(); /* re-read under the lane lock */
+	  if (unlikely(refcnt == 0)) {
+	    Object::Queue::iterator it =
+	      Object::Queue::s_iterator_to(*o);
+	    lane.q.erase(it);
+	    tdo = o;
+	  }
+	  lane.lock.unlock();
+	} else if (unlikely(refcnt == SENTINEL_REFCNT)) { /* count fell to SENTINEL_REFCNT */
+	  Lane& lane = lane_of(o);
+	  lane.lock.lock();
+	  refcnt = o->lru_refcnt.load(); /* re-read under the lane lock */
+	  if (likely(refcnt == SENTINEL_REFCNT)) {
+	    /* move to LRU (back of the lane queue) */
+	    Object::Queue::iterator it =
+	      Object::Queue::s_iterator_to(*o);
+	    lane.q.erase(it);
+	    /* hiwat check: a lane above lane_hiwat frees o instead of re-queueing it */
+	    if (lane.q.size() > lane_hiwat) {
+	      tdo = o;
+	    } else {
+	      lane.q.push_back(*o);
+	    }
+	  }
+	  lane.lock.unlock();
+	}
+	/* unref out-of-line && !LOCKED */
+	if (tdo)
+	  delete tdo;
+      } /* unref */
+
+      Object* insert(ObjectFactory* fac, Edge edge, uint32_t& flags) { /* returns the queued object; ORs FLAG_RECYCLE into flags on reuse */
+	/* use supplied functor to re-use an evicted object, or
+	 * allocate a new one of the descendant type */
+	Object* o = evict_block(fac);
+	if (o) {
+	  fac->recycle(o); /* recycle existing object */
+	  flags |= FLAG_RECYCLE;
+	}
+	else
+	  o = fac->alloc(); /* get a new one */
+
+	o->lru_flags = FLAG_INLRU; /* overwrites any previous flag bits */
+
+	Lane& lane = lane_of(o);
+	lane.lock.lock();
+	switch (edge) {
+	case Edge::MRU:
+	  lane.q.push_front(*o); /* front of the queue is the MRU end */
+	  break;
+	case Edge::LRU:
+	  lane.q.push_back(*o); /* back of the queue is the LRU end */
+	  break;
+	default:
+	  ceph_abort();
+	  break;
+	}
+	if (flags & FLAG_INITIAL)
+	  o->lru_refcnt += 2; /* sentinel ref + initial */
+	else
+	  ++(o->lru_refcnt); /* sentinel */
+	lane.lock.unlock();
+	return o;
+      } /* insert */
+
+    };
+
+    template <typename T, typename TTree, typename CLT, typename CEQ,
+	      typename K, typename LK>
+    class TreeX /* intrusive tree split into n_part partitions, each with its own lock and pointer cache */
+    {
+    public:
+
+      static constexpr uint32_t FLAG_NONE = 0x0000;
+      static constexpr uint32_t FLAG_LOCK = 0x0001; /* take the partition lock */
+      static constexpr uint32_t FLAG_UNLOCK = 0x0002; /* release the partition lock before returning */
+      static constexpr uint32_t FLAG_UNLOCK_ON_MISS = 0x0004; /* not referenced inside this class */
+
+      typedef T value_type;
+      typedef TTree container_type;
+      typedef typename TTree::iterator iterator;
+      typedef std::pair<iterator, bool> check_result;
+      typedef typename TTree::insert_commit_data insert_commit_data;
+      int n_part; /* number of partitions */
+      int csz; /* cache slots per partition; 0 disables the cache */
+
+      typedef std::unique_lock<LK> unique_lock;
+
+      struct Partition {
+	LK lock;
+	TTree tr;
+	T** cache; /* csz slots of T*, allocated by the TreeX constructor */
+	int csz;
+	CACHE_PAD(0);
+
+	Partition() : tr(), cache(nullptr), csz(0) {}
+
+	~Partition() {
+	  if (csz) /* cache was only allocated when csz != 0 */
+	    ::operator delete(cache);
+	}
+      };
+
+      struct Latch { /* partition, lock and insert position handed from find_latch() to insert_latched() */
+	Partition* p;
+	LK* lock;
+	insert_commit_data commit_data{};
+
+	Latch() : p(nullptr), lock(nullptr) {}
+      };
+
+      Partition& partition_of_scalar(uint64_t x) { /* partition chosen by x modulo n_part */
+	return part[x % n_part];
+      }
+
+      Partition& get(uint8_t x) { /* partition by index; no bounds check, index type is uint8_t */
+	return part[x];
+      }
+
+      Partition*& get() { /* reference to the partition array pointer itself */
+	return part;
+      }
+
+      void lock() { /* lock every partition, in the order the locks were registered */
+	std::for_each(locks.begin(), locks.end(),
+		      [](LK* lk){ lk->lock(); });
+      }
+
+      void unlock() { /* unlock every partition, in the same order */
+	std::for_each(locks.begin(), locks.end(),
+		      [](LK* lk){ lk->unlock(); });
+      }
+
+      TreeX(int n_part=1, int csz=127) : n_part(n_part), csz(csz) {
+	ceph_assert(n_part > 0);
+	part = new Partition[n_part];
+	for (int ix = 0; ix < n_part; ++ix) {
+	  Partition& p = part[ix];
+	  if (csz) {
+	    p.csz = csz;
+	    p.cache = (T**) ::operator new(csz * sizeof(T*));
+	    // FIPS zeroization audit 20191115: this memset is not security related.
+	    memset(p.cache, 0, csz * sizeof(T*));
+	  }
+	  locks.push_back(&p.lock);
+	}
+      }
+
+      ~TreeX() {
+	delete[] part; /* each ~Partition() frees its own cache */
+      }
+
+      T* find(uint64_t hk, const K& k, uint32_t flags) { /* lookup by hash hk and key k; nullptr when absent */
+	T* v;
+	Latch lat;
+	uint32_t slot = 0;
+	lat.p = &(partition_of_scalar(hk));
+	if (flags & FLAG_LOCK) { /* with FLAG_LOCK the lock is held for this call only */
+	  lat.lock = &lat.p->lock;
+	  lat.lock->lock();
+	}
+	if (csz) { /* template specialize? */
+	  slot = hk % csz;
+	  v = lat.p->cache[slot];
+	  if (v) {
+	    if (CEQ()(*v, k)) { /* cache hit: slot holds an element equal to k */
+	      if (flags & FLAG_LOCK)
+		lat.lock->unlock();
+	      return v;
+	    }
+	    v = nullptr;
+	  }
+	} else {
+	  v = nullptr;
+	}
+	iterator it = lat.p->tr.find(k, CLT());
+	if (it != lat.p->tr.end()){
+	  v = &(*(it));
+	  if (csz) {
+	    /* fill cache slot at hk */
+	    lat.p->cache[slot] = v;
+	  }
+	}
+	if (flags & FLAG_LOCK)
+	  lat.lock->unlock();
+	return v;
+      } /* find */
+
+      T* find_latch(uint64_t hk, const K& k, Latch& lat,
+		    uint32_t flags) { /* like find(), but fills lat; FLAG_LOCK without FLAG_UNLOCK returns with the lock held */
+	uint32_t slot = 0;
+	T* v;
+	lat.p = &(partition_of_scalar(hk));
+	lat.lock = &lat.p->lock; /* recorded even when FLAG_LOCK is not set */
+	if (flags & FLAG_LOCK)
+	  lat.lock->lock();
+	if (csz) { /* template specialize? */
+	  slot = hk % csz;
+	  v = lat.p->cache[slot];
+	  if (v) {
+	    if (CEQ()(*v, k)) {
+	      if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK))
+		lat.lock->unlock();
+	      return v; /* cache hit: lat.commit_data is not filled on this path */
+	    }
+	    v = nullptr;
+	  }
+	} else {
+	  v = nullptr;
+	}
+	check_result r = lat.p->tr.insert_unique_check(
+	  k, CLT(), lat.commit_data);
+	if (! r.second /* not insertable, i.e. k is already in the tree */) {
+	  v = &(*(r.first));
+	  if (csz) {
+	    /* fill cache slot at hk */
+	    lat.p->cache[slot] = v;
+	  }
+	}
+	if ((flags & FLAG_LOCK) && (flags & FLAG_UNLOCK))
+	  lat.lock->unlock();
+	return v;
+      } /* find_latch */
+      bool is_same_partition(uint64_t lhs, uint64_t rhs) { /* true when both hashes select the same partition */
+        return ((lhs % n_part) == (rhs % n_part));
+      }
+      void insert_latched(T* v, Latch& lat, uint32_t flags) { /* commit the insert prepared in lat.commit_data */
+	(void) lat.p->tr.insert_unique_commit(*v, lat.commit_data);
+	if (flags & FLAG_UNLOCK)
+	  lat.lock->unlock();
+      } /* insert_latched */
+
+      void insert(uint64_t hk, T* v, uint32_t flags) { /* insert v into the partition of hk; result of insert_unique is ignored */
+	Partition& p = partition_of_scalar(hk);
+	if (flags & FLAG_LOCK)
+	  p.lock.lock();
+	p.tr.insert_unique(*v);
+	if (flags & FLAG_LOCK)
+	  p.lock.unlock();
+      } /* insert */
+
+      void remove(uint64_t hk, T* v, uint32_t flags) { /* unlink v and clear its cache slot if the slot points at v */
+	Partition& p = partition_of_scalar(hk);
+	iterator it = TTree::s_iterator_to(*v);
+	if (flags & FLAG_LOCK)
+	  p.lock.lock();
+	p.tr.erase(it);
+	if (csz) { /* template specialize? */
+	  uint32_t slot = hk % csz;
+	  T* v2 = p.cache[slot];
+	  /* we are intrusive, just compare addresses */
+	  if (v == v2)
+	    p.cache[slot] = nullptr;
+	}
+	if (flags & FLAG_LOCK)
+	  p.lock.unlock();
+      } /* remove */
+
+      void drain(std::function<void(T*)> uref,
+		 uint32_t flags = FLAG_NONE) {
+	/* clear a table, call supplied function on
+	 * each element found (e.g., returns sentinel
+	 * references) */
+	Object::Queue2 drain_q; /* holds the unlinked elements until uref runs */
+	for (int t_ix = 0; t_ix < n_part; ++t_ix) {
+	  Partition& p = part[t_ix];
+	  if (flags & FLAG_LOCK) /* LOCKED */
+	    p.lock.lock();
+	  while (p.tr.size() > 0) { /* NOTE(review): p.cache slots are not reset here -- confirm no find() follows a drain */
+	    iterator it = p.tr.begin();
+	    T* v = &(*it);
+	    p.tr.erase(it);
+	    drain_q.push_front(*v);
+	  }
+	  if (flags & FLAG_LOCK) /* we locked it, !LOCKED */
+	    p.lock.unlock();
+	} /* each partition */
+	/* unref out-of-line && !LOCKED */
+	while (drain_q.size() > 0) {
+	  Object::Queue2::iterator it = drain_q.begin();
+	  T* v = static_cast<T*>(&(*it));
+	  drain_q.erase(it); /* must precede uref(v) in safe_link mode */
+	  uref(v);
+	}
+      } /* drain */
+
+    private:
+      Partition *part; /* array of n_part partitions, owned by this object */
+      std::vector<LK*> locks; /* one entry per partition lock, in partition order */
+    };
+
+  } /* namespace LRU */
+} /* namespace cohort */
+
+#endif /* COHORT_LRU_H */
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
new file mode 100644
index 00000000..09280493
--- /dev/null
+++ b/src/common/common_init.cc
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010-2011 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/common_init.h"
+#include "common/admin_socket.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "common/dout.h"
+#include "common/strtol.h"
+#include "common/valgrind.h"
+#include "common/zipkin_trace.h"
+
+#define dout_subsys ceph_subsys_
+
+#define _STR(x) #x
+#define STRINGIFY(x) _STR(x)
+
+#ifndef WITH_SEASTAR
+CephContext *common_preinit(const CephInitParameters &iparams,
+			    enum code_environment_t code_env, int flags)
+{
+  // publish the code environment; the write is annotated as a benign race
+  ANNOTATE_BENIGN_RACE_SIZED(&g_code_env, sizeof(g_code_env), "g_code_env");
+  g_code_env = code_env;
+
+  // the new context carries the configuration that is adjusted below
+  CephContext *ctx = new CephContext(iparams.module_type, code_env, flags);
+
+  auto& cfg = ctx->_conf;
+  // add config observers here
+
+  // our entity name comes straight from the init parameters
+  cfg->name = iparams.name;
+
+  // mds and osd get their own default keyring location for backward
+  // compatibility.  moving forward, we want all keyrings in these
+  // locations.  the mon already forces $mon_data/keyring.
+  if (cfg->name.is_mds()) {
+    cfg.set_val_default("keyring", "$mds_data/keyring");
+  } else if (cfg->name.is_osd()) {
+    cfg.set_val_default("keyring", "$osd_data/keyring");
+  }
+
+  if (flags & CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS) {
+    // pid and cct id keep the path unique across same-named instances
+    cfg.set_val_default("admin_socket",
+			 "$run_dir/$cluster-$name.$pid.$cctid.asok");
+  }
+
+  const bool quiet_env = code_env == CODE_ENVIRONMENT_LIBRARY ||
+			 code_env == CODE_ENVIRONMENT_UTILITY_NODOUT;
+  if (quiet_env) {
+    cfg.set_val_default("log_to_stderr", "false");
+    cfg.set_val_default("err_to_stderr", "false");
+    cfg.set_val_default("log_flush_on_exit", "false");
+  }
+  return ctx;
+}
+#endif	// #ifndef WITH_SEASTAR
+
+void complain_about_parse_errors(CephContext *cct,
+				 std::deque<std::string> *parse_errors)
+{
+  // Log at most MAX_PARSE_ERRORS entries, then one line counting the rest.
+  if (parse_errors->empty())
+    return;
+  lderr(cct) << "Errors while parsing config file!" << dendl;
+  static const size_t MAX_PARSE_ERRORS = 20;
+  size_t shown = 0;
+  for (const std::string& msg : *parse_errors) {
+    // test the cap before printing so the suppressed count stays exact
+    if (shown == MAX_PARSE_ERRORS) {
+      lderr(cct) << "Suppressed " << (parse_errors->size() - shown)
+	   << " more errors." << dendl;
+      break;
+    }
+    lderr(cct) << msg << dendl;
+    ++shown;
+  }
+}
+
+#ifndef WITH_SEASTAR
+
+/* Please be sure that this can safely be called multiple times by the
+ * same application. */
+void common_init_finish(CephContext *cct)
+{
+  // a cct is finished at most once; later calls are no-ops
+  if (cct->_finished) {
+    return;
+  }
+  cct->_finished = true;
+  cct->init_crypto();
+  ZTracer::ztrace_init();
+
+  if (!cct->_log->is_started()) {
+    cct->_log->start();
+  }
+
+  const int init_flags = cct->get_init_flags();
+  if (!(init_flags & CINIT_FLAG_NO_DAEMON_ACTIONS))
+    cct->start_service_thread();
+
+  if ((init_flags & CINIT_FLAG_DEFER_DROP_PRIVILEGES) &&
+      (cct->get_set_uid() || cct->get_set_gid())) {
+    cct->get_admin_socket()->chown(cct->get_set_uid(), cct->get_set_gid());
+  }
+
+  const auto& conf = cct->_conf;
+  // nothing to chmod unless both the socket path and a mode are configured
+  if (conf->admin_socket.empty() || conf->admin_socket_mode.empty()) {
+    return;
+  }
+
+  std::string parse_err;
+  const int mode = strict_strtol(conf->admin_socket_mode.c_str(), 8, &parse_err);
+  if (!parse_err.empty()) {
+    lderr(cct) << "Invalid octal string: " << parse_err << dendl;
+    return;
+  }
+  if (mode & (~ACCESSPERMS)) {
+    lderr(cct) << "Invalid octal permissions string: "
+        << conf->admin_socket_mode << dendl;
+    return;
+  }
+  cct->get_admin_socket()->chmod(static_cast<mode_t>(mode));
+}
+
+#endif	// #ifndef WITH_SEASTAR
diff --git a/src/common/common_init.h b/src/common/common_init.h
new file mode 100644
index 00000000..ee315e76
--- /dev/null
+++ b/src/common/common_init.h
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_INIT_H
+#define CEPH_COMMON_INIT_H
+
+#include <deque>
+
+#include "common/code_environment.h"
+
+class CephContext;
+
+enum common_init_flags_t { // bit flags; tested with & against the int "flags" value
+  // Set up defaults that make sense for an unprivileged daemon
+  CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS = 0x1,
+
+  // By default, don't read a configuration file OR contact mons
+  CINIT_FLAG_NO_DEFAULT_CONFIG_FILE = 0x2,
+
+  // Don't close stderr (in daemonize)
+  CINIT_FLAG_NO_CLOSE_STDERR = 0x4,
+
+  // don't do anything daemonish, like create /var/run/ceph, or print a banner
+  CINIT_FLAG_NO_DAEMON_ACTIONS = 0x8,
+
+  // defer dropping privileges; common_init_finish() then chowns the admin socket when a uid/gid is set
+  CINIT_FLAG_DEFER_DROP_PRIVILEGES = 0x10,
+
+  // don't contact mons for config
+  CINIT_FLAG_NO_MON_CONFIG = 0x20,
+
+  // don't expose default cct perf counters
+  CINIT_FLAG_NO_CCT_PERF_COUNTERS = 0x40,
+};
+
+#ifndef WITH_SEASTAR
+class CephInitParameters;
+
+/*
+ * NOTE: If you are writing a Ceph daemon, ignore this function and call
+ * global_init instead. It will call common_preinit for you.
+ *
+ * common_preinit creates the CephContext.
+ *
+ * After this function gives you a CephContext, you need to set up the
+ * Ceph configuration, which lives inside the CephContext as md_config_t.
+ * The initial settings are not very useful because they do not reflect what
+ * the user asked for.
+ *
+ * This is usually done by something like this:
+ * cct->_conf.parse_env();
+ * cct->_conf.apply_changes();
+ *
+ * Your library may also supply functions to read a configuration file.
+ */
+CephContext *common_preinit(const CephInitParameters &iparams,
+			    enum code_environment_t code_env, int flags);
+#endif // #ifndef WITH_SEASTAR
+
+/* Print out some parse errors. */
+void complain_about_parse_errors(CephContext *cct,
+				 std::deque<std::string> *parse_errors);
+
+/* This function is called after you have done your last
+ * fork. When you make this call, the system will initialize everything that
+ * cannot be initialized before a fork.
+ *
+ * This includes things like starting threads, initializing libraries that
+ * can't handle forking, and so forth.
+ *
+ * If you are writing a Ceph library, you can call this pretty much any time.
+ * We do not allow our library users to fork and continue using the Ceph
+ * libraries. The most obvious reason for this is that the threads started by
+ * the Ceph libraries would be destroyed by a fork().
+ */
+void common_init_finish(CephContext *cct);
+
+#endif
diff --git a/src/common/compat.cc b/src/common/compat.cc
new file mode 100644
index 00000000..ceb8a011
--- /dev/null
+++ b/src/common/compat.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#if defined(__linux__) 
+#include <sys/vfs.h>
+#endif
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+#include "common/safe_io.h"
+
+// The type-value for a ZFS FS in fstatfs.
+#define FS_ZFS_TYPE 0xde
+
+// On FreeBSD, ZFS fallocate always fails since it is considered impossible to
+// reserve space on a COW filesystem; posix_fallocate() returns EINVAL there.
+// (On Linux, glibc already emulates the reservation in this case.)
+// So on ZFS the space is allocated manually, and still that is not a real
+// guarantee that a full buffer is allocated on disk, since it could be
+// compressed. To prevent this the written buffer would need random data.
+int manual_fallocate(int fd, off_t offset, off_t len) { // returns 0, or errno > 0
+  // Compare lseek()'s off_t result directly: narrowed to int, a valid
+  // offset such as 0xffffffff would be mistaken for the -1 error value.
+  if (lseek(fd, offset, SEEK_SET) == static_cast<off_t>(-1))
+    return errno;
+  char data[1024*128];
+  // TODO: compressing filesystems would require random data
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(data, 0x42, sizeof(data));
+  for (off_t off = 0; off < len; off += sizeof(data)) {
+    const off_t left = len - off;  // the final chunk may be short
+    const size_t chunk = left < static_cast<off_t>(sizeof(data)) ?
+      static_cast<size_t>(left) : sizeof(data);
+    // any negative result is a failure, not only -1 (errno presumably still holds the write error)
+    if (safe_write(fd, data, chunk) < 0)
+      return errno;
+  }
+  return 0;
+}
+
+int on_zfs(int basedir_fd) { // nonzero iff basedir_fd sits on a filesystem of type FS_ZFS_TYPE
+  struct statfs basefs;
+  // a failed fstatfs() leaves basefs unset: answer "not ZFS" instead of reading it
+  return (fstatfs(basedir_fd, &basefs) == 0 && basefs.f_type == FS_ZFS_TYPE);
+}
+
+int ceph_posix_fallocate(int fd, off_t offset, off_t len) {
+  // Returns 0 on success, otherwise an errno value > 0.
+
+#ifdef HAVE_POSIX_FALLOCATE
+  // on ZFS the space is written by hand instead of calling posix_fallocate()
+  return on_zfs(fd) ? manual_fallocate(fd, offset, len)
+                    : posix_fallocate(fd, offset, len);
+#elif defined(__APPLE__)
+  // no posix_fallocate(): preallocate through fcntl(F_PREALLOCATE)
+  fstore_t request;
+  request.fst_flags = F_ALLOCATECONTIG;
+  request.fst_posmode = F_PEOFPOSMODE;
+  request.fst_offset = offset;
+  request.fst_length = len;
+
+  int rc = fcntl(fd, F_PREALLOCATE, &request);
+  if (rc == -1) {
+    rc = errno;
+  }
+  return rc;
+#else
+  // neither primitive is available
+  return manual_fallocate(fd, offset, len);
+#endif
+}
+
+int pipe_cloexec(int pipefd[2])
+{
+  // Create a pipe whose two ends are close-on-exec; returns 0, or -1 with
+  // errno set.
+#if defined(HAVE_PIPE2)
+  return pipe2(pipefd, O_CLOEXEC);
+#else
+  if (pipe(pipefd) == -1)
+    return -1;
+
+  /*
+   * The old-fashioned, race-condition prone fallback used when pipe2
+   * does not exist: set the flag on each end after the fact.
+   */
+  const bool flagged =
+    fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) >= 0 &&
+    fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) >= 0;
+  if (flagged)
+    return 0;
+
+  // close both ends but report the fcntl() error, not one from close()
+  const int fcntl_errno = errno;
+  VOID_TEMP_FAILURE_RETRY(close(pipefd[0]));
+  VOID_TEMP_FAILURE_RETRY(close(pipefd[1]));
+  errno = fcntl_errno;
+  return -1;
+#endif
+}
+
+
+int socket_cloexec(int domain, int type, int protocol)
+{
+  // socket() with close-on-exec set; returns the fd, or -1 with errno set
+#ifdef SOCK_CLOEXEC
+  return socket(domain, type|SOCK_CLOEXEC, protocol);
+#else
+  const int sock = socket(domain, type, protocol);
+  if (sock == -1)
+    return -1;
+  if (fcntl(sock, F_SETFD, FD_CLOEXEC) >= 0)
+    return sock;
+
+  // drop the socket but keep the fcntl() errno for the caller
+  const int fcntl_errno = errno;
+  VOID_TEMP_FAILURE_RETRY(close(sock));
+  errno = fcntl_errno;
+  return -1;
+#endif
+}
+
+int socketpair_cloexec(int domain, int type, int protocol, int sv[2])
+{
+  // socketpair() with close-on-exec on both ends; 0, or -1 with errno set
+#ifdef SOCK_CLOEXEC
+  return socketpair(domain, type|SOCK_CLOEXEC, protocol, sv);
+#else
+  if (socketpair(domain, type, protocol, sv) == -1)
+    return -1;
+
+  const bool flagged =
+    fcntl(sv[0], F_SETFD, FD_CLOEXEC) >= 0 &&
+    fcntl(sv[1], F_SETFD, FD_CLOEXEC) >= 0;
+  if (flagged)
+    return 0;
+
+  // close both ends but keep the fcntl() errno for the caller
+  const int fcntl_errno = errno;
+  VOID_TEMP_FAILURE_RETRY(close(sv[0]));
+  VOID_TEMP_FAILURE_RETRY(close(sv[1]));
+  errno = fcntl_errno;
+  return -1;
+#endif
+}
+
+int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen)
+{
+  // accept() with close-on-exec set; returns the fd, or -1 with errno set
+#ifdef HAVE_ACCEPT4
+  return accept4(sockfd, addr, addrlen, SOCK_CLOEXEC);
+#else
+  const int conn = accept(sockfd, addr, addrlen);
+  if (conn == -1)
+    return -1;
+  if (fcntl(conn, F_SETFD, FD_CLOEXEC) >= 0)
+    return conn;
+
+  // drop the connection but keep the fcntl() errno for the caller
+  const int fcntl_errno = errno;
+  VOID_TEMP_FAILURE_RETRY(close(conn));
+  errno = fcntl_errno;
+  return -1;
+#endif
+}
+
+#if defined(__FreeBSD__)
+int sched_setaffinity(pid_t pid, size_t cpusetsize, // compiled only under __FreeBSD__ (see enclosing #if)
+                      cpu_set_t *mask)
+{
+  return 0; // stub: all three arguments are ignored and success is reported
+}
+#endif
+
diff --git a/src/common/compiler_extensions.h b/src/common/compiler_extensions.h
new file mode 100644
index 00000000..2fd8f5c2
--- /dev/null
+++ b/src/common/compiler_extensions.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMPILER_EXTENSIONS_H
+#define CEPH_COMPILER_EXTENSIONS_H
+
+/* We should be able to take advantage of nice nonstandard features of gcc
+ * and other compilers, but still maintain portability.
+ */
+
+#ifdef __GNUC__
+// GCC
+#define WARN_UNUSED_RESULT __attribute__((warn_unused_result))
+#else
+// some other compiler - just make it a no-op
+#define WARN_UNUSED_RESULT
+#endif
+
+#endif
diff --git a/src/common/condition_variable_debug.cc b/src/common/condition_variable_debug.cc
new file mode 100644
index 00000000..a2ddd769
--- /dev/null
+++ b/src/common/condition_variable_debug.cc
@@ -0,0 +1,77 @@
+#include "condition_variable_debug.h"
+#include "common/mutex_debug.h"
+
+namespace ceph {
+
+condition_variable_debug::condition_variable_debug()
+  : waiter_mutex{nullptr}
+{
+  // default attributes; a failed init is reported as std::system_error
+  if (const int rc = pthread_cond_init(&cond, nullptr); rc != 0) {
+    throw std::system_error(rc, std::generic_category());
+  }
+}
+
+condition_variable_debug::~condition_variable_debug()
+{
+  pthread_cond_destroy(&cond); // the result is not checked
+}
+
+void condition_variable_debug::wait(std::unique_lock<mutex_debug>& lock)
+{
+  // Block on the condition; throws std::system_error if pthread_cond_wait fails.
+  // make sure this cond is used with one mutex only
+  ceph_assert(waiter_mutex == nullptr || waiter_mutex == lock.mutex());
+  waiter_mutex = lock.mutex();
+  ceph_assert(waiter_mutex->is_locked());
+  waiter_mutex->_pre_unlock();
+  const int r = pthread_cond_wait(&cond, waiter_mutex->native_handle());
+  // restore the lock bookkeeping before reporting, as _wait_until() does
+  waiter_mutex->_post_lock();
+  if (r != 0)
+    throw std::system_error(r, std::generic_category());
+}
+
+void condition_variable_debug::notify_one()
+{
+  // once a waiter has registered its mutex, the signaler must hold it
+  ceph_assert(waiter_mutex == nullptr || waiter_mutex->is_locked());
+  const int rc = pthread_cond_signal(&cond);
+  if (rc != 0) {
+    throw std::system_error(rc, std::generic_category());
+  }
+}
+
+void condition_variable_debug::notify_all(bool sloppy)
+{
+  // the signaler must hold the waiter's mutex; `sloppy` swallows a failure
+  ceph_assert(waiter_mutex == nullptr || waiter_mutex->is_locked());
+  const int rc = pthread_cond_broadcast(&cond);
+  if (rc != 0 && !sloppy) {
+    throw std::system_error(rc, std::generic_category());
+  }
+}
+
+std::cv_status condition_variable_debug::_wait_until(mutex_debug* mutex,
+                                                     timespec* ts)
+{
+  // Timed wait on the condition up to *ts; throws std::system_error on
+  // any pthread_cond_timedwait() result other than success or timeout.
+  // make sure this cond is used with one mutex only
+  ceph_assert(waiter_mutex == nullptr || waiter_mutex == mutex);
+  waiter_mutex = mutex;
+  ceph_assert(waiter_mutex->is_locked());
+
+  waiter_mutex->_pre_unlock();
+  const int rc = pthread_cond_timedwait(&cond, waiter_mutex->native_handle(), ts);
+  waiter_mutex->_post_lock();
+  if (rc == 0) {
+    return std::cv_status::no_timeout;
+  }
+  if (rc == ETIMEDOUT) {
+    return std::cv_status::timeout;
+  }
+  throw std::system_error(rc, std::generic_category());
+}
+
+} // namespace ceph
diff --git a/src/common/condition_variable_debug.h b/src/common/condition_variable_debug.h
new file mode 100644
index 00000000..3241502f
--- /dev/null
+++ b/src/common/condition_variable_debug.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <condition_variable>
+#include <ctime>
+#include <pthread.h>
+#include "common/ceph_time.h"
+
+namespace ceph {
+
+namespace mutex_debug_detail {
+  template<bool> class mutex_debug_impl;
+}
+
+class condition_variable_debug { // pthread condition variable that checks it is used with a single mutex_debug
+  using mutex_debug = mutex_debug_detail::mutex_debug_impl<false>;
+
+  pthread_cond_t cond;
+  mutex_debug* waiter_mutex; // mutex of the first waiter; nullptr until someone waits
+
+  condition_variable_debug&
+  operator=(const condition_variable_debug&) = delete;
+  condition_variable_debug(const condition_variable_debug&) = delete;
+
+public:
+  condition_variable_debug();
+  ~condition_variable_debug();
+  void wait(std::unique_lock<mutex_debug>& lock);
+  template<class Predicate>
+  void wait(std::unique_lock<mutex_debug>& lock, Predicate pred) { // returns once pred() is true
+    while (!pred()) {
+      wait(lock);
+    }
+  }
+  template<class Clock, class Duration>
+  std::cv_status wait_until(
+    std::unique_lock<mutex_debug>& lock,
+    const std::chrono::time_point<Clock, Duration>& when) {
+    timespec ts = Clock::to_timespec(when); // conversion is a static of the clock, not a member of time_point
+    return _wait_until(lock.mutex(), &ts);
+  }
+  template<class Rep, class Period>
+  std::cv_status wait_for(
+    std::unique_lock<mutex_debug>& lock,
+    const std::chrono::duration<Rep, Period>& awhile) {
+    ceph::real_time when{ceph::real_clock::now()}; // deadline = now + awhile on ceph::real_clock
+    when += awhile;
+    timespec ts = ceph::real_clock::to_timespec(when);
+    return _wait_until(lock.mutex(), &ts);
+  }
+  void notify_one();
+  void notify_all(bool sloppy = false); // sloppy: do not throw if the broadcast fails
+private:
+  std::cv_status _wait_until(mutex_debug* mutex, timespec* ts);
+};
+
+} // namespace ceph
diff --git a/src/common/config.cc b/src/common/config.cc
new file mode 100644
index 00000000..863525ef
--- /dev/null
+++ b/src/common/config.cc
@@ -0,0 +1,1564 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <boost/type_traits.hpp>
+
+#include "common/ceph_argparse.h"
+#include "common/common_init.h"
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "include/str_list.h"
+#include "include/stringify.h"
+#include "osd/osd_types.h"
+#include "common/errno.h"
+#include "common/hostname.h"
+#include "common/dout.h"
+
+/* Don't use standard Ceph logging in this file.
+ * We can't use logging until it's initialized, and a lot of the necessary
+ * initialization happens here.
+ */
+#undef dout
+#undef pdout
+#undef derr
+#undef generic_dout
+
+// see set_mon_vals()
+#define dout_subsys ceph_subsys_monc
+
+using std::map;
+using std::list;
+using std::ostringstream;
+using std::pair;
+using std::string;
+
+static const char *CEPH_CONF_FILE_DEFAULT = "$data_dir/config, /etc/ceph/$cluster.conf, $home/.ceph/$cluster.conf, $cluster.conf"
+#if defined(__FreeBSD__)
+    ", /usr/local/etc/ceph/$cluster.conf"
+#endif
+    ;
+
+#define _STR(x) #x
+#define STRINGIFY(x) _STR(x)
+
+const char *ceph_conf_level_name(int level)
+{
+  // map a CONF_* source level to its short display name
+  if (level == CONF_DEFAULT) return "default";    // built-in default
+  if (level == CONF_MON) return "mon";            // monitor config database
+  if (level == CONF_ENV) return "env";            // process environment (CEPH_ARGS)
+  if (level == CONF_FILE) return "file";          // ceph.conf file
+  if (level == CONF_CMDLINE) return "cmdline";    // process command line args
+  if (level == CONF_OVERRIDE) return "override";  // injectargs or 'config set' at runtime
+  if (level == CONF_FINAL) return "final";
+  // anything else is not a known level
+  return "???";
+}
+
+int ceph_resolve_file_search(const std::string& filename_list,
+			     std::string& result)
+{
+  // Store in `result` the first listed file that can be opened for reading
+  // and return 0; otherwise return -errno of the last failed open (or
+  // -ENOENT when the list holds no candidates).
+  list<string> candidates;
+  get_str_list(filename_list, candidates);
+
+  int last_err = -ENOENT;
+  for (const string& path : candidates) {
+    const int fd = ::open(path.c_str(), O_RDONLY|O_CLOEXEC);
+    if (fd >= 0) {
+      close(fd);
+      result = path;
+      return 0;
+    }
+    last_err = -errno;
+  }
+  return last_err;
+}
+
+static int conf_stringify(const Option::value_t& v, string *out)
+{
+  // an unset (boost::blank) value has no text form: -ENOENT, *out untouched
+  if (boost::get<boost::blank>(&v) != nullptr)
+    return -ENOENT;
+  *out = Option::to_str(v);
+  return 0;
+}
+
+md_config_t::md_config_t(ConfigValues& values, // builds the option schema and loads validated defaults into values
+			 const ConfigTracker& tracker,
+			 bool is_daemon)
+  : is_daemon(is_daemon)
+{
+  // Load the compile-time list of Option into
+  // a map so that we can resolve keys quickly.
+  for (const auto &i : ceph_options) {
+    if (schema.count(i.name)) {
+      // We may be instantiated pre-logging so send this to stderr
+      std::cerr << "Duplicate config key in schema: '" << i.name << "'"
+                << std::endl;
+      ceph_abort();
+    }
+    schema.emplace(std::piecewise_construct,
+		   std::forward_as_tuple(i.name),
+		   std::forward_as_tuple(i));
+  }
+
+  // Define the debug_* options as well.
+  subsys_options.reserve(values.subsys.get_num());
+  for (unsigned i = 0; i < values.subsys.get_num(); ++i) {
+    string name = string("debug_") + values.subsys.get_name(i);
+    subsys_options.push_back(
+      Option(name, Option::TYPE_STR, Option::LEVEL_ADVANCED));
+    Option& opt = subsys_options.back();
+    opt.set_default(stringify(values.subsys.get_log_level(i)) + "/" +
+		    stringify(values.subsys.get_gather_level(i)));
+    string desc = string("Debug level for ") + values.subsys.get_name(i);
+    opt.set_description(desc.c_str());
+    opt.set_flag(Option::FLAG_RUNTIME);
+    opt.set_long_description("The value takes the form 'N' or 'N/M' where N and M are values between 0 and 99.  N is the debug level to log (all values below this are included), and M is the level to gather and buffer in memory.  In the event of a crash, the most recent items <= M are dumped to the log file.");
+    opt.set_subsys(i);
+    opt.set_validator([](std::string *value, std::string *error_message) {
+	int m, n; // m: first number, n: second number of "first/second"
+	int r = sscanf(value->c_str(), "%d/%d", &m, &n); // r: how many integers were parsed
+	if (r >= 1) {
+	  if (m < 0 || m > 99) {
+	    *error_message = "value must be in range [0, 99]";
+	    return -ERANGE;
+	  }
+	  if (r == 2) {
+	    if (n < 0 || n > 99) {
+	      *error_message = "value must be in range [0, 99]";
+	      return -ERANGE;
+	    }
+	  } else {
+	    // only one number given: rewrite the value as "m/m"
+	    n = m;
+	    *value = stringify(m) + "/" + stringify(n);
+	  }
+	} else {
+	  *error_message = "value must take the form N or N/M, where N and M are integers";
+	  return -EINVAL;
+	}
+	return 0;
+      });
+  }
+  for (auto& opt : subsys_options) { // register the debug_* options under their names
+    schema.emplace(opt.name, opt);
+  }
+
+  // Populate list of legacy_values according to the OPTION() definitions
+  // Note that this is just setting up our map of name->member ptr.  The
+  // default values etc will get loaded in along with new-style data,
+  // as all loads write to both the values map, and the legacy
+  // members if present.
+  legacy_values = {
+#define OPTION(name, type) \
+    {std::string(STRINGIFY(name)), &ConfigValues::name},
+#define SAFE_OPTION(name, type) OPTION(name, type)
+#include "common/legacy_config_opts.h"
+#undef OPTION
+#undef SAFE_OPTION
+  };
+
+  validate_schema(); // aborts on dangling see_also keys or legacy fields missing from the schema
+
+  // Validate default values from the schema
+  for (const auto &i : schema) {
+    const Option &opt = i.second;
+    if (opt.type == Option::TYPE_STR) { // only string-typed options are checked here
+      bool has_daemon_default = !boost::get<boost::blank>(&opt.daemon_value);
+      Option::value_t default_val;
+      if (is_daemon && has_daemon_default) {
+	default_val = opt.daemon_value;
+      } else {
+	default_val = opt.value;
+      }
+      // We call pre_validate as a sanity check, but also to get any
+      // side effect (value modification) from the validator.
+      std::string *def_str = boost::get<std::string>(&default_val); // NOTE(review): null unless the default holds a std::string -- confirm for every TYPE_STR option
+      std::string val = *def_str;
+      std::string err;
+      if (opt.pre_validate(&val, &err) != 0) {
+        std::cerr << "Default value " << opt.name << "=" << *def_str << " is "
+                     "invalid: " << err << std::endl;
+
+        // This is the compiled-in default that is failing its own option's
+        // validation, so this is super-invalid and should never make it
+        // past a pull request: crash out.
+        ceph_abort();
+      }
+      if (val != *def_str) {
+	// if the validator normalizes the string into a different form than
+	// what was compiled in, use that.
+	set_val_default(values, tracker, opt.name, val);
+      }
+    }
+  }
+
+  // Copy out values (defaults) into any legacy (C struct member) fields
+  update_legacy_vals(values);
+}
+
+md_config_t::~md_config_t()
+{ // empty body: nothing is released explicitly here
+}
+
+/**
+ * Sanity check schema.  Assert out on failures, to ensure any bad changes
+ * cannot possibly pass any testing and make it into a release.
+ */
+void md_config_t::validate_schema()
+{
+  // every see_also entry must name an option that exists in the schema
+  for (const auto &entry : schema) {
+    const auto &option = entry.second;
+    for (const auto &ref : option.see_also) {
+      if (schema.count(ref) != 0)
+        continue;
+      std::cerr << "Non-existent see-also key '" << ref
+                << "' on option '" << option.name << "'" << std::endl;
+      ceph_abort();
+    }
+  }
+  // every legacy (struct member) field must be backed by a schema option
+  for (const auto &legacy : legacy_values) {
+    if (schema.count(legacy.first) != 0)
+      continue;
+    std::cerr << "Schema is missing legacy field '" << legacy.first << "'" << std::endl;
+    ceph_abort();
+  }
+}
+
+// Look `name` up in the schema.  Returns a pointer to the schema's Option,
+// or nullptr when the schema has no entry under that name.
+const Option *md_config_t::find_option(const string& name) const
+{
+  const auto found = schema.find(name);
+  return (found == schema.end()) ? nullptr : &found->second;
+}
+
+// Store `val` as the CONF_DEFAULT-level value of option `name`.
+// Asserts that the option exists and that _set_val() accepted the value.
+void md_config_t::set_val_default(ConfigValues& values,
+				  const ConfigTracker& tracker,
+				  const string& name, const std::string& val)
+{
+  const Option *opt = find_option(name);
+  ceph_assert(opt);
+
+  string parse_error;
+  const int rc = _set_val(values, tracker, val, *opt, CONF_DEFAULT,
+			  &parse_error);
+  ceph_assert(rc >= 0);
+}
+
+/**
+ * Apply a complete set of key/value pairs at CONF_MON level.
+ *
+ * Each pair is first offered to config_cb (when one is set); pairs the
+ * callback does not consume are applied with _set_val().  Pairs that are not
+ * applied because the option carries FLAG_NO_MON_UPDATE, or because
+ * _set_val() failed, are recorded in ignored_mon_values.  Afterwards, every
+ * option that still holds a CONF_MON value but is absent from kv has that
+ * value removed.
+ *
+ * @param cct        context used for logging
+ * @param values     configuration values to modify
+ * @param tracker    passed through to _set_val()
+ * @param kv         the full set of option name -> value pairs to apply
+ * @param config_cb  optional callback; returning true consumes the pair
+ * @return always 0
+ */
+int md_config_t::set_mon_vals(CephContext *cct,
+    ConfigValues& values,
+    const ConfigTracker& tracker,
+    const map<string,string>& kv,
+    config_callback config_cb)
+{
+  // start from a clean slate; the list is rebuilt by the loop below
+  ignored_mon_values.clear();
+
+  if (!config_cb) {
+    ldout(cct, 4) << __func__ << " no callback set" << dendl;
+  }
+
+  for (auto& i : kv) {
+    if (config_cb) {
+      if (config_cb(i.first, i.second)) {
+	ldout(cct, 4) << __func__ << " callback consumed " << i.first << dendl;
+	continue;
+      }
+      ldout(cct, 4) << __func__ << " callback ignored " << i.first << dendl;
+    }
+    const Option *o = find_option(i.first);
+    if (!o) {
+      // unknown names are logged and skipped; they are not recorded as ignored
+      ldout(cct,10) << __func__ << " " << i.first << " = " << i.second
+		    << " (unrecognized option)" << dendl;
+      continue;
+    }
+    if (o->has_flag(Option::FLAG_NO_MON_UPDATE)) {
+      ignored_mon_values.emplace(i);
+      continue;
+    }
+    std::string err;
+    int r = _set_val(values, tracker, i.second, *o, CONF_MON, &err);
+    if (r < 0) {
+      ldout(cct, 4) << __func__ << " failed to set " << i.first << " = "
+		    << i.second << ": " << err << dendl;
+      ignored_mon_values.emplace(i);
+    } else if (r == ConfigValues::SET_NO_CHANGE ||
+	       r == ConfigValues::SET_NO_EFFECT) {
+      ldout(cct,20) << __func__ << " " << i.first << " = " << i.second
+		    << " (no change)" << dendl;
+    } else if (r == ConfigValues::SET_HAVE_EFFECT) {
+      ldout(cct,10) << __func__ << " " << i.first << " = " << i.second << dendl;
+    } else {
+      // any other non-negative result is unexpected
+      ceph_abort();
+    }
+  }
+  // Drop CONF_MON values for options that are no longer present in kv.
+  // NOTE(review): name and configs are taken by value, so the lambda works on
+  // copies while rm_val() modifies `values` -- presumably deliberate; confirm
+  // before changing them to references.
+  values.for_each([&] (auto name, auto configs) {
+    auto config = configs.find(CONF_MON);
+    if (config == configs.end()) {
+      return;
+    }
+    if (kv.find(name) != kv.end()) {
+      return;
+    }
+    ldout(cct,10) << __func__ << " " << name
+		  << " cleared (was " << Option::to_str(config->second) << ")"
+		  << dendl;
+    values.rm_val(name, CONF_MON);
+    // if this is a debug option, it needs to propagate to the subsys;
+    // this isn't covered by update_legacy_vals() below.  similarly,
+    // we want to trigger a config notification for these items.
+    // NOTE(review): o is dereferenced without a null check; this assumes
+    // every name stored in `values` exists in the schema -- confirm.
+    const Option *o = find_option(name);
+    _refresh(values, *o);
+  });
+  // empty the cached encoding; get_config_bl() rebuilds it when it is empty
+  values_bl.clear();
+  update_legacy_vals(values);
+  return 0;
+}
+
+/**
+ * Find and parse a configuration file, then apply its settings at CONF_FILE
+ * level.
+ *
+ * The candidate list comes from conf_files_str; when that is null it comes
+ * from the CEPH_CONF environment variable, and failing that from
+ * CEPH_CONF_FILE_DEFAULT (unless CINIT_FLAG_NO_DEFAULT_CONFIG_FILE is set in
+ * flags, in which case 0 is returned without reading anything).  Candidates
+ * are tried in order and the first one that parses successfully is used.
+ *
+ * @param values          configuration values to modify
+ * @param tracker         passed through to _set_val()
+ * @param conf_files_str  list of candidate paths, or nullptr
+ * @param warnings        optional stream for parse warnings; may be nullptr
+ * @param flags           CINIT_FLAG_* bits
+ * @return 0 on success; -ENOSYS once safe_to_start_threads is set; -ENOENT
+ *         when every candidate was missing; otherwise the error returned by
+ *         parsing a candidate
+ */
+int md_config_t::parse_config_files(ConfigValues& values,
+				    const ConfigTracker& tracker,
+				    const char *conf_files_str,
+				    std::ostream *warnings,
+				    int flags)
+{
+
+  // refuse to (re)parse files once threads may be running
+  if (safe_to_start_threads)
+    return -ENOSYS;
+
+  if (!values.cluster.size() && !conf_files_str) {
+    /*
+     * set the cluster name to 'ceph' when neither cluster name nor
+     * configuration file are specified.
+     */
+    values.cluster = "ceph";
+  }
+
+  if (!conf_files_str) {
+    const char *c = getenv("CEPH_CONF");
+    if (c) {
+      conf_files_str = c;
+    }
+    else {
+      if (flags & CINIT_FLAG_NO_DEFAULT_CONFIG_FILE)
+	return 0;
+      conf_files_str = CEPH_CONF_FILE_DEFAULT;
+    }
+  }
+
+  // split the candidate list and expand metavariables in each entry
+  std::list<std::string> conf_files;
+  get_str_list(conf_files_str, conf_files);
+  auto p = conf_files.begin();
+  while (p != conf_files.end()) {
+    string &s = *p;
+    if (s.find("$data_dir") != string::npos &&
+	data_dir_option.empty()) {
+      // useless $data_dir item, skip
+      p = conf_files.erase(p);
+    } else {
+      early_expand_meta(values, s, warnings);
+      ++p;
+    }
+  }
+
+  // open new conf
+  // (c is declared outside the loop because it is inspected afterwards to
+  // learn which candidate, if any, was parsed)
+  list<string>::const_iterator c;
+  for (c = conf_files.begin(); c != conf_files.end(); ++c) {
+    cf.clear();
+    string fn = *c;
+
+    int ret = cf.parse_file(fn.c_str(), &parse_errors, warnings);
+    if (ret == 0)
+      break;
+    else if (ret != -ENOENT)
+      return ret;
+  }
+  // it must have been all ENOENTs, that's the only way we got here
+  if (c == conf_files.end())
+    return -ENOENT;
+
+  if (values.cluster.size() == 0) {
+    /*
+     * If cluster name is not set yet, use the prefix of the
+     * basename of configuration file as cluster name.
+     */
+    // when the path has no '/', rfind() yields npos and the +1 wraps to 0,
+    // i.e. the basename starts at the beginning of the string
+    auto start = c->rfind('/') + 1;
+    auto end = c->find(".conf", start);
+    if (end == c->npos) {
+        /*
+         * If the configuration file does not follow $cluster.conf
+         * convention, we do the last try and assign the cluster to
+         * 'ceph'.
+         */
+        values.cluster = "ceph";
+    } else {
+      values.cluster = c->substr(start, end - start);
+    }
+  }
+
+  // For every option in the schema, look for a value in the sections that
+  // apply to this entity and set it at CONF_FILE level.  Failures are only
+  // reported through `warnings`; they do not abort the parse.
+  std::vector <std::string> my_sections;
+  _get_my_sections(values, my_sections);
+  for (const auto &i : schema) {
+    const auto &opt = i.second;
+    std::string val;
+    int ret = _get_val_from_conf_file(my_sections, opt.name, val);
+    if (ret == 0) {
+      std::string error_message;
+      int r = _set_val(values, tracker, val, opt, CONF_FILE, &error_message);
+      if (warnings != nullptr && (r < 0 || !error_message.empty())) {
+        *warnings << "parse error setting '" << opt.name << "' to '" << val
+                  << "'";
+        if (!error_message.empty()) {
+          *warnings << " (" << error_message << ")";
+        }
+        *warnings << std::endl;
+      }
+    }
+  }
+
+  // Warn about section names that look like old-style section names
+  // (a name starting with "mds", "mon" or "osd" whose fourth character is
+  // not a period)
+  std::deque < std::string > old_style_section_names;
+  for (ConfFile::const_section_iter_t s = cf.sections_begin();
+       s != cf.sections_end(); ++s) {
+    const string &str(s->first);
+    if (((str.find("mds") == 0) || (str.find("mon") == 0) ||
+	 (str.find("osd") == 0)) && (str.size() > 3) && (str[3] != '.')) {
+      old_style_section_names.push_back(str);
+    }
+  }
+  if (!old_style_section_names.empty()) {
+    // NOTE(review): oss is never used, and the message below is written to
+    // cerr without a trailing newline.
+    ostringstream oss;
+    cerr << "ERROR! old-style section name(s) found: ";
+    string sep;
+    for (std::deque < std::string >::const_iterator os = old_style_section_names.begin();
+	 os != old_style_section_names.end(); ++os) {
+      cerr << sep << *os;
+      sep = ", ";
+    }
+    cerr << ". Please use the new style section names that include a period.";
+  }
+
+  update_legacy_vals(values);
+
+  return 0;
+}
+
+/**
+ * Apply configuration taken from the process environment at CONF_ENV level.
+ *
+ * Handles, in order: CEPH_KEYRING, CEPH_LIB, the pod memory variables
+ * (POD_MEMORY_LIMIT / POD_MEMORY_REQUEST, OSD entities only) and finally the
+ * argument string held in the variable named by args_var.  Does nothing once
+ * safe_to_start_threads is set.
+ *
+ * @param entity_type  CEPH_ENTITY_TYPE_* of this process
+ * @param values       configuration values to modify
+ * @param tracker      passed through to the setters
+ * @param args_var     name of the environment variable holding extra
+ *                     arguments; "CEPH_ARGS" is used when null
+ */
+void md_config_t::parse_env(unsigned entity_type,
+			    ConfigValues& values,
+			    const ConfigTracker& tracker,
+			    const char *args_var)
+{
+  if (safe_to_start_threads)
+    return;
+  if (!args_var) {
+    args_var = "CEPH_ARGS";
+  }
+  if (auto s = getenv("CEPH_KEYRING"); s) {
+    string err;
+    // NOTE(review): the result of find_option() is dereferenced unchecked and
+    // the result of _set_val() is ignored.
+    _set_val(values, tracker, s, *find_option("keyring"), CONF_ENV, &err);
+  }
+  if (auto dir = getenv("CEPH_LIB"); dir) {
+    // CEPH_LIB supplies the value for all three directory options
+    for (auto name : { "erasure_code_dir", "plugin_dir", "osd_class_dir" }) {
+    std::string err;
+      const Option *o = find_option(name);
+      ceph_assert(o);
+      _set_val(values, tracker, dir, *o, CONF_ENV, &err);
+    }
+  }
+
+  // Apply pod memory limits:
+  //
+  // There are two types of resource requests: `limits` and `requests`.
+  //
+  // - Requests: Used by the K8s scheduler to determine on which nodes to
+  //   schedule the pods. This helps spread the pods to different nodes. This
+  //   value should be conservative in order to make sure all the pods are
+  //   schedulable. This corresponds to POD_MEMORY_REQUEST (set by the Rook
+  //   CRD) and is the target memory utilization we try to maintain for daemons
+  //   that respect it.
+  //
+  //   If POD_MEMORY_REQUEST is present, we use it as the target.
+  //
+  // - Limits: At runtime, the container runtime (and Linux) will use the
+  //   limits to see if the pod is using too many resources. In that case, the
+  //   pod will be killed/restarted automatically if the pod goes over the limit.
+  //   This should be higher than what is specified for requests (potentially
+  //   much higher). This corresponds to the cgroup memory limit that will
+  //   trigger the Linux OOM killer.
+  //
+  //   If POD_MEMORY_LIMIT is present, we use it as the /default/ value for
+  //   the target, which means it will only apply if the *_memory_target option
+  //   isn't set via some other path (e.g., POD_MEMORY_REQUEST, or the cluster
+  //   config, or whatever.)
+  //
+  // Here are the documented best practices:
+  //   https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#motivation-for-cpu-requests-and-limits
+  //
+  // When the operator creates the CephCluster CR, it will need to generate the
+  // desired requests and limits. As long as we are conservative in our choice
+  // for requests and generous with the limits we should be in a good place to
+  // get started.
+  //
+  // The support in Rook is already there for applying the limits as seen in
+  // these links.
+  //
+  // Rook docs on the resource requests and limits:
+  //   https://rook.io/docs/rook/v1.0/ceph-cluster-crd.html#cluster-wide-resources-configuration-settings
+  // Example CR settings:
+  //   https://github.com/rook/rook/blob/6d2ef936698593036185aabcb00d1d74f9c7bfc1/cluster/examples/kubernetes/ceph/cluster.yaml#L90
+  //
+  uint64_t pod_limit = 0, pod_request = 0;
+  if (auto pod_lim = getenv("POD_MEMORY_LIMIT"); pod_lim) {
+    // NOTE(review): err is unused in this block.
+    string err;
+    // atoll() yields 0 for unparsable input, which is treated as "not set"
+    uint64_t v = atoll(pod_lim);
+    if (v) {
+      switch (entity_type) {
+      case CEPH_ENTITY_TYPE_OSD:
+        {
+	  double cgroup_ratio = get_val<double>(
+	    values, "osd_memory_target_cgroup_limit_ratio");
+	  if (cgroup_ratio > 0.0) {
+	    pod_limit = v * cgroup_ratio;
+	    // set osd_memory_target *default* based on cgroup limit, so that
+	    // it can be overridden by any explicit settings elsewhere.
+	    set_val_default(values, tracker,
+			    "osd_memory_target", stringify(pod_limit));
+	  }
+	}
+      }
+    }
+  }
+  if (auto pod_req = getenv("POD_MEMORY_REQUEST"); pod_req) {
+    if (uint64_t v = atoll(pod_req); v) {
+      pod_request = v;
+    }
+  }
+  if (pod_request && pod_limit) {
+    // If both LIMIT and REQUEST are set, ensure that we use the
+    // min of request and limit*ratio.  This is important
+    // because k8s will set LIMIT == REQUEST if only LIMIT is
+    // specified, and we want to apply the ratio in that case,
+    // even though REQUEST is present.
+    pod_request = std::min<uint64_t>(pod_request, pod_limit);
+  }
+  if (pod_request) {
+    string err;
+    switch (entity_type) {
+    case CEPH_ENTITY_TYPE_OSD:
+      _set_val(values, tracker, stringify(pod_request),
+	       *find_option("osd_memory_target"),
+	       CONF_ENV, &err);
+      break;
+    }
+  }
+
+  // finally, treat the contents of the args variable like command-line
+  // arguments, applied at CONF_ENV level
+  if (getenv(args_var)) {
+    vector<const char *> env_args;
+    env_to_vec(env_args, args_var);
+    parse_argv(values, tracker, env_args, CONF_ENV);
+  }
+}
+
+// Write the configuration to `out` as text (no Formatter output).
+void md_config_t::show_config(const ConfigValues& values,
+			      std::ostream& out) const
+{
+  std::ostream *text_sink = &out;
+  _show_config(values, text_sink, nullptr);
+}
+
+// Dump the configuration through Formatter `f` (no text stream output).
+void md_config_t::show_config(const ConfigValues& values,
+			      Formatter *f) const
+{
+  Formatter *structured_sink = f;
+  _show_config(values, nullptr, structured_sink);
+}
+
+// Dump every Option in the schema as an "option" object inside an
+// "options" array section of `f`.
+void md_config_t::config_options(Formatter *f) const
+{
+  f->open_array_section("options");
+  for (auto it = schema.begin(); it != schema.end(); ++it) {
+    f->dump_object("option", it->second);
+  }
+  f->close_section();
+}
+
+// Emit the entity name, the cluster name and then the effective value of
+// every option in the schema.  Output goes to `out` as "key = value" lines
+// and/or to `f` as string fields; either sink may be null.
+void md_config_t::_show_config(const ConfigValues& values,
+			       std::ostream *out, Formatter *f) const
+{
+  const bool to_stream = (out != nullptr);
+  const bool to_formatter = (f != nullptr);
+
+  if (to_stream) {
+    *out << "name = " << values.name << std::endl;
+    *out << "cluster = " << values.cluster << std::endl;
+  }
+  if (to_formatter) {
+    f->dump_string("name", stringify(values.name));
+    f->dump_string("cluster", values.cluster);
+  }
+
+  for (auto it = schema.begin(); it != schema.end(); ++it) {
+    const Option &option = it->second;
+    string rendered;
+    conf_stringify(_get_val(values, option), &rendered);
+    if (to_stream) {
+      *out << option.name << " = " << rendered << std::endl;
+    }
+    if (to_formatter) {
+      f->dump_string(option.name.c_str(), rendered);
+    }
+  }
+}
+
+/**
+ * Consume configuration-related arguments from `args`.
+ *
+ * A fixed set of flags and short aliases is recognized first; anything else
+ * is handed to parse_option(), which matches it against the schema.
+ * Processing stops at a literal "--", which is left in place.
+ *
+ * @param values   configuration values to modify
+ * @param tracker  passed through to the setters
+ * @param args     argument vector, walked (and modified) by the
+ *                 ceph_argparse_* helpers
+ * @param level    config level passed to parse_option() for schema options
+ * @return 0 on success, -ENOSYS once safe_to_start_threads is set, or the
+ *         negative error returned by parse_option()
+ */
+int md_config_t::parse_argv(ConfigValues& values,
+			    const ConfigTracker& tracker,
+			    std::vector<const char*>& args, int level)
+{
+  if (safe_to_start_threads) {
+    return -ENOSYS;
+  }
+
+  // In this function, don't change any parts of the configuration directly.
+  // Instead, use set_val to set them. This will allow us to send the proper
+  // observer notifications later.
+  std::string val;
+  // Note: the loop header has no increment; moving past an argument is left
+  // to the ceph_argparse_* helpers and to parse_option().
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (strcmp(*i, "--") == 0) {
+      /* Normally we would use ceph_argparse_double_dash. However, in this
+       * function we *don't* want to remove the double dash, because later
+       * argument parses will still need to see it. */
+      break;
+    }
+    else if (ceph_argparse_flag(args, i, "--show_conf", (char*)NULL)) {
+      // print the parsed config file and terminate the process immediately
+      cerr << cf << std::endl;
+      _exit(0);
+    }
+    else if (ceph_argparse_flag(args, i, "--show_config", (char*)NULL)) {
+      // only recorded here; acted upon in do_argv_commands()
+      do_show_config = true;
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--show_config_value", (char*)NULL)) {
+      // only recorded here; acted upon in do_argv_commands()
+      do_show_config_value = val;
+    }
+    else if (ceph_argparse_flag(args, i, "--no-mon-config", (char*)NULL)) {
+      values.no_mon_config = true;
+    }
+    else if (ceph_argparse_flag(args, i, "--log-early", (char*)NULL)) {
+      values.log_early = true;
+    }
+    else if (ceph_argparse_flag(args, i, "--mon-config", (char*)NULL)) {
+      values.no_mon_config = false;
+    }
+    else if (ceph_argparse_flag(args, i, "--foreground", "-f", (char*)NULL)) {
+      set_val_or_die(values, tracker, "daemonize", "false");
+    }
+    else if (ceph_argparse_flag(args, i, "-d", (char*)NULL)) {
+      // -d: stay in the foreground and send logging to stderr instead of a
+      // log file or syslog
+      set_val_or_die(values, tracker, "fuse_debug", "true");
+      set_val_or_die(values, tracker, "daemonize", "false");
+      set_val_or_die(values, tracker, "log_file", "");
+      set_val_or_die(values, tracker, "log_to_stderr", "true");
+      set_val_or_die(values, tracker, "err_to_stderr", "true");
+      set_val_or_die(values, tracker, "log_to_syslog", "false");
+    }
+    // Some stuff that we wanted to give universal single-character options for
+    // Careful: you can burn through the alphabet pretty quickly by adding
+    // to this list.
+    else if (ceph_argparse_witharg(args, i, &val, "--monmap", "-M", (char*)NULL)) {
+      set_val_or_die(values, tracker, "monmap", val.c_str());
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--mon_host", "-m", (char*)NULL)) {
+      set_val_or_die(values, tracker, "mon_host", val.c_str());
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--bind", (char*)NULL)) {
+      set_val_or_die(values, tracker, "public_addr", val.c_str());
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--keyfile", "-K", (char*)NULL)) {
+      // read the key from the named file, or from stdin when the name is "-"
+      bufferlist bl;
+      string err;
+      int r;
+      if (val == "-") {
+	r = bl.read_fd(STDIN_FILENO, 1024);
+      } else {
+	r = bl.read_file(val.c_str(), &err);
+      }
+      // NOTE(review): a read failure is silently ignored here (err is never
+      // reported and "key" is simply left unset).
+      if (r >= 0) {
+	string k(bl.c_str(), bl.length());
+	set_val_or_die(values, tracker, "key", k.c_str());
+      }
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--keyring", "-k", (char*)NULL)) {
+      set_val_or_die(values, tracker, "keyring", val.c_str());
+    }
+    else if (ceph_argparse_witharg(args, i, &val, "--client_mountpoint", "-r", (char*)NULL)) {
+      set_val_or_die(values, tracker, "client_mountpoint", val.c_str());
+    }
+    else {
+      // not one of the special cases: try to match it against the schema
+      int r = parse_option(values, tracker, args, i, NULL, level);
+      if (r < 0) {
+        return r;
+      }
+    }
+  }
+  // meta expands could have modified anything.  Copy it all out again.
+  update_legacy_vals(values);
+  return 0;
+}
+
+// Carry out the "show" requests recorded by parse_argv().  When either
+// request is pending the process terminates via _exit(); otherwise this
+// returns without doing anything.
+void md_config_t::do_argv_commands(const ConfigValues& values) const
+{
+  if (do_show_config) {
+    _show_config(values, &cout, NULL);
+    _exit(0);
+  }
+
+  if (do_show_config_value.empty()) {
+    return;
+  }
+
+  string rendered;
+  const int rc = conf_stringify(
+    _get_val(values, do_show_config_value, 0, &cerr), &rendered);
+  if (rc < 0) {
+    std::cerr << "failed to get config option '"
+	      << do_show_config_value << "': ";
+    if (rc == -ENOENT) {
+      std::cerr << "option not found";
+    } else {
+      std::cerr << cpp_strerror(rc);
+    }
+    std::cerr << std::endl;
+    _exit(1);
+  }
+  std::cout << rendered << std::endl;
+  _exit(0);
+}
+
+/**
+ * Try to interpret the argument at `i` as one of the schema options.
+ *
+ * For each option the following spellings are tried:
+ *   --default-<name> <value>   sets the value at CONF_DEFAULT level
+ *   --<name> / --no-<name>     boolean options, set at `level`
+ *   --<name> <value>           all other options, set at `level`
+ *
+ * When no option matches, the argument is skipped by advancing `i`.
+ *
+ * @param values   configuration values to modify
+ * @param tracker  passed through to _set_val()
+ * @param args     argument vector, passed to the ceph_argparse_* helpers
+ * @param i        current position in args
+ * @param oss      optional stream for error text; std::cerr is used when null
+ * @param level    config level for regular (non --default-) settings
+ * @return 0 on success or when nothing matched, otherwise a negative error
+ */
+int md_config_t::parse_option(ConfigValues& values,
+			      const ConfigTracker& tracker,
+			      std::vector<const char*>& args,
+			      std::vector<const char*>::iterator& i,
+			      ostream *oss,
+			      int level)
+{
+  int ret = 0;
+  // o counts the options that were examined without a match; if it reaches
+  // schema.size(), the argument matched nothing
+  size_t o = 0;
+  std::string val;
+
+  std::string option_name;
+  std::string error_message;
+  o = 0;
+  for (const auto& opt_iter: schema) {
+    const Option &opt = opt_iter.second;
+    ostringstream err;
+    std::string as_option("--");
+    as_option += opt.name;
+    // remembered for the error report below
+    option_name = opt.name;
+    if (ceph_argparse_witharg(
+	  args, i, &val, err,
+	  string(string("--default-") + opt.name).c_str(), (char*)NULL)) {
+      if (!err.str().empty()) {
+        error_message = err.str();
+	ret = -EINVAL;
+	break;
+      }
+      ret = _set_val(values, tracker,  val, opt, CONF_DEFAULT, &error_message);
+      break;
+    } else if (opt.type == Option::TYPE_BOOL) {
+      int res;
+      if (ceph_argparse_binary_flag(args, i, &res, oss, as_option.c_str(),
+				    (char*)NULL)) {
+	if (res == 0)
+	  ret = _set_val(values, tracker, "false", opt, level, &error_message);
+	else if (res == 1)
+	  ret = _set_val(values, tracker, "true", opt, level, &error_message);
+	else
+	  ret = res;
+	break;
+      } else {
+	std::string no("--no-");
+	no += opt.name;
+	if (ceph_argparse_flag(args, i, no.c_str(), (char*)NULL)) {
+	  ret = _set_val(values, tracker, "false", opt, level, &error_message);
+	  break;
+	}
+      }
+    } else if (ceph_argparse_witharg(args, i, &val, err,
+                                     as_option.c_str(), (char*)NULL)) {
+      if (!err.str().empty()) {
+        error_message = err.str();
+	ret = -EINVAL;
+	break;
+      }
+      ret = _set_val(values, tracker,  val, opt, level, &error_message);
+      break;
+    }
+    ++o;
+  }
+
+  // report a failure either to the caller's stream or to stderr
+  if (ret < 0 || !error_message.empty()) {
+    ceph_assert(!option_name.empty());
+    if (oss) {
+      *oss << "Parse error setting " << option_name << " to '"
+           << val << "' using injectargs";
+      if (!error_message.empty()) {
+        *oss << " (" << error_message << ")";
+      }
+      *oss << ".\n";
+    } else {
+      cerr << "parse error setting '" << option_name << "' to '"
+	   << val << "'";
+      if (!error_message.empty()) {
+        cerr << " (" << error_message << ")";
+      }
+      cerr << "\n" << std::endl;
+    }
+  }
+
+  if (o == schema.size()) {
+    // ignore
+    // (nothing matched: step over the argument and leave it in args)
+    ++i;
+  }
+  // positive results from _set_val() are folded into 0
+  return ret >= 0 ? 0 : ret;
+}
+
+// Run parse_option() over the whole argument vector at CONF_OVERRIDE level.
+// Parsing continues past failures; the most recent negative result (or 0 if
+// there was none) is returned.
+int md_config_t::parse_injectargs(ConfigValues& values,
+				  const ConfigTracker& tracker,
+				  std::vector<const char*>& args,
+				  std::ostream *oss)
+{
+  int result = 0;
+  auto cursor = args.begin();
+  // parse_option() is responsible for moving the cursor forward
+  while (cursor != args.end()) {
+    const int rc = parse_option(values, tracker, args, cursor, oss,
+				CONF_OVERRIDE);
+    if (rc < 0) {
+      result = rc;
+    }
+  }
+  return result;
+}
+
+// Raise the safe_to_start_threads flag.  While it is set, parse_argv(),
+// parse_env() and parse_config_files() refuse to run.
+void md_config_t::set_safe_to_start_threads()
+{
+  this->safe_to_start_threads = true;
+}
+
+// Lower the safe_to_start_threads flag again.
+void md_config_t::_clear_safe_to_start_threads()
+{
+  this->safe_to_start_threads = false;
+}
+
+/**
+ * Parse a space-separated argument string and apply it at CONF_OVERRIDE
+ * level.
+ *
+ * The string is split on spaces and each token is handed to
+ * parse_injectargs().  Tokens still left in the vector afterwards are
+ * reported as unparsable and turn the result into -EINVAL.
+ *
+ * @param values   configuration values to modify
+ * @param tracker  passed through to the setters
+ * @param s        argument string, e.g. "--foo 1 --bar=2"
+ * @param oss      optional stream for diagnostics; may be nullptr
+ * @return 0 on success, otherwise a negative error code
+ */
+int md_config_t::injectargs(ConfigValues& values,
+			    const ConfigTracker& tracker,
+			    const std::string& s, std::ostream *oss)
+{
+  // Mutable, NUL-terminated copy of the input that can be split in place.
+  // A std::vector is used rather than a variable-length array: VLAs are a
+  // compiler extension in C++ and would place a caller-sized buffer on the
+  // stack.
+  std::vector<char> buf(s.begin(), s.end());
+  buf.push_back('\0');
+
+  std::vector<const char*> nargs;
+  char *p = buf.data();
+  while (*p) {
+    nargs.push_back(p);
+    // advance to the end of the current token
+    while (*p && *p != ' ') p++;
+    if (!*p)
+      break;
+    // terminate the token, then skip any run of separating spaces
+    *p++ = 0;
+    while (*p && *p == ' ') p++;
+  }
+
+  int ret = parse_injectargs(values, tracker, nargs, oss);
+  if (!nargs.empty()) {
+    // parse_option() accepts a null stream, so it must be tolerated here too
+    if (oss) {
+      *oss << " failed to parse arguments: ";
+      std::string prefix;
+      for (const char *arg : nargs) {
+	*oss << prefix << arg;
+	prefix = ",";
+      }
+      *oss << "\n";
+    }
+    ret = -EINVAL;
+  }
+  update_legacy_vals(values);
+  return ret;
+}
+
+// Set `key` to `val` through set_val() and assert that it succeeded.  On
+// failure the message collected by set_val() is written to stderr before
+// the assertion fires.
+void md_config_t::set_val_or_die(ConfigValues& values,
+				 const ConfigTracker& tracker,
+				 const std::string &key,
+				 const std::string &val)
+{
+  std::stringstream message;
+  const int rc = set_val(values, tracker, key, val, &message);
+  if (rc != 0) {
+    std::cerr << "set_val_or_die(" << key << "): " << message.str();
+  }
+  ceph_assert(rc == 0);
+}
+
+/**
+ * Set option `key` to `val` at CONF_OVERRIDE level.
+ *
+ * The key is normalized before it is looked up in the schema.
+ *
+ * @param err_ss  optional stream; receives either a confirmation
+ *                ("Set <name> to <value>") or a description of the failure
+ * @return 0 on success, -EINVAL for an empty key or null value, -ENOENT for
+ *         an unknown option, otherwise the negative result of _set_val()
+ */
+int md_config_t::set_val(ConfigValues& values,
+			 const ConfigTracker& tracker,
+			 const std::string &key, const char *val,
+			 std::stringstream *err_ss)
+{
+  if (key.empty()) {
+    if (err_ss) *err_ss << "No key specified";
+    return -EINVAL;
+  }
+  if (val == nullptr) {
+    return -EINVAL;
+  }
+
+  std::string raw_value(val);
+  string normalized(ConfFile::normalize_key_name(key));
+
+  const auto found = schema.find(normalized);
+  if (found == schema.end()) {
+    if (err_ss) *err_ss << "Configuration option not found: '" << key << "'";
+    return -ENOENT;
+  }
+
+  const Option &opt = found->second;
+  std::string error_message;
+  const int rc = _set_val(values, tracker, raw_value, opt, CONF_OVERRIDE,
+			  &error_message);
+  if (rc < 0) {
+    if (err_ss) *err_ss << error_message;
+    return rc;
+  }
+  // any non-negative result counts as success and is reported as 0
+  if (err_ss) *err_ss << "Set " << opt.name << " to " << raw_value;
+  return 0;
+}
+
+// Remove the CONF_OVERRIDE-level value of `key`; returns what _rm_val()
+// returns.
+int md_config_t::rm_val(ConfigValues& values, const std::string& key)
+{
+  const int rc = _rm_val(values, key, CONF_OVERRIDE);
+  return rc;
+}
+
+// Copy the encoded default values into *bl.  The encoding (an entry count
+// followed by name/value string pairs for every schema option) is built on
+// first use and kept in defaults_bl for later calls.
+void md_config_t::get_defaults_bl(const ConfigValues& values,
+					 bufferlist *bl)
+{
+  if (defaults_bl.length() == 0) {
+    uint32_t count = 0;
+    bufferlist entries;
+    for (auto it = schema.begin(); it != schema.end(); ++it) {
+      const Option &opt = it->second;
+      ++count;
+      encode(opt.name, entries);
+      auto [value, found] = values.get_value(opt.name, CONF_DEFAULT);
+      if (!found) {
+	// no CONF_DEFAULT entry stored: use the schema default
+	string fallback;
+	conf_stringify(_get_val_default(opt), &fallback);
+	encode(fallback, entries);
+      } else {
+	encode(Option::to_str(value), entries);
+      }
+    }
+    // the count goes first, followed by the entries gathered above
+    encode(count, defaults_bl);
+    defaults_bl.claim_append(entries);
+  }
+  *bl = defaults_bl;
+}
+
+/**
+ * Hand out the encoded set of current config values.
+ *
+ * The encoding is rebuilt (and values_bl_version bumped) whenever the cached
+ * values_bl is empty.  *bl and *got_version are only written when
+ * have_version differs from the current version.
+ *
+ * The "fsid" and "host" entries are never included.
+ */
+void md_config_t::get_config_bl(
+  const ConfigValues& values,
+  uint64_t have_version,
+  bufferlist *bl,
+  uint64_t *got_version)
+{
+  if (values_bl.length() == 0) {
+    const auto is_excluded = [](const auto& key) {
+      return key == "fsid" || key == "host";
+    };
+
+    uint32_t count = 0;
+    bufferlist payload;
+    values.for_each([&](auto& name, auto& configs) {
+      if (is_excluded(name)) {
+	return;
+      }
+      ++count;
+      encode(name, payload);
+      encode((uint32_t)configs.size(), payload);
+      for (auto& level_and_value : configs) {
+	encode(level_and_value.first, payload);
+	encode(Option::to_str(level_and_value.second), payload);
+      }
+    });
+    // make sure overridden items appear, and include the default value
+    for (auto& ignored : ignored_mon_values) {
+      if (values.contains(ignored.first) || is_excluded(ignored.first)) {
+	continue;
+      }
+      const Option *opt = find_option(ignored.first);
+      if (opt == nullptr) {
+	continue;
+      }
+      ++count;
+      encode(ignored.first, payload);
+      encode((uint32_t)1, payload);
+      encode((int32_t)CONF_DEFAULT, payload);
+      string default_str;
+      conf_stringify(_get_val_default(*opt), &default_str);
+      encode(default_str, payload);
+    }
+    encode(count, values_bl);
+    values_bl.claim_append(payload);
+    encode(ignored_mon_values, values_bl);
+    ++values_bl_version;
+  }
+
+  if (have_version == values_bl_version) {
+    return;
+  }
+  *bl = values_bl;
+  *got_version = values_bl_version;
+}
+
+// C-string flavour of get_val(): normalizes the key and defers to
+// _get_val_cstr() for the buffer handling.
+int md_config_t::get_val(const ConfigValues& values,
+			 const std::string &key, char **buf, int len) const
+{
+  const string normalized(ConfFile::normalize_key_name(key));
+  return _get_val_cstr(values, normalized, buf, len);
+}
+
+// std::string flavour of get_val(): fetches the generic value and lets
+// conf_stringify() render it into *val, returning its result.
+int md_config_t::get_val(
+  const ConfigValues& values,
+  const std::string &key,
+  std::string *val) const
+{
+  const Option::value_t generic = get_val_generic(values, key);
+  return conf_stringify(generic, val);
+}
+
+// Return the value of `key` as an Option::value_t, after normalizing the
+// key name.
+Option::value_t md_config_t::get_val_generic(
+  const ConfigValues& values,
+  const std::string &key) const
+{
+  const string normalized(ConfFile::normalize_key_name(key));
+  return _get_val(values, normalized);
+}
+
+/**
+ * Resolve an option by name and return its effective value with
+ * metavariables expanded.
+ *
+ * @param values  configuration values to read from
+ * @param key     option name; it is normalized before the lookup
+ * @param stack   expansion stack used for loop detection, passed through to
+ *                the Option overload of _get_val()
+ * @param err     optional stream for expansion diagnostics
+ * @return the value, or boost::blank when the key is empty or names no
+ *         option in the schema
+ */
+Option::value_t md_config_t::_get_val(
+  const ConfigValues& values,
+  const std::string &key,
+  expand_stack_t *stack,
+  std::ostream *err) const
+{
+  if (key.empty()) {
+    return Option::value_t(boost::blank());
+  }
+
+  // In key names, leading and trailing whitespace are not significant.
+  const string k(ConfFile::normalize_key_name(key));
+
+  // Look up the normalized name.  Searching for the raw key here would make
+  // the normalization above pointless and miss any caller that passes an
+  // un-normalized name.
+  const Option *o = find_option(k);
+  if (!o) {
+    // not a valid config option
+    return Option::value_t(boost::blank());
+  }
+
+  return _get_val(values, *o, stack, err);
+}
+
+// Return the effective value of option `o` with metavariables expanded.
+// When the caller supplies no expansion stack, a fresh local one is used.
+Option::value_t md_config_t::_get_val(
+  const ConfigValues& values,
+  const Option& o,
+  expand_stack_t *stack,
+  std::ostream *err) const
+{
+  expand_stack_t local_stack;
+  expand_stack_t *active_stack = stack ? stack : &local_stack;
+  return _expand_meta(values, _get_val_nometa(values, o), &o, active_stack,
+		      err);
+}
+
+// Return the stored value of `o` without metavariable expansion, falling
+// back to the option's default when `values` holds nothing for it.
+Option::value_t md_config_t::_get_val_nometa(const ConfigValues& values,
+					     const Option& o) const
+{
+  auto [value, found] = values.get_value(o.name, -1);
+  if (!found) {
+    return _get_val_default(o);
+  }
+  return value;
+}
+
+// Pick the default for `o`: the daemon-specific default when this is a
+// daemon and one is defined (i.e. daemon_value is not blank), otherwise the
+// regular default.
+const Option::value_t& md_config_t::_get_val_default(const Option& o) const
+{
+  const bool daemon_default_defined =
+    (boost::get<boost::blank>(&o.daemon_value) == nullptr);
+  return (is_daemon && daemon_default_defined) ? o.daemon_value : o.value;
+}
+
+// Expand metavariables inside the free-standing string `val` (one that is
+// not tied to any Option) and write the result back into `val`.
+void md_config_t::early_expand_meta(
+  const ConfigValues& values,
+  std::string &val,
+  std::ostream *err) const
+{
+  expand_stack_t expansion_stack;
+  const Option::value_t unexpanded(val);
+  const Option::value_t expanded =
+    _expand_meta(values, unexpanded, nullptr, &expansion_stack, err);
+  conf_stringify(expanded, &val);
+}
+
+// Refresh every option queued in may_reexpand_meta, since such options may
+// already have been expanded using stale metavariable values.  The queue is
+// taken over (left empty) first; the return value tells whether refreshing
+// queued new entries.
+bool md_config_t::finalize_reexpand_meta(ConfigValues& values,
+					 const ConfigTracker& tracker)
+{
+  std::vector<std::string> pending;
+  pending.swap(may_reexpand_meta);
+
+  for (const auto& option_name : pending) {
+    const auto found = schema.find(option_name);
+    ceph_assert(found != schema.end());
+    _refresh(values, found->second);
+  }
+
+  return !may_reexpand_meta.empty();
+}
+
+/**
+ * Expand "$var" / "${var}" references inside a string value.
+ *
+ * Non-string values, and strings without a '$', are returned unchanged.
+ * A variable is either one of the built-in metavariables (type, cluster,
+ * name, host, num, id, pid, cctid, home) or the name of another config
+ * option, whose own value is expanded recursively.  Unknown names are
+ * copied through verbatim.
+ *
+ * @param values  configuration values to read from
+ * @param in      value to expand
+ * @param o       option the value belongs to, or nullptr for a free-standing
+ *                string; used for loop detection and $pid bookkeeping
+ * @param stack   options currently being expanded; when null, no expansion
+ *                is performed
+ * @param err     optional stream that receives a report when an expansion
+ *                loop is detected
+ * @return the expanded value
+ */
+Option::value_t md_config_t::_expand_meta(
+  const ConfigValues& values,
+  const Option::value_t& in,
+  const Option *o,
+  expand_stack_t *stack,
+  std::ostream *err) const
+{
+  //cout << __func__ << " in '" << in << "' stack " << stack << std::endl;
+  if (!stack) {
+    return in;
+  }
+  const std::string *str = boost::get<const std::string>(&in);
+  if (!str) {
+    // strings only!
+    return in;
+  }
+
+  auto pos = str->find('$');
+  if (pos == std::string::npos) {
+    // no substitutions!
+    return in;
+  }
+
+  // remember that this option is being expanded, so that a reference back
+  // to it from a nested expansion can be recognized as a loop
+  if (o) {
+    stack->push_back(make_pair(o, &in));
+  }
+  string out;
+  // last_pos is the index of the first input character not yet copied to out
+  decltype(pos) last_pos = 0;
+  while (pos != std::string::npos) {
+    ceph_assert((*str)[pos] == '$');
+    // copy the literal text between the previous variable and this '$'
+    if (pos > last_pos) {
+      out += str->substr(last_pos, pos - last_pos);
+    }
+
+    // try to parse the variable name into var, either \$\{(.+)\} or
+    // \$[a-z\_]+
+    const char *valid_chars = "abcdefghijklmnopqrstuvwxyz_";
+    string var;
+    size_t endpos = 0;
+    if ((*str)[pos+1] == '{') {
+      // ...${foo_bar}...
+      endpos = str->find_first_not_of(valid_chars, pos + 2);
+      if (endpos != std::string::npos &&
+	  (*str)[endpos] == '}') {
+	var = str->substr(pos + 2, endpos - pos - 2);
+	endpos++;
+      }
+      // NOTE(review): when the closing '}' is missing, var stays empty but
+      // endpos has already moved past the name, so only '$' is emitted below
+      // and the "{name" text appears to be dropped -- confirm intended.
+    } else {
+      // ...$foo...
+      endpos = str->find_first_not_of(valid_chars, pos + 1);
+      if (endpos != std::string::npos)
+	var = str->substr(pos + 1, endpos - pos - 1);
+      else
+	var = str->substr(pos + 1);
+    }
+    last_pos = endpos;
+
+    if (!var.size()) {
+      // a '$' that is not followed by a variable name is kept literally
+      out += '$';
+    } else {
+      //cout << " found var " << var << std::endl;
+      // special metavariable?
+      if (var == "type") {
+	out += values.name.get_type_name();
+      } else if (var == "cluster") {
+	out += values.cluster;
+      } else if (var == "name") {
+	out += values.name.to_cstr();
+      } else if (var == "host") {
+	if (values.host == "") {
+	  out += ceph_get_short_hostname();
+	} else {
+	  out += values.host;
+	}
+      } else if (var == "num") {
+	out += values.name.get_id().c_str();
+      } else if (var == "id") {
+	out += values.name.get_id();
+      } else if (var == "pid") {
+	out += stringify(getpid());
+        // queue the option for finalize_reexpand_meta(), which refreshes it
+        if (o) {
+          may_reexpand_meta.push_back(o->name);
+        }
+      } else if (var == "cctid") {
+	out += stringify((unsigned long long)this);
+      } else if (var == "home") {
+	const char *home = getenv("HOME");
+	// NOTE(review): this assigns to out instead of appending, so any text
+	// accumulated before "$home" is discarded -- confirm intended.
+	out = home ? std::string(home) : std::string();
+      } else {
+	if (var == "data_dir") {
+	  var = data_dir_option;
+	}
+	// (this local o shadows the parameter of the same name)
+	const Option *o = find_option(var);
+	if (!o) {
+	  // not an option either: keep the reference text as it was written
+	  out += str->substr(pos, endpos - pos);
+	} else {
+	  auto match = std::find_if(
+	    stack->begin(), stack->end(),
+	    [o](pair<const Option *,const Option::value_t*>& item) {
+	      return item.first == o;
+	    });
+	  if (match != stack->end()) {
+	    // substitution loop; break the cycle
+	    if (err) {
+	      *err << "variable expansion loop at " << var << "="
+		   << Option::to_str(*match->second) << "\n"
+		   << "expansion stack:\n";
+	      for (auto i = stack->rbegin(); i != stack->rend(); ++i) {
+		*err << i->first->name << "="
+		     << Option::to_str(*i->second) << "\n";
+	      }
+	    }
+	    // NOTE(review): this early return skips the pop_back() at the end
+	    // of the function, leaving the entry pushed above on the stack --
+	    // confirm whether callers rely on that.
+	    return Option::value_t(std::string("$") + o->name);
+	  } else {
+	    // recursively evaluate!
+	    string n;
+	    conf_stringify(_get_val(values, *o, stack, err), &n);
+	    out += n;
+	  }
+	}
+      }
+    }
+    pos = str->find('$', last_pos);
+  }
+  // copy whatever literal text follows the last variable
+  if (last_pos != std::string::npos) {
+    out += str->substr(last_pos);
+  }
+  if (o) {
+    stack->pop_back();
+  }
+
+  return Option::value_t(out);
+}
+
+/**
+ * Fetch the value of `key` as a C string.
+ *
+ * @param values  configuration values to read from
+ * @param key     option name
+ * @param buf     in/out: when len is -1, *buf receives a malloc()ed copy that
+ *                the caller must free(); otherwise *buf must point to a
+ *                buffer of at least len bytes
+ * @param len     size of *buf in bytes, or -1 to have the buffer allocated
+ * @return 0 on success; -EINVAL for an empty key or a negative len other
+ *         than -1; -ENOMEM if the allocation failed; -ENAMETOOLONG if the
+ *         value was truncated to fit; -ENOENT if the value could not be
+ *         obtained
+ */
+int md_config_t::_get_val_cstr(
+  const ConfigValues& values,
+  const std::string &key, char **buf, int len) const
+{
+  if (key.empty())
+    return -EINVAL;
+
+  string val;
+  if (conf_stringify(_get_val(values, key), &val) == 0) {
+    // bytes needed, including the terminating NUL
+    int l = val.length() + 1;
+    if (len == -1) {
+      *buf = (char*)malloc(l);
+      if (!*buf)
+        return -ENOMEM;
+      strncpy(*buf, val.c_str(), l);
+      return 0;
+    }
+    // Any other negative length would reach snprintf() converted to a huge
+    // size_t and let it write past the caller's buffer.
+    if (len < 0)
+      return -EINVAL;
+    // snprintf() truncates as needed and always NUL-terminates when len > 0
+    snprintf(*buf, len, "%s", val.c_str());
+    return (l > len) ? -ENAMETOOLONG : 0;
+  }
+
+  // couldn't find a configuration option with this key
+  return -ENOENT;
+}
+
+// Replace the contents of *keys with the names of all schema options.
+// Each boolean option is immediately followed by its "no_"-prefixed form.
+void md_config_t::get_all_keys(std::vector<std::string> *keys) const {
+  const std::string negation("no_");
+
+  keys->clear();
+  keys->reserve(schema.size());
+  for (auto it = schema.begin(); it != schema.end(); ++it) {
+    const Option &option = it->second;
+    keys->push_back(option.name);
+    if (option.type != Option::TYPE_BOOL) {
+      continue;
+    }
+    keys->push_back(negation + option.name);
+  }
+}
+
+/* Append the config-file sections that apply to this entity to `sections`.
+ *
+ * Order matters: the vector is searched front to back, so the first entry
+ * has the highest priority and a hit there ends the search.  The last entry
+ * is consulted only when none of the others had the key; it should always
+ * be the global section.
+ */
+void md_config_t::get_my_sections(const ConfigValues& values,
+				  std::vector <std::string> &sections) const
+{
+  std::vector <std::string> &destination = sections;
+  _get_my_sections(values, destination);
+}
+
+/// Append, in this order: the entity's full name, the entity's type name,
+/// and the literal "global" section.
+void md_config_t::_get_my_sections(const ConfigValues& values,
+				   std::vector <std::string> &sections) const
+{
+  const auto &entity = values.name;
+  sections.emplace_back(entity.to_str());
+  sections.emplace_back(entity.get_type_name());
+  sections.emplace_back("global");
+}
+
+// Append the name of every section of the parsed config file to `sections`.
+// Always returns 0.
+int md_config_t::get_all_sections(std::vector <std::string> &sections) const
+{
+  auto sec = cf.sections_begin();
+  const auto sec_end = cf.sections_end();
+  while (sec != sec_end) {
+    sections.push_back(sec->first);
+    ++sec;
+  }
+  return 0;
+}
+
+/// Look up @p key in the given @p sections of the parsed config file and
+/// store the raw text in @p out.  When @p emeta is true the value is also
+/// run through metavariable expansion and re-stringified into @p out.
+/// @return 0 on success, or the negative error from the lookup.
+int md_config_t::get_val_from_conf_file(
+  const ConfigValues& values,
+  const std::vector <std::string> &sections,
+  const std::string &key,
+  std::string &out,
+  bool emeta) const
+{
+  if (const int r = _get_val_from_conf_file(sections, key, out); r < 0) {
+    return r;
+  }
+  if (!emeta) {
+    return 0;
+  }
+
+  expand_stack_t stack;
+  const auto expanded =
+    _expand_meta(values, Option::value_t(out), nullptr, &stack, nullptr);
+  conf_stringify(expanded, &out);
+  return 0;
+}
+
+/// Try each of @p sections in order and return the result of the first
+/// read that is anything other than -ENOENT (0 on a hit, or a hard error).
+/// @return -ENOENT when no section has @p key.
+int md_config_t::_get_val_from_conf_file(
+  const std::vector <std::string> &sections,
+  const std::string &key,
+  std::string &out) const
+{
+  for (const auto &section : sections) {
+    const int ret = cf.read(section.c_str(), key, out);
+    if (ret == -ENOENT) {
+      continue;  // not in this section; fall through to the next one
+    }
+    return ret;
+  }
+  return -ENOENT;
+}
+
+/// Parse @p raw_val for @p opt and store it in @p values at @p level.
+///
+/// A change to an option that cannot be updated at runtime is refused with
+/// -ENOSYS once safe_to_start_threads is set, unless an observer tracks the
+/// option or the new value equals the current one.
+///
+/// @param error_message must be non-null; receives the failure description
+/// @return a negative errno on failure, otherwise the result code returned
+///         by ConfigValues::set_value()
+int md_config_t::_set_val(
+  ConfigValues& values,
+  const ConfigTracker& observers,
+  const std::string &raw_val,
+  const Option &opt,
+  int level,
+  std::string *error_message)
+{
+  ceph_assert(error_message);
+
+  Option::value_t parsed;
+  if (const int r = opt.parse_value(raw_val, &parsed, error_message); r < 0) {
+    return r;
+  }
+
+  // unsafe runtime change?
+  const bool frozen = !opt.can_update_at_runtime() &&
+                      safe_to_start_threads &&
+                      !observers.is_tracking(opt.name);
+  // still accept the value if it is not actually a change
+  if (frozen && parsed != _get_val_nometa(values, opt)) {
+    *error_message = string("Configuration option '") + opt.name +
+      "' may not be modified at runtime";
+    return -ENOSYS;
+  }
+
+  // Apply the value to its entry in the `values` map
+  const auto outcome = values.set_value(opt.name, std::move(parsed), level);
+  if (outcome == ConfigValues::SET_NO_EFFECT ||
+      outcome == ConfigValues::SET_HAVE_EFFECT) {
+    // the cached encoding no longer matches the stored values
+    values_bl.clear();
+  }
+  if (outcome == ConfigValues::SET_HAVE_EFFECT) {
+    _refresh(values, opt);
+  }
+  return outcome;
+}
+
+/// Propagate a changed option: update its legacy member (if any), then
+/// either push the new log level (options with a subsystem) or record the
+/// option name in values.changed.
+void md_config_t::_refresh(ConfigValues& values, const Option& opt)
+{
+  // Apply the value to its legacy field, if it has one
+  if (auto legacy = legacy_values.find(std::string(opt.name));
+      legacy != legacy_values.end()) {
+    update_legacy_val(values, opt, legacy->second);
+  }
+
+  if (opt.subsys < 0) {
+    // normal option, advertise the change.
+    values.changed.insert(opt.name);
+    return;
+  }
+
+  // this was a debug_* option update
+  string level_str;
+  conf_stringify(_get_val(values, opt), &level_str);
+  values.set_logging(opt.subsys, level_str.c_str());
+}
+
+/// Remove the value stored for @p key at @p level.
+/// @return -EINVAL if @p key is not in the schema, a negative error from
+///         ConfigValues::rm_val(), or 0 on success.
+int md_config_t::_rm_val(ConfigValues& values,
+			 const std::string& key,
+			 int level)
+{
+  if (schema.find(key) == schema.end()) {
+    return -EINVAL;
+  }
+
+  const auto removed = values.rm_val(key, level);
+  if (removed < 0) {
+    return removed;
+  }
+
+  const bool effective_value_changed =
+    (removed == ConfigValues::SET_HAVE_EFFECT);
+  if (effective_value_changed) {
+    _refresh(values, *find_option(key));
+  }
+  values_bl.clear();
+  return 0;
+}
+
+namespace {
+template<typename Size>
+struct get_size_visitor : public boost::static_visitor<Size>  // extracts a Size from a visited variant alternative
+{
+  get_size_visitor() {}
+
+  template<typename T>
+  Size operator()(const T&) const {  // fallback for every alternative without a dedicated overload below
+    return -1;
+  }
+  Size operator()(const Option::size_t& sz) const {  // size option: convert its stored value
+    return static_cast<Size>(sz.value);
+  }
+  Size operator()(const Size& v) const {  // already the requested type: pass through
+    return v;
+  }
+};
+
+/**
+ * Visitor over a variant of pointers-to-members: stores the held value @c val into the member of @c *conf that the visited pointer selects.
+ */
+template<class Config>
+class assign_visitor : public boost::static_visitor<>
+{
+  Config *conf;  // object whose member is assigned
+  Option::value_t val;  // value to assign (a copy taken at construction)
+  public:
+
+  assign_visitor(Config *conf_, Option::value_t val_)
+    : conf(conf_), val(val_)
+  {}
+
+  template <typename T>
+  void operator()(T Config::* ptr) const  // generic case: val is read as exactly T
+  {
+    T *member = const_cast<T *>(&(conf->*(boost::get<const T Config::*>(ptr))));
+
+    *member = boost::get<T>(val);
+  }
+  void operator()(uint64_t Config::* ptr) const  // unsigned member: val is converted through get_size_visitor
+  {
+    using T = uint64_t;
+    auto member = const_cast<T*>(&(conf->*(boost::get<const T Config::*>(ptr))));
+    *member = boost::apply_visitor(get_size_visitor<T>{}, val);
+  }
+  void operator()(int64_t Config::* ptr) const  // signed member: same conversion path as the uint64_t overload
+  {
+    using T = int64_t;
+    auto member = const_cast<T*>(&(conf->*(boost::get<const T Config::*>(ptr))));
+    *member = boost::apply_visitor(get_size_visitor<T>{}, val);
+  }
+};
+} // anonymous namespace
+
+/// Refresh every legacy member listed in legacy_values from the current
+/// option values.  schema.at() throws if a legacy name is not in the schema.
+void md_config_t::update_legacy_vals(ConfigValues& values)
+{
+  for (const auto &[name, member_ptr] : legacy_values) {
+    update_legacy_val(values, schema.at(name), member_ptr);
+  }
+}
+
+/// Copy the current value of @p opt into the ConfigValues member selected
+/// by @p member_ptr.
+void md_config_t::update_legacy_val(ConfigValues& values,
+				    const Option &opt,
+                                    md_config_t::member_ptr_t member_ptr)
+{
+  const assign_visitor assigner(&values, _get_val(values, opt));
+  boost::apply_visitor(assigner, member_ptr);
+}
+
+/// Emit @p in to @p f under the name of config level @p level, using the
+/// typed dump call for bool / int64_t / uint64_t / double and the string
+/// form from Option::to_str() for everything else.
+static void dump(Formatter *f, int level, Option::value_t in)
+{
+  const char *label = ceph_conf_level_name(level);
+
+  if (const bool *b = boost::get<const bool>(&in)) {
+    f->dump_bool(label, *b);
+    return;
+  }
+  if (const int64_t *i = boost::get<const int64_t>(&in)) {
+    f->dump_int(label, *i);
+    return;
+  }
+  if (const uint64_t *u = boost::get<const uint64_t>(&in)) {
+    f->dump_unsigned(label, *u);
+    return;
+  }
+  if (const double *d = boost::get<const double>(&in)) {
+    f->dump_float(label, *d);
+    return;
+  }
+  f->dump_stream(label) << Option::to_str(in);
+}
+
+/// Dump, for every option that has at least one stored value, one object
+/// section holding: the compiled-in default (only when no CONF_DEFAULT
+/// entry is stored), each stored per-level value, and the effective value
+/// labelled CONF_FINAL.
+///
+/// The @p name parameter is not consulted; the visitor below works on the
+/// key supplied by ConfigValues::for_each().
+void md_config_t::diff(
+  const ConfigValues& values,
+  Formatter *f,
+  string name) const
+{
+  auto dump_option = [this, f, &values] (auto& key, auto& levels) {
+    if (levels.empty()) {
+      return;
+    }
+    f->open_object_section(key.c_str());
+    const Option *opt = find_option(key);
+    // levels is non-empty here, so begin() is valid
+    const bool has_default_override =
+      (levels.begin()->first == CONF_DEFAULT);
+    if (!has_default_override) {
+      // show compiled-in default only if an override default wasn't provided
+      dump(f, CONF_DEFAULT, _get_val_default(*opt));
+    }
+    for (auto& stored : levels) {
+      dump(f, stored.first, stored.second);
+    }
+    dump(f, CONF_FINAL, _get_val(values, *opt));
+    f->close_section();
+  };
+  values.for_each(dump_option);
+}
+
+/// Hand the accumulated parse_errors to the free function of the same name
+/// (explicitly qualified with :: so the member does not call itself).
+void md_config_t::complain_about_parse_errors(CephContext *cct)
+{
+  auto *errors = &parse_errors;
+  ::complain_about_parse_errors(cct, errors);
+}
diff --git a/src/common/config.h b/src/common/config.h
new file mode 100644
index 00000000..a432a3ae
--- /dev/null
+++ b/src/common/config.h
@@ -0,0 +1,370 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFIG_H
+#define CEPH_CONFIG_H
+
+#include <map>
+#include <boost/container/small_vector.hpp>
+#include "common/ConfUtils.h"
+#include "common/code_environment.h"
+#include "log/SubsystemMap.h"
+#include "common/options.h"
+#include "common/subsys_types.h"
+#include "common/config_tracker.h"
+#include "common/config_values.h"
+
+class CephContext;
+
+enum {  // CONF_* levels passed as the `level` argument when setting or dumping values; names come from ceph_conf_level_name()
+  CONF_DEFAULT,  // md_config_t::diff() dumps the compiled-in default under this level
+  CONF_MON,  // presumably values received from the monitor (see set_mon_vals) -- TODO confirm
+  CONF_FILE,  // presumably values read from a config file -- TODO confirm
+  CONF_ENV,  // presumably values taken from the environment (see parse_env) -- TODO confirm
+  CONF_CMDLINE,  // default level used by md_config_t::parse_argv()
+  CONF_OVERRIDE,  // presumably runtime overrides -- TODO confirm
+  CONF_FINAL  // md_config_t::diff() dumps the effective value under this level
+};
+
+extern const char *ceph_conf_level_name(int level);
+
+/** This class represents the current Ceph configuration.
+ *
+ * For Ceph daemons, this is the daemon configuration.  Log levels, caching
+ * settings, btrfs settings, and so forth can all be found here.  For libcephfs
+ * and librados users, this is the configuration associated with their context.
+ *
+ * For information about how this class is loaded from a configuration file,
+ * see common/ConfUtils.
+ *
+ * ACCESS
+ *
+ * There are 3 ways to read the ceph context-- the old way and two new ways.
+ * In the old way, code would simply read the public variables of the
+ * configuration, without taking a lock. In the new way #1, code registers a
+ * configuration observer which receives callbacks when a value changes. These
+ * callbacks take place under the md_config_t lock. Alternatively one can use
+ * get_val(const char *name) method to safely get a copy of the value.
+ *
+ * To prevent serious problems resulting from thread-safety issues, we disallow
+ * changing std::string configuration values after
+ * md_config_t::safe_to_start_threads becomes true. You can still
+ * change integer or floating point values, and the option declared with
+ * SAFE_OPTION macro. Notice the latter options can not be read directly
+ * (conf->foo), one should use either observers or get_val() method
+ * (conf->get_val("foo")).
+ *
+ * FIXME: really we shouldn't allow changing integer or floating point values
+ * while another thread is reading them, either.
+ */
+struct md_config_t {
+public:
+  typedef boost::variant<int64_t ConfigValues::*,
+                         uint64_t ConfigValues::*,
+                         std::string ConfigValues::*,
+                         double ConfigValues::*,
+                         bool ConfigValues::*,
+                         entity_addr_t ConfigValues::*,
+			 entity_addrvec_t ConfigValues::*,
+                         uuid_d ConfigValues::*> member_ptr_t;
+
+  // For use when intercepting configuration updates
+  typedef std::function<bool(
+      const std::string &k, const std::string &v)> config_callback;
+
+  /// true if we are a daemon (as per CephContext::code_env)
+  const bool is_daemon;
+
+  /*
+   * Mapping from legacy config option names to class members
+   */
+  std::map<std::string, member_ptr_t> legacy_values;
+
+  /**
+   * The configuration schema, in the form of Option objects describing
+   * possible settings.
+   */
+  std::map<std::string, const Option&> schema;
+
+  /// values from mon that we failed to set
+  std::map<std::string,std::string> ignored_mon_values;
+
+  /// original raw values saved that may need to re-expand at certain time
+  mutable std::vector<std::string> may_reexpand_meta;
+
+  /// encoded, cached copy of values + ignored_mon_values
+  bufferlist values_bl;
+
+  /// version for values_bl; increments each time there is a change
+  uint64_t values_bl_version = 0;
+
+  /// encoded copy of defaults (map<string,string>)
+  bufferlist defaults_bl;
+
+  typedef enum {
+    OPT_INT, OPT_LONGLONG, OPT_STR, OPT_DOUBLE, OPT_FLOAT, OPT_BOOL,
+    OPT_ADDR, OPT_ADDRVEC, OPT_U32, OPT_U64, OPT_UUID
+  } opt_type_t;
+
+  // Create a new md_config_t structure.
+  explicit md_config_t(ConfigValues& values,
+		       const ConfigTracker& tracker,
+		       bool is_daemon=false);
+  ~md_config_t();
+
+  // Parse a config file
+  int parse_config_files(ConfigValues& values, const ConfigTracker& tracker,
+			 const char *conf_files,
+			 std::ostream *warnings, int flags);
+
+  // Absorb config settings from the environment
+  void parse_env(unsigned entity_type,
+		 ConfigValues& values, const ConfigTracker& tracker,
+		 const char *env_var = "CEPH_ARGS");
+
+  // Absorb config settings from argv
+  int parse_argv(ConfigValues& values, const ConfigTracker& tracker,
+		 std::vector<const char*>& args, int level=CONF_CMDLINE);
+
+  // do any commands we got from argv (--show-config, --show-config-val)
+  void do_argv_commands(const ConfigValues& values) const;
+
+  bool _internal_field(const string& k);
+
+  void set_safe_to_start_threads();
+  void _clear_safe_to_start_threads();  // this is only used by the unit test
+
+  /// Look up an option in the schema
+  const Option *find_option(const string& name) const;
+
+  /// Set a default value
+  void set_val_default(ConfigValues& values,
+		       const ConfigTracker& tracker,
+		       const std::string& key, const std::string &val);
+
+  /// Set values received from the mon
+  int set_mon_vals(CephContext *cct,
+      ConfigValues& values,
+      const ConfigTracker& tracker,
+      const map<std::string,std::string>& kv,
+      config_callback config_cb);
+
+  // Called by the Ceph daemons to make configuration changes at runtime
+  int injectargs(ConfigValues& values,
+		 const ConfigTracker& tracker,
+		 const std::string &s,
+		 std::ostream *oss);
+
+  // Set a configuration value, or crash
+  // Metavariables will be expanded.
+  void set_val_or_die(ConfigValues& values, const ConfigTracker& tracker,
+		      const std::string &key, const std::string &val);
+
+  // Set a configuration value.
+  // Metavariables will be expanded.
+  int set_val(ConfigValues& values, const ConfigTracker& tracker,
+	      const std::string &key, const char *val,
+              std::stringstream *err_ss=nullptr);
+  // std::string convenience overload: forwards to the const char* variant.
+  int set_val(ConfigValues& values, const ConfigTracker& tracker,
+	      const std::string &key, const string& s,
+              std::stringstream *err_ss=nullptr) {
+    const char *raw = s.c_str();
+    return set_val(values, tracker, key, raw, err_ss);
+  }
+
+  /// clear override value
+  int rm_val(ConfigValues& values, const std::string& key);
+
+  /// get encoded map<string,map<int32_t,string>> of entire config
+  void get_config_bl(const ConfigValues& values,
+		     uint64_t have_version,
+		     bufferlist *bl,
+		     uint64_t *got_version);
+
+  /// get encoded map<string,string> of compiled-in defaults
+  void get_defaults_bl(const ConfigValues& values, bufferlist *bl);
+
+  // Get a configuration value.
+  // No metavariables will be returned (they will have already been expanded)
+  int get_val(const ConfigValues& values, const std::string &key, char **buf, int len) const;
+  int get_val(const ConfigValues& values, const std::string &key, std::string *val) const;
+  Option::value_t get_val_generic(const ConfigValues& values, const std::string &key) const;
+  template<typename T> const T get_val(const ConfigValues& values, const std::string &key) const;
+  template<typename T, typename Callback, typename...Args>
+  auto with_val(const ConfigValues& values, const string& key,  // invoke cb with the value of option `key`, read as type T
+		Callback&& cb, Args&&... args) const ->
+    std::result_of_t<Callback(const T&, Args...)> {  // returns whatever cb returns
+    return std::forward<Callback>(cb)(
+      boost::get<T>(this->get_val_generic(values, key)),  // value comes from a temporary: cb must not keep a reference to it
+      std::forward<Args>(args)...);  // extra arguments are forwarded to cb unchanged
+  }
+
+  void get_all_keys(std::vector<std::string> *keys) const;
+
+  // Return a list of all the sections that the current entity is a member of.
+  void get_my_sections(const ConfigValues& values,
+		       std::vector <std::string> &sections) const;
+
+  // Return a list of all sections
+  int get_all_sections(std::vector <std::string> &sections) const;
+
+  // Get a value from the configuration file that we read earlier.
+  // Metavariables will be expanded if emeta is true.
+  int get_val_from_conf_file(const ConfigValues& values,
+		   const std::vector <std::string> &sections,
+		   std::string const &key, std::string &out, bool emeta) const;
+
+  /// dump all config values to a stream
+  void show_config(const ConfigValues& values, std::ostream& out) const;
+  /// dump all config values to a formatter
+  void show_config(const ConfigValues& values, Formatter *f) const;
+  
+  /// dump all config settings to a formatter
+  void config_options(Formatter *f) const;
+
+  /// dump config diff from default, conf, mon, etc.
+  void diff(const ConfigValues& values,
+	    Formatter *f,
+	    std::string name=string{}) const;
+
+  /// print/log warnings/errors from parsing the config
+  void complain_about_parse_errors(CephContext *cct);
+
+private:
+  // we use this to avoid variable expansion loops
+  typedef boost::container::small_vector<pair<const Option*,
+					      const Option::value_t*>,
+					 4> expand_stack_t;
+
+  void validate_schema();
+  void validate_default_settings();
+
+  int _get_val_cstr(const ConfigValues& values,
+		    const std::string &key, char **buf, int len) const;
+  Option::value_t _get_val(const ConfigValues& values,
+			   const std::string &key,
+			   expand_stack_t *stack=0,
+			   std::ostream *err=0) const;
+  Option::value_t _get_val(const ConfigValues& values,
+			   const Option& o,
+			   expand_stack_t *stack=0,
+			   std::ostream *err=0) const;
+  const Option::value_t& _get_val_default(const Option& o) const;
+  Option::value_t _get_val_nometa(const ConfigValues& values,
+				  const Option& o) const;
+
+  int _rm_val(ConfigValues& values, const std::string& key, int level);
+
+  void _refresh(ConfigValues& values, const Option& opt);
+
+  void _show_config(const ConfigValues& values,
+		    std::ostream *out, Formatter *f) const;
+
+  void _get_my_sections(const ConfigValues& values,
+			std::vector <std::string> &sections) const;
+
+  int _get_val_from_conf_file(const std::vector <std::string> &sections,
+			      const std::string &key, std::string &out) const;
+
+  int parse_option(ConfigValues& values,
+		   const ConfigTracker& tracker,
+		   std::vector<const char*>& args,
+		   std::vector<const char*>::iterator& i,
+		   std::ostream *oss,
+		   int level);
+  int parse_injectargs(ConfigValues& values,
+		       const ConfigTracker& tracker,
+		       std::vector<const char*>& args,
+		       std::ostream *oss);
+
+  // @returns negative number for an error, otherwise a
+  //          @c ConfigValues::set_value_result_t is returned.
+  int _set_val(
+    ConfigValues& values,
+    const ConfigTracker& tracker,
+    const std::string &val,
+    const Option &opt,
+    int level,  // CONF_*
+    std::string *error_message);
+
+  template <typename T>
+  void assign_member(member_ptr_t ptr, const Option::value_t &val);
+
+
+  void update_legacy_vals(ConfigValues& values);
+  void update_legacy_val(ConfigValues& values,
+			 const Option &opt,
+			 member_ptr_t member);
+
+  Option::value_t _expand_meta(
+    const ConfigValues& values,
+    const Option::value_t& in,
+    const Option *o,
+    expand_stack_t *stack,
+    std::ostream *err) const;
+
+public:  // for global_init
+  void early_expand_meta(const ConfigValues& values,
+			 std::string &val,
+			 std::ostream *oss) const;
+
+  // for those who want to re-expand special meta, e.g., $pid
+  bool finalize_reexpand_meta(ConfigValues& values,
+			      const ConfigTracker& tracker);
+private:
+
+  /// expand all metavariables in config structure.
+  void expand_all_meta();
+
+  // The configuration file we read, or NULL if we haven't read one.
+  ConfFile cf;
+public:
+  std::deque<std::string> parse_errors;
+private:
+
+  // This will be set to true when it is safe to start threads.
+  // Once it is true, it will never change.
+  bool safe_to_start_threads = false;
+
+  bool do_show_config = false;
+  string do_show_config_value;
+
+  vector<Option> subsys_options;
+
+public:
+  string data_dir_option;  ///< data_dir config option, if any
+
+public:
+  /**
+   * Default min_size for a pool with @p size replicas.
+   *
+   * @return osd_pool_default_min_size capped at @p size when that option is
+   *         non-zero, otherwise size - size/2.
+   */
+  unsigned get_osd_pool_default_min_size(const ConfigValues& values,
+                                         uint8_t size) const {
+    // Keep the option at its full 64-bit width: narrowing it to uint8_t
+    // before the test would turn 256 into 0 ("unset") and 257 into 1.
+    const uint64_t min_size =
+      get_val<uint64_t>(values, "osd_pool_default_min_size");
+    if (min_size == 0) {
+      return size - size / 2;
+    }
+    // the result is bounded by size, so it always fits in unsigned
+    return static_cast<unsigned>(std::min<uint64_t>(min_size, size));
+  }
+
+  friend class test_md_config_t;
+};
+
+/// Typed lookup: fetch the generic value for @p key and return the
+/// alternative of type T held in it (via boost::get<T>).
+template<typename T>
+const T md_config_t::get_val(const ConfigValues& values,
+			     const std::string &key) const {
+  const Option::value_t generic = this->get_val_generic(values, key);
+  return boost::get<T>(generic);
+}
+
+/// Stream form of an empty (boost::blank) config value.
+inline std::ostream& operator<<(std::ostream& o, const boost::blank& ) {
+  o << "INVALID_CONFIG_VALUE";
+  return o;
+}
+
+int ceph_resolve_file_search(const std::string& filename_list,
+			     std::string& result);
+
+#endif
diff --git a/src/common/config_cacher.h b/src/common/config_cacher.h
new file mode 100644
index 00000000..a84bad08
--- /dev/null
+++ b/src/common/config_cacher.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFIG_CACHER_H
+#define CEPH_CONFIG_CACHER_H
+
+#include "common/config_obs.h"
+#include "common/config.h"
+
+/**
+ * Caches the value of one config option in an atomic and keeps it current
+ * by registering itself as a config observer for that option.
+ *
+ * @tparam ValueT type of the option; must be usable with std::atomic and
+ *                with ConfigProxy::get_val<ValueT>().
+ */
+template <typename ValueT>
+class md_config_cacher_t : public md_config_obs_t {
+  ConfigProxy& conf;
+  const char* const option_name;
+  // Null-terminated key table returned by get_tracked_conf_keys().  It is a
+  // per-instance member: a function-local static would be initialised only
+  // once per ValueT, so every later cacher of the same type would report
+  // the first instance's option name.  mutable because the accessor is
+  // const but must return a pointer to non-const entries.
+  mutable const char* tracked_keys[2];
+  std::atomic<ValueT> value_cache;
+
+  /// The single option this cacher observes, as a null-terminated table.
+  const char** get_tracked_conf_keys() const override {
+    return tracked_keys;
+  }
+
+  /// Reload the cached value when our option is in the changed set.
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string>& changed) override {
+    if (changed.count(option_name)) {
+      value_cache.store(conf.get_val<ValueT>(option_name));
+    }
+  }
+
+public:
+  /// Register as an observer of @p option_name on @p conf and load the
+  /// option's current value into the cache.
+  md_config_cacher_t(ConfigProxy& conf,
+                     const char* const option_name)
+    : conf(conf),
+      option_name(option_name),
+      tracked_keys{option_name, nullptr} {
+    // tracked_keys is initialised above, before add_observer() reads it
+    conf.add_observer(this);
+    std::atomic_init(&value_cache,
+                     conf.get_val<ValueT>(option_name));
+  }
+
+  ~md_config_cacher_t() {
+    conf.remove_observer(this);
+  }
+
+  /// Read the cached value.
+  operator ValueT() const {
+    return value_cache.load();
+  }
+};
+
+#endif // CEPH_CONFIG_CACHER_H
+
diff --git a/src/common/config_fwd.h b/src/common/config_fwd.h
new file mode 100644
index 00000000..817d1a0f
--- /dev/null
+++ b/src/common/config_fwd.h
@@ -0,0 +1,12 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#ifdef WITH_SEASTAR
+namespace ceph::common {
+  class ConfigProxy;
+}
+using ConfigProxy = ceph::common::ConfigProxy;
+#else
+class ConfigProxy;
+#endif
diff --git a/src/common/config_obs.h b/src/common/config_obs.h
new file mode 100644
index 00000000..20d12ad8
--- /dev/null
+++ b/src/common/config_obs.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONFIG_OBS_H
+#define CEPH_CONFIG_OBS_H
+
+#include <set>
+#include <string>
+
+#include "common/config_fwd.h"
+
+namespace ceph {
+/** @brief Base class for configuration observers.
+ * Use this as a base class for your object if it has to respond to configuration changes,
+ * for example by updating some values or modifying its behavior.
+ * Subscribe for configuration changes by calling add_observer() on the config (e.g. ConfigProxy::add_observer())
+ * and unsubscribe using the matching remove_observer().
+ */
+template<class ConfigProxy>
+class md_config_obs_impl {
+public:
+  virtual ~md_config_obs_impl() {}
+  /** @brief Get a null-pointer-terminated table of strings specifying the configuration keys in which the object is interested.
+   * This is called when the object is subscribed to configuration changes with add_observer().
+   * The returned table should not be freed until the observer is removed with remove_observer().
+   * Note that it is not possible to change the set of tracked keys without re-subscribing. */
+  virtual const char** get_tracked_conf_keys() const = 0;
+  /// React to a configuration change; @p changed holds the names of the changed options.
+  virtual void handle_conf_change(const ConfigProxy& conf,
+				  const std::set <std::string> &changed) = 0;
+  /// Unused for now; the default implementation does nothing.
+  virtual void handle_subsys_change(const ConfigProxy& conf,
+				    const std::set<int>& changed) { }
+};
+}  // namespace ceph
+
+using md_config_obs_t = ceph::md_config_obs_impl<ConfigProxy>;
+
+#endif
diff --git a/src/common/config_obs_mgr.h b/src/common/config_obs_mgr.h
new file mode 100644
index 00000000..25a8b05b
--- /dev/null
+++ b/src/common/config_obs_mgr.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "common/config_tracker.h"
+
+class ConfigValues;
+
+// @c ObserverMgr manages a set of config observers which are interested in
+// changes of settings at runtime.
+template<class ConfigObs>
+class ObserverMgr : public ConfigTracker {
+  // Maps configuration option names to the observers listening for them.
+  using obs_map_t = std::multimap<std::string, ConfigObs*>;
+  obs_map_t observers;
+
+public:
+  typedef std::map<ConfigObs*, std::set<std::string>> rev_obs_map;  // observer -> set of option names
+  typedef std::function<void(ConfigObs*, const std::string&)> config_gather_cb;  // called as callback(observer, key)
+
+  // Adds a new observer to this configuration. You can do this at any time,
+  // but it will only receive notifications for the changes that happen after
+  // you attach it, obviously.
+  //
+  // Most developers will probably attach their observers after global_init,
+  // but before anyone can call injectargs.
+  //
+  // The caller is responsible for allocating observers.
+  void add_observer(ConfigObs* observer);
+
+  // Remove an observer from this configuration.
+  // This doesn't delete the observer! If you allocated it with new(),
+  // you need to delete it yourself.
+  // This function will assert if you try to remove an observer that isn't
+  // there.
+  void remove_observer(ConfigObs* observer);
+  // invoke callback once for every registered (key, observer) pair
+  void for_each_observer(config_gather_cb callback);
+  // invoke callback for each observer tracking a key in the provided change set
+  template<class ConfigProxyT>
+  void for_each_change(const std::set<std::string>& changes,
+                       ConfigProxyT& proxy,
+                       config_gather_cb callback, std::ostream *oss);
+  bool is_tracking(const std::string& name) const override;  // true if at least one observer is registered for name
+};
+
+// we could put the implementations in a .cc file, and only instantiate the
+// used template specializations explicitly, but that forces us to involve
+// unused headers and libraries at compile-time. for instance, to
+// instantiate ObserverMgr for seastar, we will need to include seastar
+// headers to get the necessary types in place, but that would force us to link
+// the non-seastar binaries against seastar libraries. so, to avoid pulling
+// in unused dependencies at the expense of increasing compiling time, we put
+// the implementation in the header file.
+// Register `observer` under every key in its null-terminated key table.
+template<class ConfigObs>
+void ObserverMgr<ConfigObs>::add_observer(ConfigObs* observer)
+{
+  const char **key = observer->get_tracked_conf_keys();
+  while (*key != nullptr) {
+    observers.emplace(*key, observer);
+    ++key;
+  }
+}
+
+// Drop every registration of `observer`; asserts that at least one existed.
+template<class ConfigObs>
+void ObserverMgr<ConfigObs>::remove_observer(ConfigObs* observer)
+{
+  [[maybe_unused]] bool found_obs = false;
+  auto it = observers.begin();
+  while (it != observers.end()) {
+    if (it->second != observer) {
+      ++it;
+      continue;
+    }
+    // erase() returns the iterator following the removed element
+    it = observers.erase(it);
+    found_obs = true;
+  }
+  ceph_assert(found_obs);
+}
+
+// Call callback(observer, key) for every registration, in key order.
+template<class ConfigObs>
+void ObserverMgr<ConfigObs>::for_each_observer(config_gather_cb callback)
+{
+  for (auto reg = observers.begin(); reg != observers.end(); ++reg) {
+    callback(reg->second, reg->first);
+  }
+}
+
+// For each changed key: when `oss` is given and the proxy can return the
+// key's value, report "key = 'value'" (flagging keys nobody observes), then
+// call callback(observer, key) for every observer registered for that key.
+template<class ConfigObs>
+template<class ConfigProxyT>
+void ObserverMgr<ConfigObs>::for_each_change(const std::set<std::string>& changes,
+                                             ConfigProxyT& proxy,
+                                             config_gather_cb callback, std::ostream *oss)
+{
+  std::string val;
+  for (const auto& key : changes) {
+    const auto range = observers.equal_range(key);
+    const bool unobserved = (range.first == range.second);
+    if (oss != nullptr && proxy.get_val(key, &val) == 0) {
+      (*oss) << key << " = '" << val << "' ";
+      if (unobserved) {
+        (*oss) << "(not observed, change may require restart) ";
+      }
+    }
+    for (auto reg = range.first; reg != range.second; ++reg) {
+      callback(reg->second, key);
+    }
+  }
+}
+
+// True when at least one observer is registered for `name`.
+template<class ConfigObs>
+bool ObserverMgr<ConfigObs>::is_tracking(const std::string& name) const
+{
+  return observers.find(name) != observers.end();
+}
diff --git a/src/common/config_proxy.h b/src/common/config_proxy.h
new file mode 100644
index 00000000..6f886c25
--- /dev/null
+++ b/src/common/config_proxy.h
@@ -0,0 +1,335 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <type_traits>
+#include "common/config.h"
+#include "common/config_obs.h"
+#include "common/config_obs_mgr.h"
+#include "common/ceph_mutex.h"
+
+// @c ConfigProxy is a facade of multiple config related classes. it exposes
+// the legacy settings with arrow operator, and the new-style config with its
+// member methods.
+class ConfigProxy {  // owns the values, the md_config_t that interprets them, and the observer registry
+  static ConfigValues get_config_values(const ConfigProxy &config_proxy) {  // copies another proxy's values while holding that proxy's lock
+    std::lock_guard locker(config_proxy.lock);
+    return config_proxy.values;
+  }
+
+  /**
+   * The current values of all settings described by the schema
+   */
+  ConfigValues values;
+  using md_config_obs_t = ceph::md_config_obs_impl<ConfigProxy>;
+  ObserverMgr<md_config_obs_t> obs_mgr;  // registry of observers, passed to md_config_t alongside values
+  md_config_t config;  // constructed from values and obs_mgr (see the constructors)
+  /** A lock that protects the md_config_t internals. It is
+   * recursive, for simplicity.
+   * It is best if this lock comes first in the lock hierarchy. We will
+   * hold this lock when calling configuration observers.  */
+  mutable ceph::recursive_mutex lock =
+    ceph::make_recursive_mutex("ConfigProxy::lock");
+
+  class CallGate {  // counts in-flight uses; close() blocks until the count is back to zero
+  private:
+    uint32_t call_count = 0;  // number of enter() calls not yet matched by leave()
+    ceph::mutex lock;  // guards call_count; distinct from ConfigProxy::lock
+    ceph::condition_variable cond;
+  public:
+    CallGate()
+      : lock(ceph::make_mutex("call::gate::lock")) {
+    }
+
+    void enter() {  // register one more in-flight use
+      std::lock_guard<ceph::mutex> locker(lock);
+      ++call_count;
+    }
+    void leave() {  // end one use; wakes close() waiters once none remain
+      std::lock_guard<ceph::mutex> locker(lock);
+      ceph_assert(call_count > 0);
+      if (--call_count == 0) {
+        cond.notify_all();
+      }
+    }
+    void close() {  // block until call_count is zero
+      std::unique_lock<ceph::mutex> locker(lock);
+      while (call_count != 0) {
+        cond.wait(locker);
+      }
+    }
+  };
+
+  void call_gate_enter(md_config_obs_t *obs) {  // the three helpers below assert that obs has a gate in obs_call_gate
+    auto p = obs_call_gate.find(obs);
+    ceph_assert(p != obs_call_gate.end());
+    p->second->enter();
+  }
+  void call_gate_leave(md_config_obs_t *obs) {
+    auto p = obs_call_gate.find(obs);
+    ceph_assert(p != obs_call_gate.end());
+    p->second->leave();
+  }
+  void call_gate_close(md_config_obs_t *obs) {  // blocks until the observer's gate has no in-flight use
+    auto p = obs_call_gate.find(obs);
+    ceph_assert(p != obs_call_gate.end());
+    p->second->close();
+  }
+
+  using rev_obs_map_t = ObserverMgr<md_config_obs_t>::rev_obs_map;  // used here as observer -> set of changed keys
+  typedef std::unique_ptr<CallGate> CallGateRef;
+
+  std::map<md_config_obs_t*, CallGateRef> obs_call_gate;  // one gate per observer: added in add_observer(), erased in remove_observer()
+
+  void call_observers(std::unique_lock<ceph::recursive_mutex>& locker,  // expects locker to hold `lock`; returns with it held again
+                      rev_obs_map_t& rev_obs) {
+    // observers are notified outside of lock
+    locker.unlock();
+    for (auto& [obs, keys] : rev_obs) {
+      obs->handle_conf_change(*this, keys);
+    }
+    locker.lock();  // the gates entered in map_observer_changes() are only left after `lock` is re-acquired
+
+    for (auto& rev_ob : rev_obs) {
+      call_gate_leave(rev_ob.first);
+    }
+  }
+
+  void map_observer_changes(md_config_obs_t *obs, const std::string &key,  // note that obs must be told about key; caller must hold `lock`
+                            rev_obs_map_t *rev_obs) {
+    ceph_assert(ceph_mutex_is_locked(lock));
+
+    auto [it, new_entry] = rev_obs->emplace(obs, std::set<std::string>{});
+    it->second.emplace(key);
+    if (new_entry) {  // enter the gate once per observer, however many keys it gets
+      // this needs to be done under lock as once this lock is
+      // dropped (before calling observers) a remove_observer()
+      // can sneak in and cause havoc.
+      call_gate_enter(obs);
+    }
+  }
+
+public:
+  explicit ConfigProxy(bool is_daemon)
+    : config{values, obs_mgr, is_daemon}
+  {}
+  explicit ConfigProxy(const ConfigProxy &config_proxy)  // copies values only; obs_mgr and obs_call_gate start out empty
+    : values(get_config_values(config_proxy)),
+      config{values, obs_mgr, config_proxy.config.is_daemon}
+  {}
+  const ConfigValues* operator->() const noexcept {  // direct access to the value members; does not take `lock`
+    return &values;
+  }
+  ConfigValues* operator->() noexcept {
+    return &values;
+  }
+  int get_val(const std::string& key, char** buf, int len) const {  // the accessors below take `lock` and forward to md_config_t unless noted
+    std::lock_guard l{lock};
+    return config.get_val(values, key, buf, len);
+  }
+  int get_val(const std::string &key, std::string *val) const {
+    std::lock_guard l{lock};
+    return config.get_val(values, key, val);
+  }
+  template<typename T>
+  const T get_val(const std::string& key) const {
+    std::lock_guard l{lock};
+    return config.template get_val<T>(values, key);
+  }
+  template<typename T, typename Callback, typename...Args>
+  auto with_val(const string& key, Callback&& cb, Args&&... args) const {  // cb is invoked by md_config_t::with_val while `lock` is held
+    std::lock_guard l{lock};
+    return config.template with_val<T>(values, key,
+				       std::forward<Callback>(cb),
+				       std::forward<Args>(args)...);
+  }
+  void config_options(Formatter *f) const {  // NOTE(review): unlike the non-const overload further down, this one does not take `lock`
+    config.config_options(f);
+  }
+  const decltype(md_config_t::schema)& get_schema() const {  // schema accessors do not take `lock`
+    return config.schema;
+  }
+  const Option* get_schema(const std::string& key) const {  // nullptr when key is not in the schema
+    auto found = config.schema.find(key);
+    if (found == config.schema.end()) {
+      return nullptr;
+    } else {
+      return &found->second;
+    }
+  }
+  const Option *find_option(const string& name) const {
+    return config.find_option(name);
+  }
+  void diff(Formatter *f, const std::string& name=string{}) const {
+    std::lock_guard l{lock};
+    return config.diff(values, f, name);
+  }
+  void get_my_sections(std::vector <std::string> &sections) const {
+    std::lock_guard l{lock};
+    config.get_my_sections(values, sections);
+  }
+  int get_all_sections(std::vector<std::string>& sections) const {
+    std::lock_guard l{lock};
+    return config.get_all_sections(sections);
+  }
+  int get_val_from_conf_file(const std::vector<std::string>& sections,
+			     const std::string& key, std::string& out,
+			     bool emeta) const {
+    std::lock_guard l{lock};
+    return config.get_val_from_conf_file(values,
+					 sections, key, out, emeta);
+  }
+  unsigned get_osd_pool_default_min_size(uint8_t size) const {  // NOTE(review): passes values to md_config_t without taking `lock`
+    return config.get_osd_pool_default_min_size(values, size);
+  }
+  void early_expand_meta(std::string &val,
+			 std::ostream *oss) const {
+    std::lock_guard l{lock};
+    return config.early_expand_meta(values, val, oss);
+  }
+  // for those want to reexpand special meta, e.g, $pid
+  void finalize_reexpand_meta() {  // observers are notified only if md_config_t reports that something was re-expanded
+    std::unique_lock locker(lock);
+    rev_obs_map_t rev_obs;
+    if (config.finalize_reexpand_meta(values, obs_mgr)) {
+      _gather_changes(values.changed, &rev_obs, nullptr);
+    }
+
+    call_observers(locker, rev_obs);
+  }
+  void add_observer(md_config_obs_t* obs) {  // registers obs and creates its call gate
+    std::lock_guard l(lock);
+    obs_mgr.add_observer(obs);
+    obs_call_gate.emplace(obs, std::make_unique<CallGate>());
+  }
+  void remove_observer(md_config_obs_t* obs) {  // waits for the observer's gate to drain, then unregisters it
+    std::lock_guard l(lock);
+    call_gate_close(obs);  // NOTE(review): waits while holding `lock`, yet call_observers() must re-take `lock` before call_gate_leave() -- confirm this cannot deadlock
+    obs_call_gate.erase(obs);
+    obs_mgr.remove_observer(obs);
+  }
+  void call_all_observers() {  // notifies every observer with all the keys it is registered for
+    std::unique_lock locker(lock);
+    rev_obs_map_t rev_obs;
+    obs_mgr.for_each_observer(
+      [this, &rev_obs](md_config_obs_t *obs, const std::string &key) {
+        map_observer_changes(obs, key, &rev_obs);
+      });
+
+    call_observers(locker, rev_obs);
+  }
+  void set_safe_to_start_threads() {  // forwarded without taking `lock`
+    config.set_safe_to_start_threads();
+  }
+  void _clear_safe_to_start_threads() {
+    config._clear_safe_to_start_threads();
+  }
+  void show_config(std::ostream& out) {
+    std::lock_guard l{lock};
+    config.show_config(values, out);
+  }
+  void show_config(Formatter *f) {
+    std::lock_guard l{lock};
+    config.show_config(values, f);
+  }
+  void config_options(Formatter *f) {
+    std::lock_guard l{lock};
+    config.config_options(f);
+  }
+  int rm_val(const std::string& key) {
+    std::lock_guard l{lock};
+    return config.rm_val(values, key);
+  }
+  // Expand all metavariables. Make any pending observer callbacks.
+  void apply_changes(std::ostream* oss) {
+    std::unique_lock locker(lock);
+    rev_obs_map_t rev_obs;
+
+    // changes are only gathered once the cluster name has been assigned
+    if (!values.cluster.empty()) {
+      // meta expands could have modified anything.  Copy it all out again.
+      _gather_changes(values.changed, &rev_obs, oss);
+    }
+
+    call_observers(locker, rev_obs);
+  }
+  void _gather_changes(std::set<std::string> &changes,  // fills rev_obs for the given keys, then empties `changes`; reaches map_observer_changes(), which asserts `lock` is held
+                       rev_obs_map_t *rev_obs, std::ostream* oss) {
+    obs_mgr.for_each_change(
+      changes, *this,
+      [this, rev_obs](md_config_obs_t *obs, const std::string &key) {
+        map_observer_changes(obs, key, rev_obs);
+      }, oss);
+      changes.clear();
+  }
+  int set_val(const std::string& key, const std::string& s,  // the set_val* setters do not notify observers themselves
+              std::stringstream* err_ss=nullptr) {
+    std::lock_guard l{lock};
+    return config.set_val(values, obs_mgr, key, s, err_ss);
+  }
+  void set_val_default(const std::string& key, const std::string& val) {
+    std::lock_guard l{lock};
+    config.set_val_default(values, obs_mgr, key, val);
+  }
+  void set_val_or_die(const std::string& key, const std::string& val) {
+    std::lock_guard l{lock};
+    config.set_val_or_die(values, obs_mgr, key, val);
+  }
+  int set_mon_vals(CephContext *cct,  // applies kv, then notifies observers of whatever ended up in values.changed
+		   const map<std::string,std::string>& kv,
+		   md_config_t::config_callback config_cb) {
+    std::unique_lock locker(lock);
+    int ret = config.set_mon_vals(cct, values, obs_mgr, kv, config_cb);
+
+    rev_obs_map_t rev_obs;
+    _gather_changes(values.changed, &rev_obs, nullptr);
+
+    call_observers(locker, rev_obs);
+    return ret;
+  }
+  int injectargs(const std::string &s, std::ostream *oss) {  // like set_mon_vals(), but oss is also handed on to _gather_changes()
+    std::unique_lock locker(lock);
+    int ret = config.injectargs(values, obs_mgr, s, oss);
+
+    rev_obs_map_t rev_obs;
+    _gather_changes(values.changed, &rev_obs, oss);
+
+    call_observers(locker, rev_obs);
+    return ret;
+  }
+  void parse_env(unsigned entity_type,
+		 const char *env_var = "CEPH_ARGS") {
+    std::lock_guard l{lock};
+    config.parse_env(entity_type, values, obs_mgr, env_var);
+  }
+  int parse_argv(std::vector<const char*>& args, int level=CONF_CMDLINE) {
+    std::lock_guard l{lock};
+    return config.parse_argv(values, obs_mgr, args, level);
+  }
+  int parse_config_files(const char *conf_files,
+			 std::ostream *warnings, int flags) {
+    std::lock_guard l{lock};
+    return config.parse_config_files(values, obs_mgr,
+				     conf_files, warnings, flags);
+  }
+  size_t num_parse_errors() const {  // reads config.parse_errors without taking `lock`
+    return config.parse_errors.size();
+  }
+  void complain_about_parse_errors(CephContext *cct) {
+    return config.complain_about_parse_errors(cct);
+  }
+  void do_argv_commands() const {
+    std::lock_guard l{lock};
+    config.do_argv_commands(values);
+  }
+  void get_config_bl(uint64_t have_version,
+		     bufferlist *bl,
+		     uint64_t *got_version) {
+    std::lock_guard l{lock};
+    config.get_config_bl(values, have_version, bl, got_version);
+  }
+  void get_defaults_bl(bufferlist *bl) {
+    std::lock_guard l{lock};
+    config.get_defaults_bl(values, bl);
+  }
+};
diff --git a/src/common/config_tracker.h b/src/common/config_tracker.h
new file mode 100644
index 00000000..783e6f2e
--- /dev/null
+++ b/src/common/config_tracker.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <string>
+
+// @c ConfigTracker is queried to see whether any registered observer is
+// tracking a given setting.
+//
+// This interface exists to decouple @c md_config_t from any instantiation of
+// @c ObserverMgr: all the former needs is @c is_tracking(), and making
+// ObserverMgr a template parameter of md_config_t's methods would only tangle
+// the dependencies between header files and slow down compilation.
+class ConfigTracker {
+public:
+  virtual ~ConfigTracker() = default;  // virtual, so implementations can be destroyed through this interface
+  virtual bool is_tracking(const std::string& name) const = 0;  // whether @p name is being tracked; the answer is up to the implementation
+};
diff --git a/src/common/config_values.cc b/src/common/config_values.cc
new file mode 100644
index 00000000..24f556e3
--- /dev/null
+++ b/src/common/config_values.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+#include "config_values.h"
+
+#include "config.h"
+
+ConfigValues::set_value_result_t
+ConfigValues::set_value(const std::string& key,
+                        Option::value_t&& new_value,
+                        int level)
+{
+  // Record new_value for key at the given level and tell the caller whether
+  // anything changed, and if so whether the change is visible.
+  const auto entry = values.find(key);
+  if (entry == values.end()) {
+    // nothing recorded for this key yet: the new value is the only one
+    values[key][level] = std::move(new_value);
+    return SET_HAVE_EFFECT;
+  }
+
+  auto& by_level = entry->second;
+  const auto slot = by_level.find(level);
+  if (slot == by_level.end()) {
+    by_level[level] = std::move(new_value);
+  } else if (new_value == slot->second) {
+    return SET_NO_CHANGE;
+  } else {
+    slot->second = std::move(new_value);
+  }
+
+  // a value stored at a greater level keeps taking priority over this one
+  const bool shadowed = by_level.rbegin()->first > level;
+  return shadowed ? SET_NO_EFFECT : SET_HAVE_EFFECT;
+}
+
+int ConfigValues::rm_val(const std::string& key, int level)
+{
+  // Drop the value stored for key at exactly this level.  Returns -ENOENT when
+  // there is no such value, otherwise SET_HAVE_EFFECT or SET_NO_EFFECT.
+  const auto entry = values.find(key);
+  if (entry == values.end()) {
+    return -ENOENT;
+  }
+  auto& by_level = entry->second;
+  const auto slot = by_level.find(level);
+  if (slot == by_level.end()) {
+    return -ENOENT;
+  }
+  // only removing the entry with the greatest level counts as having an
+  // effect; this has to be checked before the entry is gone
+  const bool was_top = (slot->first == by_level.rbegin()->first);
+  by_level.erase(slot);
+  return was_top ? SET_HAVE_EFFECT : SET_NO_EFFECT;
+}
+
+std::pair<Option::value_t, bool>
+ConfigValues::get_value(const std::string& name, int level) const
+{
+  // Returns {value, true} when a value is found, and {empty value, false}
+  // otherwise.  A negative level asks for the entry with the greatest level.
+  const auto entry = values.find(name);
+  if (entry == values.end() || entry->second.empty()) {
+    return {Option::value_t{}, false};
+  }
+  const auto& by_level = entry->second;
+  if (level < 0) {
+    // use highest-priority value available (see CONF_*)
+    return {by_level.rbegin()->second, true};
+  }
+  const auto slot = by_level.find(level);
+  if (slot == by_level.end()) {
+    return {Option::value_t{}, false};
+  }
+  return {slot->second, true};
+}
+
+void ConfigValues::set_logging(int which, const char* val)
+{
+  // val is parsed as "<log>" or "<log>/<gather>"; with a single number the
+  // gather level is set to the log level.  Unparsable input changes nothing.
+  int log_level, gather_level;
+  const int parsed = sscanf(val, "%d/%d", &log_level, &gather_level);
+  if (parsed < 1) {
+    return;
+  }
+  if (parsed < 2) {
+    gather_level = log_level;
+  }
+  subsys.set_log_level(which, log_level);
+  subsys.set_gather_level(which, gather_level);
+}
+
+bool ConfigValues::contains(const std::string& key) const
+{
+  // true when the key has an entry in the value map
+  return values.find(key) != values.end();
+}
diff --git a/src/common/config_values.h b/src/common/config_values.h
new file mode 100644
index 00000000..ab52060e
--- /dev/null
+++ b/src/common/config_values.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+
+#include "common/entity_name.h"
+#include "common/options.h"
+#include "log/SubsystemMap.h"
+#include "msg/msg_types.h"
+
+// @c ConfigValues keeps track of mappings from the config names to their values,
+// debug logging settings, and some other "unnamed" settings, like entity name of
+// the daemon.
+class ConfigValues {
+  // option name -> (level -> value).  For a given name the entry with the
+  // greatest level is the one reported by get_value(name, -1).
+  using values_t = std::map<std::string, std::map<int32_t,Option::value_t>>;
+  values_t values;
+  // for populating md_config_impl::legacy_values in ctor
+  friend struct md_config_t;
+
+public:
+  EntityName name;
+  /// cluster name
+  std::string cluster;
+  ceph::logging::SubsystemMap subsys;
+  bool no_mon_config = false;
+  bool log_early = false;
+  // Set of configuration options that have changed since the last
+  // apply_changes
+  using changed_set_t = std::set<std::string>;
+  changed_set_t changed;
+
+// This macro block declares one data member of this class per entry in
+// legacy_config_opts.h.
+// These members are consumed by code that was written before
+// the new options.cc infrastructure: all newer code should
+// consume options via explicit get() rather than these members.
+//
+// Every helper macro defined here is #undef'd again right after the include
+// so that none of them leaks into files that include this header.
+#define OPTION_OPT_INT(name) int64_t name;
+#define OPTION_OPT_LONGLONG(name) int64_t name;
+#define OPTION_OPT_STR(name) std::string name;
+#define OPTION_OPT_DOUBLE(name) double name;
+#define OPTION_OPT_FLOAT(name) double name;
+#define OPTION_OPT_BOOL(name) bool name;
+#define OPTION_OPT_ADDR(name) entity_addr_t name;
+#define OPTION_OPT_ADDRVEC(name) entity_addrvec_t name;
+#define OPTION_OPT_U32(name) uint64_t name;
+#define OPTION_OPT_U64(name) uint64_t name;
+#define OPTION_OPT_UUID(name) uuid_d name;
+#define OPTION_OPT_SIZE(name) size_t name;
+#define OPTION(name, ty)       \
+  public:                      \
+    OPTION_##ty(name)
+#define SAFE_OPTION(name, ty)       \
+  protected:                        \
+    OPTION_##ty(name)
+#include "common/legacy_config_opts.h"
+#undef OPTION_OPT_INT
+#undef OPTION_OPT_LONGLONG
+#undef OPTION_OPT_STR
+#undef OPTION_OPT_DOUBLE
+#undef OPTION_OPT_FLOAT
+#undef OPTION_OPT_BOOL
+#undef OPTION_OPT_ADDR
+#undef OPTION_OPT_ADDRVEC
+#undef OPTION_OPT_U32
+#undef OPTION_OPT_U64
+#undef OPTION_OPT_UUID
+#undef OPTION_OPT_SIZE
+#undef OPTION
+#undef SAFE_OPTION
+
+public:
+  enum set_value_result_t {
+    SET_NO_CHANGE,    ///< the same value was already stored at that level
+    SET_NO_EFFECT,    ///< stored, but a value at a greater level takes priority
+    SET_HAVE_EFFECT,  ///< stored, and no value at a greater level exists
+  };
+  /**
+   * Store @p value for @p key at @p level.
+   *
+   * @return one of the @c set_value_result_t values describing the outcome
+   */
+  set_value_result_t set_value(const std::string& key,
+                               Option::value_t&& value,
+                               int level);
+  /**
+   * Remove the value stored for @p key at exactly @p level.
+   *
+   * @return -ENOENT if there is no such value, otherwise SET_HAVE_EFFECT
+   *         or SET_NO_EFFECT
+   */
+  int rm_val(const std::string& key, int level);
+  void set_logging(int which, const char* val);
+  /**
+   * @param level the level of the setting, -1 for the one with the
+   *              highest-priority
+   * @return the value and true if one was found, otherwise an empty value
+   *         and false
+   */
+  std::pair<Option::value_t, bool> get_value(const std::string& name,
+                                             int level) const;
+  /// call func(name, per-level values) for every stored option
+  template<typename Func> void for_each(Func&& func) const {
+    for (const auto& [name,configs] : values) {
+      func(name, configs);
+    }
+  }
+  /// @return true if any value is recorded under @p key
+  bool contains(const std::string& key) const;
+};
diff --git a/src/common/convenience.h b/src/common/convenience.h
new file mode 100644
index 00000000..ad01d48b
--- /dev/null
+++ b/src/common/convenience.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <mutex>
+#include <memory>
+#include <optional>
+#include <shared_mutex>
+#include <type_traits>
+#include <utility>
+
+#include <boost/optional.hpp>
+
+#include "include/ceph_assert.h" // I despise you. Not you the reader, I'm talking
+                            // to the include file.
+
+
+#ifndef CEPH_COMMON_CONVENIENCE_H
+#define CEPH_COMMON_CONVENIENCE_H
+
+namespace ceph {
+// boost::optional is wonderful! Unfortunately it lacks a function for
+// the thing you would most obviously want to do with it: apply a
+// function to its contents.
+
+// There are two obvious candidates. The first is a function that
+// takes a function and an optional value and returns an optional
+// value, either holding the return value of the function or holding
+// nothing.
+//
+// The result type is computed from the very expression that gets
+// evaluated, f(*t), so it always matches what the call returns.
+//
+// I'd considered making more overloads for mutable lvalue
+// references, but those are going a bit beyond likely use cases.
+//
+template<typename T, typename F>
+auto maybe_do(const boost::optional<T>& t, F&& f) ->
+  boost::optional<std::invoke_result_t<F, decltype(*t)>>
+{
+  if (t)
+    return { std::forward<F>(f)(*t) };
+  else
+    return boost::none;
+}
+
+// The other obvious function takes an optional but returns an
+// 'unwrapped' value, either the result of evaluating the function or
+// a provided alternate value.
+//
+// The return type and the convertibility check below both use the type
+// of the expression that is really evaluated, f(*t).
+//
+template<typename T, typename F, typename U>
+auto maybe_do_or(const boost::optional<T>& t, F&& f, U&& u) ->
+  std::invoke_result_t<F, decltype(*t)>
+{
+  static_assert(std::is_convertible_v<U, std::invoke_result_t<F, decltype(*t)>>,
+                "Alternate value must be convertible to function return type.");
+  if (t)
+    return std::forward<F>(f)(*t);
+  else
+    return std::forward<U>(u);
+}
+
+
+// Same thing but for std::optional
+//
+// Applies f to the contained value and returns the result wrapped in an
+// optional, or returns an empty optional when t is empty.  The result type
+// is that of the expression actually evaluated, f(*t).
+template<typename T, typename F>
+auto maybe_do(const std::optional<T>& t, F&& f) ->
+  std::optional<std::invoke_result_t<F, decltype(*t)>>
+{
+  if (t)
+    return { std::forward<F>(f)(*t) };
+  else
+    return std::nullopt;
+}
+
+// The other obvious function takes an optional but returns an
+// 'unwrapped' value, either the result of evaluating the function or
+// a provided alternate value.
+//
+// The return type and the convertibility check below both use the type
+// of the expression that is really evaluated, f(*t).
+//
+template<typename T, typename F, typename U>
+auto maybe_do_or(const std::optional<T>& t, F&& f, U&& u) ->
+  std::invoke_result_t<F, decltype(*t)>
+{
+  static_assert(std::is_convertible_v<U, std::invoke_result_t<F, decltype(*t)>>,
+                "Alternate value must be convertible to function return type.");
+  if (t)
+    return std::forward<F>(f)(*t);
+  else
+    return std::forward<U>(u);
+}
+
+namespace _convenience {
+// Call f on each element of t, lowest index first.  Tuple and F are deduced
+// together with their constness, so this single template serves every
+// combination of const/non-const tuple and const/non-const callable.
+template<typename Tuple, typename F, std::size_t... Is>
+inline void for_each_helper(Tuple& t, F& f, std::index_sequence<Is...>) {
+  (f(std::get<Is>(t)), ..., void());
+}
+}
+
+// Call f on every element of the tuple t, first element first.  One overload
+// exists for each combination of const/non-const tuple and callable.
+template<typename... Ts, typename F>
+inline void for_each(const std::tuple<Ts...>& t, const F& f) {
+  _convenience::for_each_helper(t, f,
+                                std::make_index_sequence<sizeof...(Ts)>{});
+}
+template<typename... Ts, typename F>
+inline void for_each(std::tuple<Ts...>& t, const F& f) {
+  _convenience::for_each_helper(t, f,
+                                std::make_index_sequence<sizeof...(Ts)>{});
+}
+template<typename... Ts, typename F>
+inline void for_each(const std::tuple<Ts...>& t, F& f) {
+  _convenience::for_each_helper(t, f,
+                                std::make_index_sequence<sizeof...(Ts)>{});
+}
+template<typename... Ts, typename F>
+inline void for_each(std::tuple<Ts...>& t, F& f) {
+  _convenience::for_each_helper(t, f,
+                                std::make_index_sequence<sizeof...(Ts)>{});
+}
+}
+#endif // CEPH_COMMON_CONVENIENCE_H
diff --git a/src/common/crc32c.cc b/src/common/crc32c.cc
new file mode 100644
index 00000000..e4a77ae9
--- /dev/null
+++ b/src/common/crc32c.cc
@@ -0,0 +1,240 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/crc32c.h"
+#include "arch/probe.h"
+#include "arch/intel.h"
+#include "arch/arm.h"
+#include "arch/ppc.h"
+#include "common/sctp_crc32.h"
+#include "common/crc32c_intel_fast.h"
+#include "common/crc32c_aarch64.h"
+#include "common/crc32c_ppc.h"
+
+/*
+ * Pick the CRC32C implementation to use on this CPU: an architecture
+ * specific one when it is both compiled in and supported by the
+ * processor, and the portable sctp one otherwise.
+ */
+ceph_crc32c_func_t ceph_choose_crc32(void)
+{
+  // the feature flags tested below are only meaningful once the cpu has been
+  // probed; this might depend on the link order of this file relative to
+  // arch/probe.cc.
+  ceph_arch_probe();
+
+  // start out with the portable fallback and upgrade if a faster
+  // implementation is available for this build and this CPU
+  ceph_crc32c_func_t chosen = ceph_crc32c_sctp;
+#if defined(__i386__) || defined(__x86_64__)
+  if (ceph_arch_intel_sse42 && ceph_crc32c_intel_fast_exists()) {
+    chosen = ceph_crc32c_intel_fast;
+  }
+#elif defined(__arm__) || defined(__aarch64__)
+# if defined(HAVE_ARMV8_CRC)
+  if (ceph_arch_aarch64_crc32){
+    chosen = ceph_crc32c_aarch64;
+  }
+# endif
+#elif defined(__powerpc__) || defined(__ppc__)
+  if (ceph_arch_ppc_crc32) {
+    chosen = ceph_crc32c_ppc;
+  }
+#endif
+  return chosen;
+}
+
+/*
+ * static global
+ *
+ * Process-wide pointer to the CRC32C implementation selected by
+ * ceph_choose_crc32().
+ *
+ * This is a bit of a no-no for shared libraries, but we don't care.
+ * It is effectively constant for the executing process as the value
+ * depends on the CPU architecture.
+ *
+ * We initialize it during program init using the magic of C++.
+ *
+ * NOTE(review): the initializer is a function call, so this is set
+ * during dynamic initialization; confirm that nothing in another
+ * translation unit uses it from its own static initializers.
+ */
+ceph_crc32c_func_t ceph_crc32c_func = ceph_choose_crc32();
+
+
+/*
+ * See: http://crcutil.googlecode.com/files/crc-doc.1.0.pdf
+ * This implementation goes one logical step further: it splits the CRC
+ * calculation into jumps of length 1, 2, 4, 8, ....
+ * Each jump is performed on every single input bit separately, and the
+ * results are xor-ed together afterwards.
+ *
+ * This function is unused. It is here to show how crc_turbo_table was obtained.
+ */
+void create_turbo_table(uint32_t table[32][32])
+{
+  // row 0 comes straight from ceph_crc32c_sctp(), called with a single-bit
+  // seed, a null buffer and a length of 1
+  for (int bit = 0; bit < 32; ++bit) {
+    table[0][bit] = ceph_crc32c_sctp(1UL << bit, nullptr, 1);
+  }
+  // every further row applies the previous row to itself: for each entry,
+  // xor together the previous-row entries selected by that entry's set bits
+  for (int row = 1; row < 32; ++row) {
+    const uint32_t *prev = table[row - 1];
+    for (int bit = 0; bit < 32; ++bit) {
+      const uint32_t selector = prev[bit];
+      uint32_t folded = 0;
+      for (int b = 0; b < 32; ++b) {
+        if ((selector & (1UL << b)) != 0) {
+          folded ^= prev[b];
+        }
+      }
+      table[row][bit] = folded;
+    }
+  }
+}
+
+static uint32_t crc_turbo_table[32][32] =
+{
+    {0xf26b8303, 0xe13b70f7, 0xc79a971f, 0x8ad958cf, 0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78,
+     0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+     0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000, 0x00008000,
+     0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000},
+    {0x13a29877, 0x274530ee, 0x4e8a61dc, 0x9d14c3b8, 0x3fc5f181, 0x7f8be302, 0xff17c604, 0xfbc3faf9,
+     0xf26b8303, 0xe13b70f7, 0xc79a971f, 0x8ad958cf, 0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78,
+     0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+     0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000, 0x00008000},
+    {0xdd45aab8, 0xbf672381, 0x7b2231f3, 0xf64463e6, 0xe964b13d, 0xd725148b, 0xaba65fe7, 0x52a0c93f,
+     0xa541927e, 0x4f6f520d, 0x9edea41a, 0x38513ec5, 0x70a27d8a, 0xe144fb14, 0xc76580d9, 0x8b277743,
+     0x13a29877, 0x274530ee, 0x4e8a61dc, 0x9d14c3b8, 0x3fc5f181, 0x7f8be302, 0xff17c604, 0xfbc3faf9,
+     0xf26b8303, 0xe13b70f7, 0xc79a971f, 0x8ad958cf, 0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78},
+    {0x493c7d27, 0x9278fa4e, 0x211d826d, 0x423b04da, 0x847609b4, 0x0d006599, 0x1a00cb32, 0x34019664,
+     0x68032cc8, 0xd0065990, 0xa5e0c5d1, 0x4e2dfd53, 0x9c5bfaa6, 0x3d5b83bd, 0x7ab7077a, 0xf56e0ef4,
+     0xef306b19, 0xdb8ca0c3, 0xb2f53777, 0x6006181f, 0xc00c303e, 0x85f4168d, 0x0e045beb, 0x1c08b7d6,
+     0x38116fac, 0x7022df58, 0xe045beb0, 0xc5670b91, 0x8f2261d3, 0x1ba8b557, 0x37516aae, 0x6ea2d55c},
+    {0xf20c0dfe, 0xe1f46d0d, 0xc604aceb, 0x89e52f27, 0x162628bf, 0x2c4c517e, 0x5898a2fc, 0xb13145f8,
+     0x678efd01, 0xcf1dfa02, 0x9bd782f5, 0x3243731b, 0x6486e636, 0xc90dcc6c, 0x97f7ee29, 0x2a03aaa3,
+     0x54075546, 0xa80eaa8c, 0x55f123e9, 0xabe247d2, 0x5228f955, 0xa451f2aa, 0x4d4f93a5, 0x9a9f274a,
+     0x30d23865, 0x61a470ca, 0xc348e194, 0x837db5d9, 0x03171d43, 0x062e3a86, 0x0c5c750c, 0x18b8ea18},
+    {0x3da6d0cb, 0x7b4da196, 0xf69b432c, 0xe8daf0a9, 0xd45997a3, 0xad5f59b7, 0x5f52c59f, 0xbea58b3e,
+     0x78a7608d, 0xf14ec11a, 0xe771f4c5, 0xcb0f9f7b, 0x93f34807, 0x220ae6ff, 0x4415cdfe, 0x882b9bfc,
+     0x15bb4109, 0x2b768212, 0x56ed0424, 0xadda0848, 0x5e586661, 0xbcb0ccc2, 0x7c8def75, 0xf91bdeea,
+     0xf7dbcb25, 0xea5be0bb, 0xd15bb787, 0xa75b19ff, 0x4b5a450f, 0x96b48a1e, 0x288562cd, 0x510ac59a},
+    {0x740eef02, 0xe81dde04, 0xd5d7caf9, 0xae43e303, 0x596bb0f7, 0xb2d761ee, 0x6042b52d, 0xc0856a5a,
+     0x84e6a245, 0x0c21327b, 0x184264f6, 0x3084c9ec, 0x610993d8, 0xc21327b0, 0x81ca3991, 0x067805d3,
+     0x0cf00ba6, 0x19e0174c, 0x33c02e98, 0x67805d30, 0xcf00ba60, 0x9bed0231, 0x32367293, 0x646ce526,
+     0xc8d9ca4c, 0x945fe269, 0x2d53b223, 0x5aa76446, 0xb54ec88c, 0x6f71e7e9, 0xdee3cfd2, 0xb82be955},
+    {0x6992cea2, 0xd3259d44, 0xa3a74c79, 0x42a2ee03, 0x8545dc06, 0x0f67cefd, 0x1ecf9dfa, 0x3d9f3bf4,
+     0x7b3e77e8, 0xf67cefd0, 0xe915a951, 0xd7c72453, 0xaa623e57, 0x51280a5f, 0xa25014be, 0x414c5f8d,
+     0x8298bf1a, 0x00dd08c5, 0x01ba118a, 0x03742314, 0x06e84628, 0x0dd08c50, 0x1ba118a0, 0x37423140,
+     0x6e846280, 0xdd08c500, 0xbffdfcf1, 0x7a178f13, 0xf42f1e26, 0xedb24abd, 0xde88e38b, 0xb8fdb1e7},
+    {0xdcb17aa4, 0xbc8e83b9, 0x7cf17183, 0xf9e2e306, 0xf629b0fd, 0xe9bf170b, 0xd69258e7, 0xa8c8c73f,
+     0x547df88f, 0xa8fbf11e, 0x541b94cd, 0xa837299a, 0x558225c5, 0xab044b8a, 0x53e4e1e5, 0xa7c9c3ca,
+     0x4a7ff165, 0x94ffe2ca, 0x2c13b365, 0x582766ca, 0xb04ecd94, 0x6571edd9, 0xcae3dbb2, 0x902bc195,
+     0x25bbf5db, 0x4b77ebb6, 0x96efd76c, 0x2833d829, 0x5067b052, 0xa0cf60a4, 0x4472b7b9, 0x88e56f72},
+    {0xbd6f81f8, 0x7f337501, 0xfe66ea02, 0xf921a2f5, 0xf7af331b, 0xeab210c7, 0xd088577f, 0xa4fcd80f,
+     0x4c15c6ef, 0x982b8dde, 0x35bb6d4d, 0x6b76da9a, 0xd6edb534, 0xa8371c99, 0x55824fc3, 0xab049f86,
+     0x53e549fd, 0xa7ca93fa, 0x4a795105, 0x94f2a20a, 0x2c0932e5, 0x581265ca, 0xb024cb94, 0x65a5e1d9,
+     0xcb4bc3b2, 0x937bf195, 0x231b95db, 0x46372bb6, 0x8c6e576c, 0x1d30d829, 0x3a61b052, 0x74c360a4},
+    {0xfe314258, 0xf98ef241, 0xf6f19273, 0xe80f5217, 0xd5f2d2df, 0xae09d34f, 0x59ffd06f, 0xb3ffa0de,
+     0x6213374d, 0xc4266e9a, 0x8da0abc5, 0x1ead217b, 0x3d5a42f6, 0x7ab485ec, 0xf5690bd8, 0xef3e6141,
+     0xdb90b473, 0xb2cd1e17, 0x60764adf, 0xc0ec95be, 0x84355d8d, 0x0d86cdeb, 0x1b0d9bd6, 0x361b37ac,
+     0x6c366f58, 0xd86cdeb0, 0xb535cb91, 0x6f87e1d3, 0xdf0fc3a6, 0xbbf3f1bd, 0x720b958b, 0xe4172b16},
+    {0xf7506984, 0xeb4ca5f9, 0xd3753d03, 0xa3060cf7, 0x43e06f1f, 0x87c0de3e, 0x0a6dca8d, 0x14db951a,
+     0x29b72a34, 0x536e5468, 0xa6dca8d0, 0x48552751, 0x90aa4ea2, 0x24b8ebb5, 0x4971d76a, 0x92e3aed4,
+     0x202b2b59, 0x405656b2, 0x80acad64, 0x04b52c39, 0x096a5872, 0x12d4b0e4, 0x25a961c8, 0x4b52c390,
+     0x96a58720, 0x28a778b1, 0x514ef162, 0xa29de2c4, 0x40d7b379, 0x81af66f2, 0x06b2bb15, 0x0d65762a},
+    {0xc2a5b65e, 0x80a71a4d, 0x04a2426b, 0x094484d6, 0x128909ac, 0x25121358, 0x4a2426b0, 0x94484d60,
+     0x2d7cec31, 0x5af9d862, 0xb5f3b0c4, 0x6e0b1779, 0xdc162ef2, 0xbdc02b15, 0x7e6c20db, 0xfcd841b6,
+     0xfc5cf59d, 0xfd559dcb, 0xff474d67, 0xfb62ec3f, 0xf329ae8f, 0xe3bf2bef, 0xc292212f, 0x80c834af,
+     0x047c1faf, 0x08f83f5e, 0x11f07ebc, 0x23e0fd78, 0x47c1faf0, 0x8f83f5e0, 0x1aeb9d31, 0x35d73a62},
+    {0xe040e0ac, 0xc56db7a9, 0x8f3719a3, 0x1b8245b7, 0x37048b6e, 0x6e0916dc, 0xdc122db8, 0xbdc82d81,
+     0x7e7c2df3, 0xfcf85be6, 0xfc1cc13d, 0xfdd5f48b, 0xfe479fe7, 0xf963493f, 0xf72ae48f, 0xebb9bfef,
+     0xd29f092f, 0xa0d264af, 0x4448bfaf, 0x88917f5e, 0x14ce884d, 0x299d109a, 0x533a2134, 0xa6744268,
+     0x4904f221, 0x9209e442, 0x21ffbe75, 0x43ff7cea, 0x87fef9d4, 0x0a118559, 0x14230ab2, 0x28461564},
+    {0xc7cacead, 0x8a79ebab, 0x111fa1a7, 0x223f434e, 0x447e869c, 0x88fd0d38, 0x14166c81, 0x282cd902,
+     0x5059b204, 0xa0b36408, 0x448abee1, 0x89157dc2, 0x17c68d75, 0x2f8d1aea, 0x5f1a35d4, 0xbe346ba8,
+     0x7984a1a1, 0xf3094342, 0xe3fef075, 0xc211961b, 0x81cf5ac7, 0x0672c37f, 0x0ce586fe, 0x19cb0dfc,
+     0x33961bf8, 0x672c37f0, 0xce586fe0, 0x995ca931, 0x37552493, 0x6eaa4926, 0xdd54924c, 0xbf455269},
+    {0x04fcdcbf, 0x09f9b97e, 0x13f372fc, 0x27e6e5f8, 0x4fcdcbf0, 0x9f9b97e0, 0x3adb5931, 0x75b6b262,
+     0xeb6d64c4, 0xd336bf79, 0xa3810803, 0x42ee66f7, 0x85dccdee, 0x0e55ed2d, 0x1cabda5a, 0x3957b4b4,
+     0x72af6968, 0xe55ed2d0, 0xcf51d351, 0x9b4fd053, 0x3373d657, 0x66e7acae, 0xcdcf595c, 0x9e72c449,
+     0x3909fe63, 0x7213fcc6, 0xe427f98c, 0xcda385e9, 0x9eab7d23, 0x38ba8cb7, 0x7175196e, 0xe2ea32dc},
+    {0x6bafcc21, 0xd75f9842, 0xab534675, 0x534afa1b, 0xa695f436, 0x48c79e9d, 0x918f3d3a, 0x26f20c85,
+     0x4de4190a, 0x9bc83214, 0x327c12d9, 0x64f825b2, 0xc9f04b64, 0x960ce039, 0x29f5b683, 0x53eb6d06,
+     0xa7d6da0c, 0x4a41c2e9, 0x948385d2, 0x2ceb7d55, 0x59d6faaa, 0xb3adf554, 0x62b79c59, 0xc56f38b2,
+     0x8f320795, 0x1b8879db, 0x3710f3b6, 0x6e21e76c, 0xdc43ced8, 0xbd6beb41, 0x7f3ba073, 0xfe7740e6},
+    {0x140441c6, 0x2808838c, 0x50110718, 0xa0220e30, 0x45a86a91, 0x8b50d522, 0x134ddcb5, 0x269bb96a,
+     0x4d3772d4, 0x9a6ee5a8, 0x3131bda1, 0x62637b42, 0xc4c6f684, 0x8c619bf9, 0x1d2f4103, 0x3a5e8206,
+     0x74bd040c, 0xe97a0818, 0xd71866c1, 0xabdcbb73, 0x52550017, 0xa4aa002e, 0x4cb876ad, 0x9970ed5a,
+     0x370dac45, 0x6e1b588a, 0xdc36b114, 0xbd8114d9, 0x7eee5f43, 0xfddcbe86, 0xfe550bfd, 0xf946610b},
+    {0x68175a0a, 0xd02eb414, 0xa5b11ed9, 0x4e8e4b43, 0x9d1c9686, 0x3fd55bfd, 0x7faab7fa, 0xff556ff4,
+     0xfb46a919, 0xf36124c3, 0xe32e3f77, 0xc3b0081f, 0x828c66cf, 0x00f4bb6f, 0x01e976de, 0x03d2edbc,
+     0x07a5db78, 0x0f4bb6f0, 0x1e976de0, 0x3d2edbc0, 0x7a5db780, 0xf4bb6f00, 0xec9aa8f1, 0xdcd92713,
+     0xbc5e38d7, 0x7d50075f, 0xfaa00ebe, 0xf0ac6b8d, 0xe4b4a1eb, 0xcc853527, 0x9ce61cbf, 0x3c204f8f},
+    {0xe1ff3667, 0xc6121a3f, 0x89c8428f, 0x167cf3ef, 0x2cf9e7de, 0x59f3cfbc, 0xb3e79f78, 0x62234801,
+     0xc4469002, 0x8d6156f5, 0x1f2edb1b, 0x3e5db636, 0x7cbb6c6c, 0xf976d8d8, 0xf701c741, 0xebeff873,
+     0xd2338617, 0xa18b7adf, 0x46fa834f, 0x8df5069e, 0x1e067bcd, 0x3c0cf79a, 0x7819ef34, 0xf033de68,
+     0xe58bca21, 0xcefbe2b3, 0x981bb397, 0x35db11df, 0x6bb623be, 0xd76c477c, 0xab34f809, 0x538586e3},
+    {0x8b7230ec, 0x13081729, 0x26102e52, 0x4c205ca4, 0x9840b948, 0x356d0461, 0x6ada08c2, 0xd5b41184,
+     0xae8455f9, 0x58e4dd03, 0xb1c9ba06, 0x667f02fd, 0xccfe05fa, 0x9c107d05, 0x3dcc8cfb, 0x7b9919f6,
+     0xf73233ec, 0xeb881129, 0xd2fc54a3, 0xa014dfb7, 0x45c5c99f, 0x8b8b933e, 0x12fb508d, 0x25f6a11a,
+     0x4bed4234, 0x97da8468, 0x2a597e21, 0x54b2fc42, 0xa965f884, 0x572787f9, 0xae4f0ff2, 0x59726915},
+    {0x56175f20, 0xac2ebe40, 0x5db10a71, 0xbb6214e2, 0x73285f35, 0xe650be6a, 0xc94d0a25, 0x977662bb,
+     0x2b00b387, 0x5601670e, 0xac02ce1c, 0x5de9eac9, 0xbbd3d592, 0x724bddd5, 0xe497bbaa, 0xccc301a5,
+     0x9c6a75bb, 0x3d389d87, 0x7a713b0e, 0xf4e2761c, 0xec289ac9, 0xddbd4363, 0xbe96f037, 0x78c1969f,
+     0xf1832d3e, 0xe6ea2c8d, 0xc8382feb, 0x959c2927, 0x2ed424bf, 0x5da8497e, 0xbb5092fc, 0x734d5309},
+    {0xb9a3dcd0, 0x76abcf51, 0xed579ea2, 0xdf434bb5, 0xbb6ae19b, 0x7339b5c7, 0xe6736b8e, 0xc90aa1ed,
+     0x97f9352b, 0x2a1e1ca7, 0x543c394e, 0xa878729c, 0x551c93c9, 0xaa392792, 0x519e39d5, 0xa33c73aa,
+     0x439491a5, 0x8729234a, 0x0bbe3065, 0x177c60ca, 0x2ef8c194, 0x5df18328, 0xbbe30650, 0x722a7a51,
+     0xe454f4a2, 0xcd459fb5, 0x9f67499b, 0x3b22e5c7, 0x7645cb8e, 0xec8b971c, 0xdcfb58c9, 0xbc1ac763},
+    {0xdd2d789e, 0xbfb687cd, 0x7a81796b, 0xf502f2d6, 0xefe9935d, 0xda3f504b, 0xb192d667, 0x66c9da3f,
+     0xcd93b47e, 0x9ecb1e0d, 0x387a4aeb, 0x70f495d6, 0xe1e92bac, 0xc63e21a9, 0x899035a3, 0x16cc1db7,
+     0x2d983b6e, 0x5b3076dc, 0xb660edb8, 0x692dad81, 0xd25b5b02, 0xa15ac0f5, 0x4759f71b, 0x8eb3ee36,
+     0x188baa9d, 0x3117553a, 0x622eaa74, 0xc45d54e8, 0x8d56df21, 0x1f41c8b3, 0x3e839166, 0x7d0722cc},
+    {0x44036c4a, 0x8806d894, 0x15e1c7d9, 0x2bc38fb2, 0x57871f64, 0xaf0e3ec8, 0x5bf00b61, 0xb7e016c2,
+     0x6a2c5b75, 0xd458b6ea, 0xad5d1b25, 0x5f5640bb, 0xbeac8176, 0x78b5741d, 0xf16ae83a, 0xe739a685,
+     0xcb9f3bfb, 0x92d20107, 0x204874ff, 0x4090e9fe, 0x8121d3fc, 0x07afd109, 0x0f5fa212, 0x1ebf4424,
+     0x3d7e8848, 0x7afd1090, 0xf5fa2120, 0xee1834b1, 0xd9dc1f93, 0xb65449d7, 0x6944e55f, 0xd289cabe},
+    {0x4612657d, 0x8c24cafa, 0x1da5e305, 0x3b4bc60a, 0x76978c14, 0xed2f1828, 0xdfb246a1, 0xba88fbb3,
+     0x70fd8197, 0xe1fb032e, 0xc61a70ad, 0x89d897ab, 0x165d59a7, 0x2cbab34e, 0x5975669c, 0xb2eacd38,
+     0x6039ec81, 0xc073d902, 0x850bc4f5, 0x0ffbff1b, 0x1ff7fe36, 0x3feffc6c, 0x7fdff8d8, 0xffbff1b0,
+     0xfa939591, 0xf0cb5dd3, 0xe47acd57, 0xcd19ec5f, 0x9fdfae4f, 0x3a532a6f, 0x74a654de, 0xe94ca9bc},
+    {0x584d5569, 0xb09aaad2, 0x64d92355, 0xc9b246aa, 0x9688fba5, 0x28fd81bb, 0x51fb0376, 0xa3f606ec,
+     0x42007b29, 0x8400f652, 0x0ded9a55, 0x1bdb34aa, 0x37b66954, 0x6f6cd2a8, 0xded9a550, 0xb85f3c51,
+     0x75520e53, 0xeaa41ca6, 0xd0a44fbd, 0xa4a4e98b, 0x4ca5a5e7, 0x994b4bce, 0x377ae16d, 0x6ef5c2da,
+     0xddeb85b4, 0xbe3b7d99, 0x799a8dc3, 0xf3351b86, 0xe38641fd, 0xc2e0f50b, 0x802d9ce7, 0x05b74f3f},
+    {0xe8cd33e2, 0xd4761135, 0xad00549b, 0x5fecdfc7, 0xbfd9bf8e, 0x7a5f09ed, 0xf4be13da, 0xec905145,
+     0xdcccd47b, 0xbc75de07, 0x7d07caff, 0xfa0f95fe, 0xf1f35d0d, 0xe60acceb, 0xc9f9ef27, 0x961fa8bf,
+     0x29d3278f, 0x53a64f1e, 0xa74c9e3c, 0x4b754a89, 0x96ea9512, 0x28395cd5, 0x5072b9aa, 0xa0e57354,
+     0x44269059, 0x884d20b2, 0x15763795, 0x2aec6f2a, 0x55d8de54, 0xabb1bca8, 0x528f0fa1, 0xa51e1f42},
+    {0x82f63b78, 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040,
+     0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000,
+     0x00008000, 0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000,
+     0x00800000, 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000},
+    {0x417b1dbc, 0x82f63b78, 0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
+     0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000,
+     0x00004000, 0x00008000, 0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000,
+     0x00400000, 0x00800000, 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000},
+    {0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78, 0x00000001, 0x00000002, 0x00000004, 0x00000008,
+     0x00000010, 0x00000020, 0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
+     0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000, 0x00040000, 0x00080000,
+     0x00100000, 0x00200000, 0x00400000, 0x00800000, 0x01000000, 0x02000000, 0x04000000, 0x08000000},
+    {0xf26b8303, 0xe13b70f7, 0xc79a971f, 0x8ad958cf, 0x105ec76f, 0x20bd8ede, 0x417b1dbc, 0x82f63b78,
+     0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+     0x00000100, 0x00000200, 0x00000400, 0x00000800, 0x00001000, 0x00002000, 0x00004000, 0x00008000,
+     0x00010000, 0x00020000, 0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000}
+};
+
+// Update crc for a run of len zero bytes (per the function name).
+// len is split into a multiple of 16 bytes, which is applied through
+// crc_turbo_table one set bit of (len >> 4) at a time, and a remainder
+// of at most 15 bytes, which is handed to ceph_crc32c with a null buffer.
+uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned len)
+{
+  const unsigned tail = len & 15;
+  int row = 4;  // table rows are consumed from index 4 upward, one per bit
+  for (unsigned blocks = len >> 4; blocks != 0; blocks >>= 1, ++row) {
+    if ((blocks & 1) == 0)
+      continue;
+    // XOR together the entries of this row selected by the set bits of
+    // crc, walking crc from its lowest bit upward.
+    uint32_t folded = 0;
+    for (uint32_t* entry = crc_turbo_table/*.val*/[row]; crc != 0; crc >>= 1, ++entry) {
+      // all ones when the low bit of crc is set, zero otherwise (branch-free)
+      const uint32_t select = 0u - (crc & 1);
+      folded ^= select & *entry;
+    }
+    crc = folded;
+  }
+  return tail > 0 ? ceph_crc32c(crc, nullptr, tail) : crc;
+}
diff --git a/src/common/crc32c_aarch64.c b/src/common/crc32c_aarch64.c
new file mode 100644
index 00000000..d15736a0
--- /dev/null
+++ b/src/common/crc32c_aarch64.c
@@ -0,0 +1,272 @@
+#include "acconfig.h"
+#include "include/int_types.h"
+#include "common/crc32c_aarch64.h"
+#include "arch/arm.h"
+
+/*
+ * Two interchangeable sets of helper macros follow: an inline-assembly
+ * flavour, used when HAVE_ARMV8_CRC_CRYPTO_INTRINSICS is not defined, and
+ * an ACLE-intrinsics flavour used otherwise.  Both define the same names.
+ */
+#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+/* Request crc extension capabilities from the assembler */
+asm(".arch_extension crc");
+
+#ifdef HAVE_ARMV8_CRYPTO
+/* Request crypto extension capabilities from the assembler */
+asm(".arch_extension crypto");
+#endif
+
+/*
+ * Fold `value` into `crc` in place with the crc32cx/cw/ch/cb instruction
+ * (64-, 32-, 16- and 8-bit operand respectively).  `crc` must be an lvalue.
+ */
+#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+
+/*
+ * One step of the three-stream CRC: update crc1, crc2 and crc0 (in that
+ * order) with the 64-bit words at word offsets 42*1+ITR, 42*2+ITR and
+ * 42*0+ITR from `buffer`.  crc0/crc1/crc2 and buffer are taken from the
+ * expansion site.
+ */
+#define CRC32C3X8(ITR) \
+	__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
+	__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
+	__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
+
+/* Update crc0 with one all-zero 64-bit word (xzr); crc1/crc2 are untouched */
+#define CRC32C3X8_ZERO \
+	__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
+
+#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+/* Same operations as the assembly flavour, expressed with ACLE intrinsics */
+#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
+#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
+#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
+#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
+
+/* Three-stream step: see the word offsets used for crc1, crc2 and crc0 below */
+#define CRC32C3X8(ITR) \
+	crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
+	crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
+	crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+
+/* Update crc0 with one all-zero 64-bit word */
+#define CRC32C3X8_ZERO \
+	crc0 = __crc32cd(crc0, (const uint64_t)0);
+
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+/*
+ * Seven consecutive CRC32C3X8 steps covering word indices ITR*7+0 .. ITR*7+6
+ * of each stream.  Invoked with ITR = 0..5 this walks all 42 words per stream.
+ */
+#define CRC32C7X3X8(ITR) do {\
+	CRC32C3X8((ITR)*7+0) \
+	CRC32C3X8((ITR)*7+1) \
+	CRC32C3X8((ITR)*7+2) \
+	CRC32C3X8((ITR)*7+3) \
+	CRC32C3X8((ITR)*7+4) \
+	CRC32C3X8((ITR)*7+5) \
+	CRC32C3X8((ITR)*7+6) \
+	} while(0)
+
+/* Seven consecutive CRC32C3X8_ZERO steps: crc0 absorbs 7 zero 64-bit words */
+#define CRC32C7X3X8_ZERO do {\
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	} while(0)
+
+/*
+ * Issue four PRFM PLDL1KEEP prefetches at byte offsets
+ * PREF_OFFSET + (ITR + 0..3) * 64 from `buffer` (taken from the expansion
+ * site).  The offset is passed through the "I" immediate constraint, so it
+ * has to be a compile-time constant.
+ */
+#define PREF4X64L1(PREF_OFFSET, ITR) \
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+/* Sixteen PLDL1KEEP prefetches, 64 bytes apart: 1024 bytes starting at buffer + PREF_OFFSET */
+#define PREF1KL1(PREF_OFFSET) \
+	PREF4X64L1((PREF_OFFSET), 0) \
+	PREF4X64L1((PREF_OFFSET), 4) \
+	PREF4X64L1((PREF_OFFSET), 8) \
+	PREF4X64L1((PREF_OFFSET), 12)
+
+/* Same as PREF4X64L1 but with the PLDL2KEEP prefetch operation */
+#define PREF4X64L2(PREF_OFFSET, ITR) \
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+/* Sixteen PLDL2KEEP prefetches, 64 bytes apart: 1024 bytes starting at buffer + PREF_OFFSET */
+#define PREF1KL2(PREF_OFFSET) \
+	PREF4X64L2((PREF_OFFSET), 0) \
+	PREF4X64L2((PREF_OFFSET), 4) \
+	PREF4X64L2((PREF_OFFSET), 8) \
+	PREF4X64L2((PREF_OFFSET), 12)
+
+
+/*
+ * Update a CRC32C value using the ARMv8 crc32c instructions.
+ *
+ * crc:    running CRC value to continue from
+ * buffer: bytes to fold in; when NULL, len zero bytes are folded in instead
+ * len:    number of bytes
+ *
+ * Returns the updated CRC value.
+ *
+ * When built with HAVE_ARMV8_CRYPTO and ceph_arch_aarch64_pmull is set, the
+ * input is first consumed in 1024-byte blocks: 8 leading bytes, three
+ * interleaved streams of 42 64-bit words each (crc0/crc1/crc2), and 8
+ * trailing bytes, with the partial CRCs merged through PMULL.  Whatever is
+ * left is handled 8 bytes at a time and then as a 4/2/1-byte tail.
+ */
+uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	/* Signed so the "subtract, then test >= 0" loops below may go negative */
+	int64_t length = len;
+	uint32_t crc0, crc1, crc2;
+
+	if (buffer) {
+#ifdef HAVE_ARMV8_CRYPTO
+	        if (ceph_arch_aarch64_pmull) {
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+		/* Calculate reflected crc with PMULL Instruction */
+		const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+		uint64_t t0, t1;
+
+		/* crc done "by 3" for fixed input block size of 1024 bytes */
+		while ((length -= 1024) >= 0) {
+			/* Prefetch data for following block to avoid cache miss */
+			PREF1KL2(1024*3);
+			/* Do first 8 bytes here for better pipelining */
+			crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
+			crc1 = 0;
+			crc2 = 0;
+			buffer += sizeof(uint64_t);
+
+			/* Process block inline
+			Process crc0 last to avoid dependency with above */
+			CRC32C7X3X8(0);
+			CRC32C7X3X8(1);
+			CRC32C7X3X8(2);
+			CRC32C7X3X8(3);
+			CRC32C7X3X8(4);
+			CRC32C7X3X8(5);
+
+			buffer += 42*3*sizeof(uint64_t);
+			/* Prefetch data for following block to avoid cache miss */
+			PREF1KL1(1024);
+
+			/* Merge crc0 and crc1 into crc2
+			   crc1 multiply by K2
+			   crc0 multiply by K1 */
+
+			t1 = (uint64_t)vmull_p64(crc1, k2);
+			t0 = (uint64_t)vmull_p64(crc0, k1);
+			/* The last 8 bytes of the block are folded into crc2 here */
+			crc = __crc32cd(crc2, *(const uint64_t *)buffer);
+			crc1 = __crc32cd(0, t1);
+			crc ^= crc1;
+			crc0 = __crc32cd(0, t0);
+			crc ^= crc0;
+
+			buffer += sizeof(uint64_t);
+		}
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+		/*
+		 * Load the merge constants: v1 = 0xe417f38a (K1), v0 = 0x8f158014 (K2).
+		 * v0/v1 are listed as clobbered because this statement writes them.
+		 * NOTE(review): the merge asm inside the loop reads v0/v1 from a
+		 * separate asm statement; nothing tells the compiler to keep them
+		 * intact in between.  Confirm, or pass the constants as operands.
+		 */
+		__asm__("mov    x16,            #0xf38a         \n\t"
+			"movk   x16,            #0xe417, lsl 16 \n\t"
+			"mov    v1.2d[0],       x16             \n\t"
+			"mov    x16,            #0x8014         \n\t"
+			"movk   x16,            #0x8f15, lsl 16 \n\t"
+			"mov    v0.2d[0],       x16             \n\t"
+			:::"x16", "v0", "v1");
+
+		while ((length -= 1024) >= 0) {
+			PREF1KL2(1024*3);
+			/* First 8 bytes of the block seed crc0 from the running crc */
+			__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
+				:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
+			crc1 = 0;
+			crc2 = 0;
+			buffer += sizeof(uint64_t);
+
+			CRC32C7X3X8(0);
+			CRC32C7X3X8(1);
+			CRC32C7X3X8(2);
+			CRC32C7X3X8(3);
+			CRC32C7X3X8(4);
+			CRC32C7X3X8(5);
+
+			buffer += 42*3*sizeof(uint64_t);
+			PREF1KL1(1024);
+			/*
+			 * Merge: crc1 * K2 (v0) and crc0 * K1 (v1) via pmull, fold the
+			 * last 8 bytes of the block into crc2, then XOR everything
+			 * into crc.  v2/v3 are scratch and declared as clobbers.
+			 */
+			__asm__("mov            v2.2d[0],       %x[c1]          \n\t"
+				"pmull          v2.1q,          v2.1d,  v0.1d   \n\t"
+				"mov            v3.2d[0],       %x[c0]          \n\t"
+				"pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
+				"crc32cx        %w[c],          %w[c2], %x[v]   \n\t"
+				"mov            %x[c1],         v2.2d[0]        \n\t"
+				"crc32cx        %w[c1],         wzr,    %x[c1]  \n\t"
+				"eor            %w[c],          %w[c],  %w[c1]  \n\t"
+				"mov            %x[c0],         v3.2d[0]        \n\t"
+				"crc32cx        %w[c0],         wzr,    %x[c0]  \n\t"
+				"eor            %w[c],          %w[c],  %w[c0]  \n\t"
+				:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
+				:[v]"r"(*((const uint64_t *)buffer))
+				:"v2", "v3");
+			buffer += sizeof(uint64_t);
+		}
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+		/* Undo the final failed subtraction; nothing left means we are done */
+		if(!(length += 1024))
+			return crc;
+	        }
+#endif /* HAVE_ARMV8_CRYPTO */
+		while ((length -= sizeof(uint64_t)) >= 0) {
+			CRC32CX(crc, *(const uint64_t *)buffer);
+			buffer += sizeof(uint64_t);
+		}
+
+		/* The following is more efficient than the straight loop */
+		/* length is now in [-8, -1]; its low three bits equal the bytes left */
+		if (length & sizeof(uint32_t)) {
+			CRC32CW(crc, *(const uint32_t *)buffer);
+			buffer += sizeof(uint32_t);
+		}
+		if (length & sizeof(uint16_t)) {
+			CRC32CH(crc, *(const uint16_t *)buffer);
+			buffer += sizeof(uint16_t);
+		}
+		if (length & sizeof(uint8_t))
+			CRC32CB(crc, *buffer);
+	} else {
+		/* NULL buffer: same structure as above, but every input word is zero */
+#ifdef HAVE_ARMV8_CRYPTO
+	        if (ceph_arch_aarch64_pmull) {
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+		const poly64_t k1 = 0xe417f38a;
+		uint64_t t0;
+
+		while ((length -= 1024) >= 0) {
+			crc0 = __crc32cd(crc, 0);
+
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+
+			/* Merge crc0 into crc: crc0 multiply by K1 */
+
+			t0 = (uint64_t)vmull_p64(crc0, k1);
+			crc = __crc32cd(0, t0);
+		}
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+		/*
+		 * Load K1 (0xe417f38a) into v1; v1 is listed as clobbered because
+		 * this statement writes it.
+		 * NOTE(review): as in the data path, the loop below reads v1 from a
+		 * separate asm statement without any guarantee that it is preserved.
+		 */
+		__asm__("mov    x16,            #0xf38a         \n\t"
+			"movk   x16,            #0xe417, lsl 16 \n\t"
+			"mov    v1.2d[0],       x16             \n\t"
+			:::"x16", "v1");
+
+		while ((length -= 1024) >= 0) {
+			__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
+				:[c0]"=r"(crc0):[c]"r"(crc));
+
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+
+			/*
+			 * crc0 is both read and overwritten by this sequence, so it
+			 * is a read-write ("+r") operand; v3 is scratch.
+			 */
+			__asm__("mov            v3.2d[0],       %x[c0]          \n\t"
+				"pmull          v3.1q,          v3.1d,  v1.1d   \n\t"
+				"mov            %x[c0],         v3.2d[0]        \n\t"
+				"crc32cx        %w[c],          wzr,    %x[c0]  \n\t"
+				:[c]"=r"(crc), [c0]"+r"(crc0)
+				:
+				:"v3");
+		}
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+		if(!(length += 1024))
+			return crc;
+	        }
+#endif /* HAVE_ARMV8_CRYPTO */
+		while ((length -= sizeof(uint64_t)) >= 0)
+			CRC32CX(crc, 0);
+
+		/* The following is more efficient than the straight loop */
+		if (length & sizeof(uint32_t))
+			CRC32CW(crc, 0);
+
+		if (length & sizeof(uint16_t))
+			CRC32CH(crc, 0);
+
+		if (length & sizeof(uint8_t))
+			CRC32CB(crc, 0);
+	}
+	return crc;
+}
diff --git a/src/common/crc32c_aarch64.h b/src/common/crc32c_aarch64.h
new file mode 100644
index 00000000..51f0542f
--- /dev/null
+++ b/src/common/crc32c_aarch64.h
@@ -0,0 +1,28 @@
+#ifndef CEPH_COMMON_CRC32C_AARCH64_H
+#define CEPH_COMMON_CRC32C_AARCH64_H
+
+#include "acconfig.h"
+#include "arch/arm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_ARMV8_CRC
+
+/* ARMv8 crc32c-instruction implementation; declared only when HAVE_ARMV8_CRC is defined */
+extern uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#else
+
+/*
+ * Placeholder used when HAVE_ARMV8_CRC is not defined: ignores its
+ * arguments and always returns 0, i.e. it does not compute a CRC.
+ */
+static inline uint32_t ceph_crc32c_aarch64(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	return 0;
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_intel_baseline.c b/src/common/crc32c_intel_baseline.c
new file mode 100644
index 00000000..2862f627
--- /dev/null
+++ b/src/common/crc32c_intel_baseline.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2012-2013 Intel Corporation All Rights Reserved.
+ * All rights reserved.
+ *
+ * http://opensource.org/licenses/BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * * Neither the name of the Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "include/int_types.h"
+
+#define MAX_ITER	8
+
+/*
+ * 256-entry lookup table indexed by a single byte value; consumed by
+ * ceph_crc32c_intel_baseline() below, one lookup per input byte.
+ * NOTE(review): every entry fits in 32 bits although the element type is
+ * unsigned long.
+ */
+unsigned long crc32_table_iscsi_base[256] = {
+	0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 
+	0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, 
+	0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 
+	0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, 
+	0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 
+	0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, 
+	0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 
+	0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, 
+	0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 
+	0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, 
+	0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 
+	0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, 
+	0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 
+	0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, 
+	0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 
+	0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, 
+	0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 
+	0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, 
+	0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 
+	0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, 
+	0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 
+	0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, 
+	0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 
+	0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, 
+	0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 
+	0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, 
+	0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 
+	0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, 
+	0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 
+	0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, 
+	0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 
+	0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, 
+	0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 
+	0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, 
+	0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 
+	0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, 
+	0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 
+	0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, 
+	0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 
+	0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, 
+	0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 
+	0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, 
+	0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 
+	0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, 
+	0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 
+	0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, 
+	0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 
+	0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, 
+	0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 
+	0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, 
+	0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 
+	0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, 
+	0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 
+	0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, 
+	0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 
+	0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, 
+	0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 
+	0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, 
+	0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 
+	0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, 
+	0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 
+	0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, 
+	0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 
+	0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351, 
+};
+
+
+// iSCSI CRC baseline function
+/*
+ * Table-driven iSCSI CRC update, one byte per step.
+ *
+ * crc_init2: CRC value to continue from
+ * buffer:    input bytes; when NULL the input is treated as len zero bytes
+ * len:       number of bytes to fold in
+ *
+ * Returns the updated CRC value.
+ */
+uint32_t ceph_crc32c_intel_baseline(uint32_t crc_init2, unsigned char const *buffer, unsigned len)
+{
+	unsigned int acc = crc_init2;
+	unsigned char const *cur = buffer;
+	unsigned char const *stop;
+
+	if (!buffer) {
+		/* No data pointer: each step folds in a zero byte, so the
+		 * table index is simply the low byte of the accumulator. */
+		for (; len != 0; --len)
+			acc = (acc >> 8) ^ crc32_table_iscsi_base[(acc & 0x000000FF)];
+		return acc;
+	}
+
+	for (stop = buffer + len; cur < stop; ++cur)
+		acc = (acc >> 8) ^ crc32_table_iscsi_base[(acc & 0x000000FF) ^ *cur];
+	return acc;
+}
diff --git a/src/common/crc32c_intel_baseline.h b/src/common/crc32c_intel_baseline.h
new file mode 100644
index 00000000..e463575e
--- /dev/null
+++ b/src/common/crc32c_intel_baseline.h
@@ -0,0 +1,16 @@
+#ifndef CEPH_COMMON_CRC32C_INTEL_BASELINE_H
+#define CEPH_COMMON_CRC32C_INTEL_BASELINE_H
+
+#include "include/int_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Table-driven CRC update over len bytes of buffer, starting from crc.
+ * A NULL buffer is processed as len zero bytes.  Returns the updated value.
+ */
+extern uint32_t ceph_crc32c_intel_baseline(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_intel_fast.c b/src/common/crc32c_intel_fast.c
new file mode 100644
index 00000000..188ff95f
--- /dev/null
+++ b/src/common/crc32c_intel_fast.c
@@ -0,0 +1,51 @@
+#include "acconfig.h"
+#include "common/crc32c_intel_baseline.h"
+
+extern unsigned int crc32_iscsi_00(unsigned char const *buffer, uint64_t len, uint64_t crc) asm("crc32_iscsi_00");
+extern unsigned int crc32_iscsi_zero_00(unsigned char const *buffer, uint64_t len, uint64_t crc) asm("crc32_iscsi_zero_00");
+
+#ifdef HAVE_GOOD_YASM_ELF64
+
+/*
+ * CRC update backed by the crc32_iscsi_00 / crc32_iscsi_zero_00 assembly
+ * routines.
+ *
+ * crc:    CRC value to continue from
+ * buffer: input bytes; NULL is forwarded to the zero-input routine
+ * len:    number of bytes
+ *
+ * Returns the updated CRC value.
+ */
+uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	unsigned tail_bytes;
+	unsigned bulk_bytes;
+	uint32_t result;
+
+	if (!buffer)
+		return crc32_iscsi_zero_00(buffer, len, crc);
+
+	/* Short inputs go entirely through the table-driven baseline. */
+	if (len < 16)
+		return ceph_crc32c_intel_baseline(crc, buffer, len);
+
+	/*
+	 * crc32_iscsi_00 reads whole words and would run past buffer+len,
+	 * which upsets valgrind.  Stop the assembly pass where the end
+	 * address is a multiple of 8 and let the baseline finish the rest.
+	 */
+	tail_bytes = ((unsigned long)buffer + len) & 7;
+	bulk_bytes = len - tail_bytes;
+	result = crc32_iscsi_00(buffer, bulk_bytes, crc);
+	return tail_bytes
+		? ceph_crc32c_intel_baseline(result, buffer + bulk_bytes, tail_bytes)
+		: result;
+}
+
+/* Returns 1: this build (HAVE_GOOD_YASM_ELF64 defined) includes the assembly-backed implementation */
+int ceph_crc32c_intel_fast_exists(void)
+{
+	return 1;
+}
+
+#else
+
+/* Returns 0: built without HAVE_GOOD_YASM_ELF64, so only the placeholder below is available */
+int ceph_crc32c_intel_fast_exists(void)
+{
+	return 0;
+}
+
+/*
+ * Placeholder for builds without HAVE_GOOD_YASM_ELF64: ignores its
+ * arguments and always returns 0, i.e. it does not compute a CRC.
+ * Presumably callers consult ceph_crc32c_intel_fast_exists() first -- verify.
+ */
+uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	return 0;
+}
+
+#endif
diff --git a/src/common/crc32c_intel_fast.h b/src/common/crc32c_intel_fast.h
new file mode 100644
index 00000000..26a444f6
--- /dev/null
+++ b/src/common/crc32c_intel_fast.h
@@ -0,0 +1,28 @@
+#ifndef CEPH_COMMON_CRC32C_INTEL_FAST_H
+#define CEPH_COMMON_CRC32C_INTEL_FAST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* is the fast version compiled in */
+extern int ceph_crc32c_intel_fast_exists(void);
+
+#ifdef __x86_64__
+
+/* CRC update over len bytes of buffer starting from crc; declared only on x86_64 */
+extern uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len);
+
+#else
+
+/*
+ * Placeholder for targets other than x86_64: ignores its arguments and
+ * always returns 0, i.e. it does not compute a CRC.
+ */
+static inline uint32_t ceph_crc32c_intel_fast(uint32_t crc, unsigned char const *buffer, unsigned len)
+{
+	return 0;
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/crc32c_intel_fast_asm.s b/src/common/crc32c_intel_fast_asm.s
new file mode 100644
index 00000000..c9fc1b9d
--- /dev/null
+++ b/src/common/crc32c_intel_fast_asm.s
@@ -0,0 +1,674 @@
+;
+; Copyright 2012-2013 Intel Corporation All Rights Reserved.
+; All rights reserved.
+;
+; http://opensource.org/licenses/BSD-3-Clause
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following
+; conditions are met:
+;
+; * Redistributions of source code must retain the above copyright
+;   notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+;   notice, this list of conditions and the following disclaimer in
+;   the documentation and/or other materials provided with the
+;   distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+;   contributors may be used to endorse or promote products derived
+;   from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+; FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+; COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+; STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+; OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+
+; Function to compute iscsi CRC32 with table-based recombination
+; crc done "by 3" with block sizes 1920, 960, 480, 240
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; crcB3 MACRO to implement crc32 on 3 %%bSize-byte blocks
+; Arguments:
+;   %1  bSize - bytes per sub-block; one pass consumes three sub-blocks
+;   %2  td2   - byte offset, relative to crc_init, of the table used for crc0
+;   %3  td1   - byte offset, relative to crc_init, of the table used for crc1
+; Registers: rax = crc0 (and the running crc), rbx = crc1, r10 = crc2,
+; r9/r11 = scratch; bufp is reused as the table index.  bufptmp, len and
+; crc_init are the aliases defined by the routine that expands the macro.
+; Only the bSize=640 expansion loops; the other sizes run at most once.
+; Jumps to do_return when no input remains afterwards.
+%macro  crcB3 3
+%define %%bSize   %1    ; 1/3 of buffer size
+%define %%td2     %2    ; table offset for crc0 (2/3 of buffer)
+%define %%td1     %3    ; table offset for crc1 (1/3 of buffer)
+
+%IF %%bSize=640
+	;; looping variant: len is kept pre-decremented by one pass while looping
+	sub     len, %%bSize*3
+	js      %%crcB3_end           ;; jump to next level if 3*blockSize > len
+%ELSE
+	cmp     len, %%bSize*3
+	jnae    %%crcB3_end           ;; jump to next level if 3*blockSize > len
+%ENDIF
+	;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;;
+%%crcB3_loop:
+					;; rax = crc0 = initial crc
+	xor     rbx, rbx                ;; rbx = crc1 = 0;
+	xor     r10, r10                ;; r10 = crc2 = 0;
+
+	;; three interleaved crc32 chains, one quadword per chain per step
+ %assign i 0
+ %rep %%bSize/8 - 1
+	crc32   rax, [bufptmp+i + 0*%%bSize]  ;; update crc0
+	crc32   rbx, [bufptmp+i + 1*%%bSize]  ;; update crc1
+	crc32   r10, [bufptmp+i + 2*%%bSize]  ;; update crc2
+	%assign i (i+8)
+ %endrep
+	crc32   rax, [bufptmp+i + 0*%%bSize]  ;; update crc0
+	crc32   rbx, [bufptmp+i + 1*%%bSize]  ;; update crc1
+; SKIP  ;crc32  r10, [bufptmp+i + 2*%%bSize]  ;; update crc2
+	;; the last quadword of the third sub-block is folded in below, after
+	;; being XORed with the table-derived contributions of crc0 and crc1
+
+	; merge in crc0
+	;; one table lookup per byte of eax; results are shifted by 0/8/16/24
+	;; bits and accumulated in r9
+	movzx   bufp_dw, al
+	mov     r9d, [crc_init + bufp*4 + %%td2]
+	movzx   bufp_dw, ah
+	shr     eax, 16
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	shl     r11, 8
+	xor     r9, r11
+
+	movzx   bufp_dw, al
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	movzx   bufp_dw, ah
+	shl     r11, 16
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	shl     r11, 24
+	xor     r9, r11
+
+	; merge in crc1
+	;; same byte-wise lookup scheme for ebx, using the td1 table
+
+	movzx   bufp_dw, bl
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	movzx   bufp_dw, bh
+	shr     ebx, 16
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	shl     r11, 8
+	xor     r9, r11
+
+	movzx   bufp_dw, bl
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	movzx   bufp_dw, bh
+	shl     r11, 16
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	shl     r11, 24
+	xor     r9, r11
+
+	;; XOR in the skipped quadword, run it through crc2, result becomes crc
+	xor     r9, [bufptmp+i + 2*%%bSize]
+	crc32   r10, r9
+	mov     rax, r10
+
+	add     bufptmp, %%bSize*3      ;; move to next block
+	sub     len, %%bSize*3
+%IF %%bSize=640
+	jns     %%crcB3_loop
+%ENDIF
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%crcB3_end:
+%IF %%bSize=640
+	add     len, %%bSize*3          ;; undo the pre-decrement; sets ZF when len == 0
+%ENDIF
+	;; in the non-640 case ZF comes from the cmp (skipped pass) or sub (pass done) above
+	je      do_return               ;; return if remaining data is zero
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define ABI_IS_AMD64
+%elifidn __OUTPUT_FORMAT__, macho64
+%define ABI_IS_AMD64
+%endif
+
+;;; ISCSI CRC 32 Implementation with crc32 Instruction
+
+;;; unsigned int crc32_iscsi_00(unsigned char * buffer, int len, unsigned int crc_init);
+;;;
+;;;        *buf = rcx
+;;;         len = rdx
+;;;    crc_init = r8
+;;;
+
+global  crc32_iscsi_00:function
+crc32_iscsi_00:
+
+%ifdef ABI_IS_AMD64
+%define bufp            rdi
+%define bufp_dw         edi
+%define bufp_w          di
+%define bufp_b          dil
+%define bufptmp         rcx
+%define block_0         rcx
+%define block_1         r8
+%define block_2         r11
+%define len             rsi 
+%define len_dw          esi 
+%define len_w           si 
+%define len_b           sil 
+%define crc_init        rdx 
+%define crc_init_dw     edx 
+%else
+%define bufp            rcx
+%define bufp_dw         ecx
+%define bufp_w          cx
+%define bufp_b          cl
+%define bufptmp         rdi
+%define block_0         rdi
+%define block_1         rsi
+%define block_2         r11
+%define len             rdx 
+%define len_dw          edx 
+%define len_w           dx 
+%define len_b           dl 
+%define crc_init        r8 
+%define crc_init_dw     r8d 
+%endif
+
+
+	push    rdi
+	push    rbx
+
+	mov     rax, crc_init           ;; rax = crc_init;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+	mov     bufptmp, bufp           ;; rdi = *buf
+	neg     bufp
+	and     bufp, 7                 ;; calculate the unalignment
+					;; amount of the address
+	je      proc_block              ;; Skip if aligned
+
+	cmp     len, 8
+	jb      less_than_8
+
+	;;;; Calculate CRC of unaligned bytes of the buffer (if any) ;;;;
+	mov     rbx, [bufptmp]          ;; load a quadword from the buffer
+	add     bufptmp, bufp           ;; align buffer pointer for
+					;; quadword processing
+	sub     len, bufp               ;; update buffer length
+align_loop:
+	crc32   eax, bl                 ;;    compute crc32 of 1-byte
+	shr     rbx, 8                  ;;    get next byte
+	dec     bufp
+	jne     align_loop
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+proc_block:
+	cmp     len, 240
+	jb      bit8                    ;; fewer than 240 bytes: skip the crcB3 stages
+	;; crc_init was copied to rax on entry, so its register is reused as the table base
+	lea     crc_init, [mul_table_72 wrt rip]  ;; load table base address
+	;; Largest block size first; each line passes (block size, two table offsets) to crcB3.
+	crcB3   640, 0x1000, 0x0c00     ; 640*3 = 1920 (Tables 1280, 640)
+	crcB3   320, 0x0c00, 0x0800     ; 320*3 =  960 (Tables  640, 320)
+	crcB3   160, 0x0800, 0x0400     ; 160*3 =  480 (Tables  320, 160)
+	crcB3    80, 0x0400, 0x0000     ;  80*3 =  240 (Tables  160,  80)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (the count fits in the 8-bit len_b)
+;; Each step shifts len_b left by one: CF = the length bit under test, ZF = no lower bits left.
+bit8:
+	shl     len_b, 1                ;; shift out length bit 7 (128 bytes) into CF
+	jnc     bit7                    ;; no 128-byte chunk: go test the 64-byte bit
+ %assign i 0
+ %rep 16
+	crc32   rax, [bufptmp+i]        ;; compute crc32 of 8-byte data
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; ZF is still from the shl (crc32 leaves flags alone)
+	add     bufptmp, 128            ;; buf +=128; (next 128 bytes)
+
+bit7:
+	shl     len_b, 1                ;; shift out length bit 6 (64 bytes) into CF
+	jnc     bit6                    ;; no 64-byte chunk: go test the 32-byte bit
+ %assign i 0
+ %rep 8
+	crc32   rax, [bufptmp+i]        ;; compute crc32 of 8-byte data
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	add     bufptmp, 64             ;; buf +=64; (next 64 bytes)
+bit6:
+	shl     len_b, 1                ;; shift out length bit 5 (32 bytes) into CF
+	jnc     bit5                    ;; no 32-byte chunk: go test the 16-byte bit
+ %assign i 0
+ %rep 4
+	crc32   rax, [bufptmp+i]        ;;    compute crc32 of 8-byte data
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	add     bufptmp, 32             ;; buf +=32; (next 32 bytes)
+bit5:
+	shl     len_b, 1                ;; shift out length bit 4 (16 bytes) into CF
+	jnc     bit4                    ;; no 16-byte chunk: go test the 8-byte bit
+ %assign i 0
+ %rep 2
+	crc32   rax, [bufptmp+i]        ;;    compute crc32 of 8-byte data
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	add     bufptmp, 16             ;; buf +=16; (next 16 bytes)
+bit4:
+	shl     len_b, 1                ;; shift out length bit 3 (8 bytes) into CF
+	jnc     bit3                    ;; no 8-byte chunk: go test the 4-byte bit
+	crc32   rax, [bufptmp]          ;; compute crc32 of 8-byte data
+	je      do_return               ;; return if remaining data is zero
+	add     bufptmp, 8              ;; buf +=8; (next 8 bytes)
+bit3:
+	mov     rbx, [bufptmp]          ;; NOTE(review): unconditional 8-byte load; also runs when len == 0 with an aligned buffer
+	shl     len_b, 1                ;; shift out length bit 2 (4 bytes) into CF
+	jnc     bit2                    ;; no 4-byte piece: go test the 2-byte bit
+	crc32   eax, ebx                ;; compute crc32 of 4-byte data
+	je      do_return               ;; return if remaining data is zero
+	shr     rbx, 32                 ;; get next 3 bytes
+bit2:
+	shl     len_b, 1                ;; shift out length bit 1 (2 bytes) into CF
+	jnc     bit1                    ;; no 2-byte piece: go test the last byte
+	crc32   eax, bx                 ;; compute crc32 of 2-byte data
+	je      do_return               ;; return if remaining data is zero
+	shr     rbx, 16                 ;; next byte
+bit1:
+	test    len_b,len_b             ;; only length bit 0 (shifted up to bit 7) can still be set
+	je      do_return
+	crc32   eax, bl                 ;; compute crc32 of 1-byte data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+do_return:
+	;; Common exit: the crc is in eax; undo the two prologue pushes in reverse order.
+	pop     rbx
+	pop     rdi
+	ret
+;; Short path taken from the ALIGN stage when the buffer is unaligned and len < 8.
+less_than_8:
+	test    len,4                   ;; 4-byte piece present?
+	jz      less_than_4
+	crc32   eax, dword[bufptmp]
+	add     bufptmp,4
+less_than_4:
+	test    len,2                   ;; 2-byte piece present?
+	jz      less_than_2
+	crc32   eax, word[bufptmp]
+	add     bufptmp,2
+less_than_2:
+	test    len,1                   ;; final single byte present?
+	jz      do_return
+	crc32   rax, byte[bufptmp]
+	pop     rbx
+	pop     rdi                     ;; pop into rdi (the register pushed on entry), same as do_return
+	ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272
+;;; (export left commented out.) Each table below is 256 dwords (0x400 bytes), laid out back to back after mul_table_72; the crcB3 table offsets 0x0000..0x1000 presumably select among them.
+section .data
+align   8
+mul_table_72:
+DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba
+DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2
+DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb
+DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3
+DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9
+DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91
+DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788
+DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0
+DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad
+DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5
+DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec
+DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4
+DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de
+DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86
+DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f
+DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7
+DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394
+DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc
+DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5
+DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d
+DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7
+DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf
+DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6
+DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe
+DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183
+DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb
+DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2
+DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a
+DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0
+DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8
+DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1
+DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9
+DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6
+DD 0x68828204,0x51513092,0x1b25e728,0x22f655be
+DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7
+DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff
+DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95
+DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd
+DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4
+DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c
+DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1
+DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9
+DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0
+DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8
+DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82
+DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da
+DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3
+DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b
+DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8
+DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190
+DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989
+DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1
+DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb
+DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3
+DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa
+DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2
+DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df
+DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387
+DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e
+DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6
+DD 0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac
+DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4
+DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed
+DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5
+
+mul_table_152:
+DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118
+DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666
+DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4
+DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a
+DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0
+DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e
+DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c
+DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562
+DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8
+DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96
+DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414
+DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a
+DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710
+DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e
+DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec
+DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92
+DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009
+DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777
+DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5
+DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b
+DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1
+DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f
+DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d
+DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473
+DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9
+DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87
+DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505
+DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b
+DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601
+DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f
+DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd
+DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83
+DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a
+DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444
+DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6
+DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8
+DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2
+DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc
+DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e
+DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740
+DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca
+DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4
+DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636
+DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148
+DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532
+DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c
+DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce
+DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0
+DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b
+DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555
+DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7
+DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9
+DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3
+DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad
+DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f
+DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651
+DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db
+DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5
+DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727
+DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059
+DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423
+DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d
+DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf
+DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1
+
+mul_table_312:
+DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c
+DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972
+DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791
+DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f
+DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57
+DD 0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259
+DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba
+DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4
+DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db
+DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5
+DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736
+DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38
+DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0
+DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe
+DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d
+DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413
+DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032
+DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c
+DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df
+DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1
+DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19
+DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317
+DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4
+DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa
+DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095
+DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b
+DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678
+DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76
+DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe
+DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0
+DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53
+DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d
+DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0
+DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee
+DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d
+DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03
+DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb
+DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5
+DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26
+DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628
+DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347
+DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49
+DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa
+DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4
+DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c
+DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062
+DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81
+DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f
+DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae
+DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0
+DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443
+DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d
+DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985
+DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b
+DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68
+DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766
+DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209
+DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07
+DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4
+DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea
+DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922
+DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c
+DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf
+DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1
+
+mul_table_632:
+DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6
+DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef
+DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655
+DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c
+DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0
+DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9
+DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53
+DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a
+DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b
+DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412
+DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8
+DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291
+DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d
+DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914
+DD 0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae
+DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97
+DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c
+DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115
+DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf
+DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796
+DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a
+DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13
+DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9
+DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90
+DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1
+DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8
+DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352
+DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b
+DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7
+DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee
+DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54
+DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d
+DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3
+DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea
+DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350
+DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69
+DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5
+DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec
+DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56
+DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f
+DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e
+DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117
+DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad
+DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794
+DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428
+DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11
+DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab
+DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92
+DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29
+DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410
+DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa
+DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293
+DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f
+DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916
+DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac
+DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95
+DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4
+DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed
+DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657
+DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e
+DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2
+DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb
+DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51
+DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368
+
+mul_table_1272:
+DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c
+DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3
+DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2
+DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d
+DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31
+DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece
+DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf
+DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530
+DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7
+DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28
+DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529
+DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6
+DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda
+DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25
+DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424
+DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db
+DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b
+DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4
+DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5
+DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a
+DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416
+DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9
+DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8
+DD 0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17
+DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0
+DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f
+DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e
+DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1
+DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd
+DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502
+DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03
+DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc
+DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283
+DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c
+DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d
+DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82
+DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e
+DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671
+DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870
+DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f
+DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668
+DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397
+DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96
+DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869
+DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765
+DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a
+DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b
+DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964
+DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4
+DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b
+DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a
+DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5
+DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9
+DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956
+DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757
+DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8
+DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f
+DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0
+DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1
+DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e
+DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842
+DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd
+DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc
+DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+	dw 0x%4				;; 16-bit word built from the 4th argument (snum)
+	db 0x%3, 0x%2			;; then two bytes: 3rd argument (ver), 2nd argument (core)
+%endmacro
+;;;       func            core, ver, snum    (exports two labels at the same address, then 4 data bytes)
+slversion crc32_iscsi_00, 00,   02,  0014	;; emitted into the current section (.data, selected above)
+%ifidn __OUTPUT_FORMAT__, elf64
+; inform linker that this doesn't require executable stack
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/common/crc32c_intel_fast_zero_asm.s b/src/common/crc32c_intel_fast_zero_asm.s
new file mode 100644
index 00000000..e6483963
--- /dev/null
+++ b/src/common/crc32c_intel_fast_zero_asm.s
@@ -0,0 +1,656 @@
+;
+; Copyright 2012-2013 Intel Corporation All Rights Reserved.
+; All rights reserved.
+;
+; http://opensource.org/licenses/BSD-3-Clause
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following
+; conditions are met:
+;
+; * Redistributions of source code must retain the above copyright
+;   notice, this list of conditions and the following disclaimer.
+;
+; * Redistributions in binary form must reproduce the above copyright
+;   notice, this list of conditions and the following disclaimer in
+;   the documentation and/or other materials provided with the
+;   distribution.
+;
+; * Neither the name of the Intel Corporation nor the names of its
+;   contributors may be used to endorse or promote products derived
+;   from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+; FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+; COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+; HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+; STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+; OF THE POSSIBILITY OF SUCH DAMAGE.
+;
+
+; Function to compute iscsi CRC32 with table-based recombination
+; crc done "by 3" with block sizes 1920, 960, 480, 240
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; crcB3 MACRO to implement crc32 on 3 %%bSize-byte blocks; this variant feeds register bufptmp (zeroed by the caller) as the data
+%macro  crcB3 3
+%define %%bSize   %1    ; 1/3 of buffer size
+%define %%td2     %2    ; table offset for crc0 (2/3 of buffer)
+%define %%td1     %3    ; table offset for crc1 (1/3 of buffer)
+; Uses rbx, r9, r10, r11 and bufp as scratch; expects crc_init to hold the table base address.
+%IF %%bSize=640
+	sub     len, %%bSize*3
+	js      %%crcB3_end           ;; jump to next level if 3*blockSize > len
+%ELSE
+	cmp     len, %%bSize*3
+	jnae    %%crcB3_end           ;; jump to next level if 3*blockSize > len
+%ENDIF
+	;;;;;; Calculate CRC of 3 blocks of the buffer ;;;;;;
+%%crcB3_loop:
+					;; rax = crc0 = initial crc
+	xor     rbx, rbx                ;; rbx = crc1 = 0;
+	xor     r10, r10                ;; r10 = crc2 = 0;
+; i is advanced by the assign lines below but no active instruction in this macro references it.
+ %assign i 0
+ %rep %%bSize/8 - 1
+	crc32   rax, bufptmp  ;; update crc0 with the 8-byte value held in bufptmp
+	crc32   rbx, bufptmp  ;; update crc1 likewise
+	crc32   r10, bufptmp  ;; update crc2 likewise
+	%assign i (i+8)
+ %endrep
+	crc32   rax, bufptmp  ;; update crc0
+	crc32   rbx, bufptmp  ;; update crc1
+; SKIP  ;crc32  r10, bufptmp  ;; update crc2
+
+	; merge in crc0: r9 = XOR of (table entry for byte k of eax) << 8*k, k = 0..3
+	movzx   bufp_dw, al		;; index = byte 0
+	mov     r9d, [crc_init + bufp*4 + %%td2]
+	movzx   bufp_dw, ah		;; index = byte 1
+	shr     eax, 16			;; bring bytes 2 and 3 down into al/ah
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	shl     r11, 8
+	xor     r9, r11
+
+	movzx   bufp_dw, al		;; index = byte 2
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	movzx   bufp_dw, ah		;; index = byte 3 (used by the second load below)
+	shl     r11, 16
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td2]
+	shl     r11, 24
+	xor     r9, r11
+
+	; merge in crc1: same byte-wise lookup on ebx, using the other table offset
+
+	movzx   bufp_dw, bl		;; index = byte 0
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	movzx   bufp_dw, bh		;; index = byte 1
+	shr     ebx, 16			;; bring bytes 2 and 3 down into bl/bh
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	shl     r11, 8
+	xor     r9, r11
+
+	movzx   bufp_dw, bl		;; index = byte 2
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	movzx   bufp_dw, bh		;; index = byte 3 (used by the second load below)
+	shl     r11, 16
+	xor     r9, r11
+	mov     r11d, [crc_init + bufp*4 + %%td1]
+	shl     r11, 24
+	xor     r9, r11
+
+	; xor     r9, [bufptmp+i + 2*%%bSize]
+	crc32   r10, r9			;; fold the merged value into crc2
+	mov     rax, r10		;; rax = result carried into the next pass / stage
+
+	; add     bufptmp, %%bSize*3      ;; move to next block
+	sub     len, %%bSize*3		;; consume three blocks; these flags feed the jns / je below
+%IF %%bSize=640
+	jns     %%crcB3_loop
+%ENDIF
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%crcB3_end:
+%IF %%bSize=640
+	add     len, %%bSize*3
+%ENDIF
+	je      do_return               ;; return if remaining data is zero
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define ABI_IS_AMD64
+%elifidn __OUTPUT_FORMAT__, macho64
+%define ABI_IS_AMD64
+%endif
+;;; (ABI_IS_AMD64, set above for elf64 / macho64 output, selects the register aliases below.)
+;;; ISCSI CRC 32 Implementation with crc32 Instruction, zero-data variant
+;;; Every crc32 step takes its data operand from a zeroed register instead of the buffer.
+;;; unsigned int crc32_iscsi_zero_00(unsigned char * buffer, int len, unsigned int crc_init);
+;;;
+;;;        *buf = rcx  (rdi when ABI_IS_AMD64 is defined); its incoming value is not used
+;;;         len = rdx  (rsi when ABI_IS_AMD64 is defined)
+;;;    crc_init = r8   (rdx when ABI_IS_AMD64 is defined)
+;;;
+
+global  crc32_iscsi_zero_00:function
+crc32_iscsi_zero_00:
+
+%ifdef ABI_IS_AMD64
+%define bufp            rdi
+%define bufp_dw         edi
+%define bufp_w          di
+%define bufp_b          dil
+%define bufptmp         rcx
+%define block_0         rcx
+%define block_1         r8
+%define block_2         r11
+%define len             rsi
+%define len_dw          esi
+%define len_w           si
+%define len_b           sil
+%define crc_init        rdx
+%define crc_init_dw     edx
+%else
+%define bufp            rcx
+%define bufp_dw         ecx
+%define bufp_w          cx
+%define bufp_b          cl
+%define bufptmp         rdi
+%define block_0         rdi
+%define block_1         rsi
+%define block_2         r11
+%define len             rdx
+%define len_dw          edx
+%define len_w           dx
+%define len_b           dl
+%define crc_init        r8
+%define crc_init_dw     r8d
+%endif
+
+;;; Prologue: rdi and rbx are pushed here and popped again at do_return.
+	push    rdi
+	push    rbx
+
+	mov     rax, crc_init           ;; rax = crc_init (the running crc lives in rax)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 1) ALIGN: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; no need for alignment
+	xor bufptmp, bufptmp		;; bufptmp = 0: used below as the 8-byte data operand
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 2) BLOCK LEVEL: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+proc_block:
+	cmp     len, 240
+	jb      bit8                    ;; fewer than 240 bytes: skip the crcB3 stages
+	;; crc_init was copied to rax on entry, so its register is reused as the table base
+	lea     crc_init, [mul_table_72 wrt rip]  ;; load table base address
+	;; Largest block size first; each crcB3 ends with "je do_return" when nothing remains.
+	crcB3   640, 0x1000, 0x0c00     ; 640*3 = 1920 (Tables 1280, 640)
+	crcB3   320, 0x0c00, 0x0800     ; 320*3 =  960 (Tables  640, 320)
+	crcB3   160, 0x0800, 0x0400     ; 160*3 =  480 (Tables  320, 160)
+	crcB3    80, 0x0400, 0x0000     ;  80*3 =  240 (Tables  160,  80)
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;4) LESS THAN 256-bytes REMAIN AT THIS POINT (the count fits in the 8-bit len_b)
+;; Each step shifts len_b left by one: CF = the length bit under test, ZF = no lower bits left.
+bit8:
+	shl     len_b, 1                ;; shift out length bit 7 (128 bytes) into CF
+	jnc     bit7                    ;; no 128-byte chunk: go test the 64-byte bit
+ %assign i 0
+ %rep 16
+	crc32   rax, bufptmp        ;; crc32 step over the 8-byte value in bufptmp (zero)
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; ZF is still from the shl (crc32 leaves flags alone)
+
+bit7:
+	shl     len_b, 1                ;; shift out length bit 6 (64 bytes) into CF
+	jnc     bit6                    ;; no 64-byte chunk: go test the 32-byte bit
+ %assign i 0
+ %rep 8
+	crc32   rax, bufptmp        ;; crc32 step over the 8-byte value in bufptmp (zero)
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	; add     bufptmp, 64             ;; (disabled: bufptmp is data here, not a pointer)
+bit6:
+	shl     len_b, 1                ;; shift out length bit 5 (32 bytes) into CF
+	jnc     bit5                    ;; no 32-byte chunk: go test the 16-byte bit
+ %assign i 0
+ %rep 4
+	crc32   rax, bufptmp        ;;    crc32 step over the 8-byte value in bufptmp (zero)
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	; add     bufptmp, 32             ;; (disabled: bufptmp is data here, not a pointer)
+bit5:
+	shl     len_b, 1                ;; shift out length bit 4 (16 bytes) into CF
+	jnc     bit4                    ;; no 16-byte chunk: go test the 8-byte bit
+ %assign i 0
+ %rep 2
+	crc32   rax, bufptmp        ;;    crc32 step over the 8-byte value in bufptmp (zero)
+	%assign i (i+8)
+ %endrep
+	je      do_return               ;; return if remaining data is zero
+	; add     bufptmp, 16             ;; (disabled: bufptmp is data here, not a pointer)
+bit4:
+	shl     len_b, 1                ;; shift out length bit 3 (8 bytes) into CF
+	jnc     bit3                    ;; no 8-byte chunk: go test the 4-byte bit
+	crc32   rax, bufptmp          ;; crc32 step over the 8-byte value in bufptmp (zero)
+	je      do_return               ;; return if remaining data is zero
+	; add     bufptmp, 8              ;; (disabled: bufptmp is data here, not a pointer)
+bit3:
+	mov     rbx, bufptmp          ;; rbx = bufptmp = 0: register copy, no memory load
+	shl     len_b, 1                ;; shift out length bit 2 (4 bytes) into CF
+	jnc     bit2                    ;; no 4-byte piece: go test the 2-byte bit
+	crc32   eax, ebx                ;; compute crc32 of 4-byte data
+	je      do_return               ;; return if remaining data is zero
+	shr     rbx, 32                 ;; get next 3 bytes
+bit2:
+	shl     len_b, 1                ;; shift out length bit 1 (2 bytes) into CF
+	jnc     bit1                    ;; no 2-byte piece: go test the last byte
+	crc32   eax, bx                 ;; compute crc32 of 2-byte data
+	je      do_return               ;; return if remaining data is zero
+	shr     rbx, 16                 ;; next byte
+bit1:
+	test    len_b,len_b             ;; only length bit 0 (shifted up to bit 7) can still be set
+	je      do_return
+	crc32   eax, bl                 ;; compute crc32 of 1-byte data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+do_return:
+	;; Common exit: the crc is in eax; undo the two prologue pushes in reverse order.
+	pop     rbx
+	pop     rdi
+	ret
+;; NOTE(review): no jump to less_than_8 is visible in this function (no ALIGN stage here) - looks unreachable; confirm.
+less_than_8:
+	xor bufp, bufp			;; bufp = 0: zero data operand for the steps below
+	test    len,4                   ;; 4-byte piece present?
+	jz      less_than_4
+	crc32   eax, bufp_dw
+	add     bufptmp,4
+less_than_4:
+	test    len,2                   ;; 2-byte piece present?
+	jz      less_than_2
+	crc32   eax, bufp_w
+	add     bufptmp,2
+less_than_2:
+	test    len,1                   ;; final single byte present?
+	jz      do_return
+	crc32   rax, bufp_b
+	pop     rbx
+	pop     bufptmp			;; NOTE(review): under ABI_IS_AMD64 this pops the saved rdi into rcx; do_return pops into rdi
+	ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; global mul_table_72, mul_table_152, mul_table_312, mul_table_632, mul_table_1272
+;;; (export left commented out.) mul_table_72 is 256 dwords (0x400 bytes); crcB3 reads [base + byte*4 + offset] with offsets 0x0000..0x1000 from this base.
+section .data
+align   8
+mul_table_72:
+DD 0x00000000,0x39d3b296,0x73a7652c,0x4a74d7ba
+DD 0xe74eca58,0xde9d78ce,0x94e9af74,0xad3a1de2
+DD 0xcb71e241,0xf2a250d7,0xb8d6876d,0x810535fb
+DD 0x2c3f2819,0x15ec9a8f,0x5f984d35,0x664bffa3
+DD 0x930fb273,0xaadc00e5,0xe0a8d75f,0xd97b65c9
+DD 0x7441782b,0x4d92cabd,0x07e61d07,0x3e35af91
+DD 0x587e5032,0x61ade2a4,0x2bd9351e,0x120a8788
+DD 0xbf309a6a,0x86e328fc,0xcc97ff46,0xf5444dd0
+DD 0x23f31217,0x1a20a081,0x5054773b,0x6987c5ad
+DD 0xc4bdd84f,0xfd6e6ad9,0xb71abd63,0x8ec90ff5
+DD 0xe882f056,0xd15142c0,0x9b25957a,0xa2f627ec
+DD 0x0fcc3a0e,0x361f8898,0x7c6b5f22,0x45b8edb4
+DD 0xb0fca064,0x892f12f2,0xc35bc548,0xfa8877de
+DD 0x57b26a3c,0x6e61d8aa,0x24150f10,0x1dc6bd86
+DD 0x7b8d4225,0x425ef0b3,0x082a2709,0x31f9959f
+DD 0x9cc3887d,0xa5103aeb,0xef64ed51,0xd6b75fc7
+DD 0x47e6242e,0x7e3596b8,0x34414102,0x0d92f394
+DD 0xa0a8ee76,0x997b5ce0,0xd30f8b5a,0xeadc39cc
+DD 0x8c97c66f,0xb54474f9,0xff30a343,0xc6e311d5
+DD 0x6bd90c37,0x520abea1,0x187e691b,0x21addb8d
+DD 0xd4e9965d,0xed3a24cb,0xa74ef371,0x9e9d41e7
+DD 0x33a75c05,0x0a74ee93,0x40003929,0x79d38bbf
+DD 0x1f98741c,0x264bc68a,0x6c3f1130,0x55eca3a6
+DD 0xf8d6be44,0xc1050cd2,0x8b71db68,0xb2a269fe
+DD 0x64153639,0x5dc684af,0x17b25315,0x2e61e183
+DD 0x835bfc61,0xba884ef7,0xf0fc994d,0xc92f2bdb
+DD 0xaf64d478,0x96b766ee,0xdcc3b154,0xe51003c2
+DD 0x482a1e20,0x71f9acb6,0x3b8d7b0c,0x025ec99a
+DD 0xf71a844a,0xcec936dc,0x84bde166,0xbd6e53f0
+DD 0x10544e12,0x2987fc84,0x63f32b3e,0x5a2099a8
+DD 0x3c6b660b,0x05b8d49d,0x4fcc0327,0x761fb1b1
+DD 0xdb25ac53,0xe2f61ec5,0xa882c97f,0x91517be9
+DD 0x8fcc485c,0xb61ffaca,0xfc6b2d70,0xc5b89fe6
+DD 0x68828204,0x51513092,0x1b25e728,0x22f655be
+DD 0x44bdaa1d,0x7d6e188b,0x371acf31,0x0ec97da7
+DD 0xa3f36045,0x9a20d2d3,0xd0540569,0xe987b7ff
+DD 0x1cc3fa2f,0x251048b9,0x6f649f03,0x56b72d95
+DD 0xfb8d3077,0xc25e82e1,0x882a555b,0xb1f9e7cd
+DD 0xd7b2186e,0xee61aaf8,0xa4157d42,0x9dc6cfd4
+DD 0x30fcd236,0x092f60a0,0x435bb71a,0x7a88058c
+DD 0xac3f5a4b,0x95ece8dd,0xdf983f67,0xe64b8df1
+DD 0x4b719013,0x72a22285,0x38d6f53f,0x010547a9
+DD 0x674eb80a,0x5e9d0a9c,0x14e9dd26,0x2d3a6fb0
+DD 0x80007252,0xb9d3c0c4,0xf3a7177e,0xca74a5e8
+DD 0x3f30e838,0x06e35aae,0x4c978d14,0x75443f82
+DD 0xd87e2260,0xe1ad90f6,0xabd9474c,0x920af5da
+DD 0xf4410a79,0xcd92b8ef,0x87e66f55,0xbe35ddc3
+DD 0x130fc021,0x2adc72b7,0x60a8a50d,0x597b179b
+DD 0xc82a6c72,0xf1f9dee4,0xbb8d095e,0x825ebbc8
+DD 0x2f64a62a,0x16b714bc,0x5cc3c306,0x65107190
+DD 0x035b8e33,0x3a883ca5,0x70fceb1f,0x492f5989
+DD 0xe415446b,0xddc6f6fd,0x97b22147,0xae6193d1
+DD 0x5b25de01,0x62f66c97,0x2882bb2d,0x115109bb
+DD 0xbc6b1459,0x85b8a6cf,0xcfcc7175,0xf61fc3e3
+DD 0x90543c40,0xa9878ed6,0xe3f3596c,0xda20ebfa
+DD 0x771af618,0x4ec9448e,0x04bd9334,0x3d6e21a2
+DD 0xebd97e65,0xd20accf3,0x987e1b49,0xa1ada9df
+DD 0x0c97b43d,0x354406ab,0x7f30d111,0x46e36387
+DD 0x20a89c24,0x197b2eb2,0x530ff908,0x6adc4b9e
+DD 0xc7e6567c,0xfe35e4ea,0xb4413350,0x8d9281c6
+DD 0x78d6cc16,0x41057e80,0x0b71a93a,0x32a21bac
+DD 0x9f98064e,0xa64bb4d8,0xec3f6362,0xd5ecd1f4
+DD 0xb3a72e57,0x8a749cc1,0xc0004b7b,0xf9d3f9ed
+DD 0x54e9e40f,0x6d3a5699,0x274e8123,0x1e9d33b5
+
+mul_table_152:
+DD 0x00000000,0x878a92a7,0x0af953bf,0x8d73c118
+DD 0x15f2a77e,0x927835d9,0x1f0bf4c1,0x98816666
+DD 0x2be54efc,0xac6fdc5b,0x211c1d43,0xa6968fe4
+DD 0x3e17e982,0xb99d7b25,0x34eeba3d,0xb364289a
+DD 0x57ca9df8,0xd0400f5f,0x5d33ce47,0xdab95ce0
+DD 0x42383a86,0xc5b2a821,0x48c16939,0xcf4bfb9e
+DD 0x7c2fd304,0xfba541a3,0x76d680bb,0xf15c121c
+DD 0x69dd747a,0xee57e6dd,0x632427c5,0xe4aeb562
+DD 0xaf953bf0,0x281fa957,0xa56c684f,0x22e6fae8
+DD 0xba679c8e,0x3ded0e29,0xb09ecf31,0x37145d96
+DD 0x8470750c,0x03fae7ab,0x8e8926b3,0x0903b414
+DD 0x9182d272,0x160840d5,0x9b7b81cd,0x1cf1136a
+DD 0xf85fa608,0x7fd534af,0xf2a6f5b7,0x752c6710
+DD 0xedad0176,0x6a2793d1,0xe75452c9,0x60dec06e
+DD 0xd3bae8f4,0x54307a53,0xd943bb4b,0x5ec929ec
+DD 0xc6484f8a,0x41c2dd2d,0xccb11c35,0x4b3b8e92
+DD 0x5ac60111,0xdd4c93b6,0x503f52ae,0xd7b5c009
+DD 0x4f34a66f,0xc8be34c8,0x45cdf5d0,0xc2476777
+DD 0x71234fed,0xf6a9dd4a,0x7bda1c52,0xfc508ef5
+DD 0x64d1e893,0xe35b7a34,0x6e28bb2c,0xe9a2298b
+DD 0x0d0c9ce9,0x8a860e4e,0x07f5cf56,0x807f5df1
+DD 0x18fe3b97,0x9f74a930,0x12076828,0x958dfa8f
+DD 0x26e9d215,0xa16340b2,0x2c1081aa,0xab9a130d
+DD 0x331b756b,0xb491e7cc,0x39e226d4,0xbe68b473
+DD 0xf5533ae1,0x72d9a846,0xffaa695e,0x7820fbf9
+DD 0xe0a19d9f,0x672b0f38,0xea58ce20,0x6dd25c87
+DD 0xdeb6741d,0x593ce6ba,0xd44f27a2,0x53c5b505
+DD 0xcb44d363,0x4cce41c4,0xc1bd80dc,0x4637127b
+DD 0xa299a719,0x251335be,0xa860f4a6,0x2fea6601
+DD 0xb76b0067,0x30e192c0,0xbd9253d8,0x3a18c17f
+DD 0x897ce9e5,0x0ef67b42,0x8385ba5a,0x040f28fd
+DD 0x9c8e4e9b,0x1b04dc3c,0x96771d24,0x11fd8f83
+DD 0xb58c0222,0x32069085,0xbf75519d,0x38ffc33a
+DD 0xa07ea55c,0x27f437fb,0xaa87f6e3,0x2d0d6444
+DD 0x9e694cde,0x19e3de79,0x94901f61,0x131a8dc6
+DD 0x8b9beba0,0x0c117907,0x8162b81f,0x06e82ab8
+DD 0xe2469fda,0x65cc0d7d,0xe8bfcc65,0x6f355ec2
+DD 0xf7b438a4,0x703eaa03,0xfd4d6b1b,0x7ac7f9bc
+DD 0xc9a3d126,0x4e294381,0xc35a8299,0x44d0103e
+DD 0xdc517658,0x5bdbe4ff,0xd6a825e7,0x5122b740
+DD 0x1a1939d2,0x9d93ab75,0x10e06a6d,0x976af8ca
+DD 0x0feb9eac,0x88610c0b,0x0512cd13,0x82985fb4
+DD 0x31fc772e,0xb676e589,0x3b052491,0xbc8fb636
+DD 0x240ed050,0xa38442f7,0x2ef783ef,0xa97d1148
+DD 0x4dd3a42a,0xca59368d,0x472af795,0xc0a06532
+DD 0x58210354,0xdfab91f3,0x52d850eb,0xd552c24c
+DD 0x6636ead6,0xe1bc7871,0x6ccfb969,0xeb452bce
+DD 0x73c44da8,0xf44edf0f,0x793d1e17,0xfeb78cb0
+DD 0xef4a0333,0x68c09194,0xe5b3508c,0x6239c22b
+DD 0xfab8a44d,0x7d3236ea,0xf041f7f2,0x77cb6555
+DD 0xc4af4dcf,0x4325df68,0xce561e70,0x49dc8cd7
+DD 0xd15deab1,0x56d77816,0xdba4b90e,0x5c2e2ba9
+DD 0xb8809ecb,0x3f0a0c6c,0xb279cd74,0x35f35fd3
+DD 0xad7239b5,0x2af8ab12,0xa78b6a0a,0x2001f8ad
+DD 0x9365d037,0x14ef4290,0x999c8388,0x1e16112f
+DD 0x86977749,0x011de5ee,0x8c6e24f6,0x0be4b651
+DD 0x40df38c3,0xc755aa64,0x4a266b7c,0xcdacf9db
+DD 0x552d9fbd,0xd2a70d1a,0x5fd4cc02,0xd85e5ea5
+DD 0x6b3a763f,0xecb0e498,0x61c32580,0xe649b727
+DD 0x7ec8d141,0xf94243e6,0x743182fe,0xf3bb1059
+DD 0x1715a53b,0x909f379c,0x1decf684,0x9a666423
+DD 0x02e70245,0x856d90e2,0x081e51fa,0x8f94c35d
+DD 0x3cf0ebc7,0xbb7a7960,0x3609b878,0xb1832adf
+DD 0x29024cb9,0xae88de1e,0x23fb1f06,0xa4718da1
+
+mul_table_312:
+DD 0x00000000,0xbac2fd7b,0x70698c07,0xcaab717c
+DD 0xe0d3180e,0x5a11e575,0x90ba9409,0x2a786972
+DD 0xc44a46ed,0x7e88bb96,0xb423caea,0x0ee13791
+DD 0x24995ee3,0x9e5ba398,0x54f0d2e4,0xee322f9f
+DD 0x8d78fb2b,0x37ba0650,0xfd11772c,0x47d38a57
+DD 0x6dabe325,0xd7691e5e,0x1dc26f22,0xa7009259
+DD 0x4932bdc6,0xf3f040bd,0x395b31c1,0x8399ccba
+DD 0xa9e1a5c8,0x132358b3,0xd98829cf,0x634ad4b4
+DD 0x1f1d80a7,0xa5df7ddc,0x6f740ca0,0xd5b6f1db
+DD 0xffce98a9,0x450c65d2,0x8fa714ae,0x3565e9d5
+DD 0xdb57c64a,0x61953b31,0xab3e4a4d,0x11fcb736
+DD 0x3b84de44,0x8146233f,0x4bed5243,0xf12faf38
+DD 0x92657b8c,0x28a786f7,0xe20cf78b,0x58ce0af0
+DD 0x72b66382,0xc8749ef9,0x02dfef85,0xb81d12fe
+DD 0x562f3d61,0xecedc01a,0x2646b166,0x9c844c1d
+DD 0xb6fc256f,0x0c3ed814,0xc695a968,0x7c575413
+DD 0x3e3b014e,0x84f9fc35,0x4e528d49,0xf4907032
+DD 0xdee81940,0x642ae43b,0xae819547,0x1443683c
+DD 0xfa7147a3,0x40b3bad8,0x8a18cba4,0x30da36df
+DD 0x1aa25fad,0xa060a2d6,0x6acbd3aa,0xd0092ed1
+DD 0xb343fa65,0x0981071e,0xc32a7662,0x79e88b19
+DD 0x5390e26b,0xe9521f10,0x23f96e6c,0x993b9317
+DD 0x7709bc88,0xcdcb41f3,0x0760308f,0xbda2cdf4
+DD 0x97daa486,0x2d1859fd,0xe7b32881,0x5d71d5fa
+DD 0x212681e9,0x9be47c92,0x514f0dee,0xeb8df095
+DD 0xc1f599e7,0x7b37649c,0xb19c15e0,0x0b5ee89b
+DD 0xe56cc704,0x5fae3a7f,0x95054b03,0x2fc7b678
+DD 0x05bfdf0a,0xbf7d2271,0x75d6530d,0xcf14ae76
+DD 0xac5e7ac2,0x169c87b9,0xdc37f6c5,0x66f50bbe
+DD 0x4c8d62cc,0xf64f9fb7,0x3ce4eecb,0x862613b0
+DD 0x68143c2f,0xd2d6c154,0x187db028,0xa2bf4d53
+DD 0x88c72421,0x3205d95a,0xf8aea826,0x426c555d
+DD 0x7c76029c,0xc6b4ffe7,0x0c1f8e9b,0xb6dd73e0
+DD 0x9ca51a92,0x2667e7e9,0xeccc9695,0x560e6bee
+DD 0xb83c4471,0x02feb90a,0xc855c876,0x7297350d
+DD 0x58ef5c7f,0xe22da104,0x2886d078,0x92442d03
+DD 0xf10ef9b7,0x4bcc04cc,0x816775b0,0x3ba588cb
+DD 0x11dde1b9,0xab1f1cc2,0x61b46dbe,0xdb7690c5
+DD 0x3544bf5a,0x8f864221,0x452d335d,0xffefce26
+DD 0xd597a754,0x6f555a2f,0xa5fe2b53,0x1f3cd628
+DD 0x636b823b,0xd9a97f40,0x13020e3c,0xa9c0f347
+DD 0x83b89a35,0x397a674e,0xf3d11632,0x4913eb49
+DD 0xa721c4d6,0x1de339ad,0xd74848d1,0x6d8ab5aa
+DD 0x47f2dcd8,0xfd3021a3,0x379b50df,0x8d59ada4
+DD 0xee137910,0x54d1846b,0x9e7af517,0x24b8086c
+DD 0x0ec0611e,0xb4029c65,0x7ea9ed19,0xc46b1062
+DD 0x2a593ffd,0x909bc286,0x5a30b3fa,0xe0f24e81
+DD 0xca8a27f3,0x7048da88,0xbae3abf4,0x0021568f
+DD 0x424d03d2,0xf88ffea9,0x32248fd5,0x88e672ae
+DD 0xa29e1bdc,0x185ce6a7,0xd2f797db,0x68356aa0
+DD 0x8607453f,0x3cc5b844,0xf66ec938,0x4cac3443
+DD 0x66d45d31,0xdc16a04a,0x16bdd136,0xac7f2c4d
+DD 0xcf35f8f9,0x75f70582,0xbf5c74fe,0x059e8985
+DD 0x2fe6e0f7,0x95241d8c,0x5f8f6cf0,0xe54d918b
+DD 0x0b7fbe14,0xb1bd436f,0x7b163213,0xc1d4cf68
+DD 0xebaca61a,0x516e5b61,0x9bc52a1d,0x2107d766
+DD 0x5d508375,0xe7927e0e,0x2d390f72,0x97fbf209
+DD 0xbd839b7b,0x07416600,0xcdea177c,0x7728ea07
+DD 0x991ac598,0x23d838e3,0xe973499f,0x53b1b4e4
+DD 0x79c9dd96,0xc30b20ed,0x09a05191,0xb362acea
+DD 0xd028785e,0x6aea8525,0xa041f459,0x1a830922
+DD 0x30fb6050,0x8a399d2b,0x4092ec57,0xfa50112c
+DD 0x14623eb3,0xaea0c3c8,0x640bb2b4,0xdec94fcf
+DD 0xf4b126bd,0x4e73dbc6,0x84d8aaba,0x3e1a57c1
+
+mul_table_632:
+DD 0x00000000,0x6b749fb2,0xd6e93f64,0xbd9da0d6
+DD 0xa83e0839,0xc34a978b,0x7ed7375d,0x15a3a8ef
+DD 0x55906683,0x3ee4f931,0x837959e7,0xe80dc655
+DD 0xfdae6eba,0x96daf108,0x2b4751de,0x4033ce6c
+DD 0xab20cd06,0xc05452b4,0x7dc9f262,0x16bd6dd0
+DD 0x031ec53f,0x686a5a8d,0xd5f7fa5b,0xbe8365e9
+DD 0xfeb0ab85,0x95c43437,0x285994e1,0x432d0b53
+DD 0x568ea3bc,0x3dfa3c0e,0x80679cd8,0xeb13036a
+DD 0x53adecfd,0x38d9734f,0x8544d399,0xee304c2b
+DD 0xfb93e4c4,0x90e77b76,0x2d7adba0,0x460e4412
+DD 0x063d8a7e,0x6d4915cc,0xd0d4b51a,0xbba02aa8
+DD 0xae038247,0xc5771df5,0x78eabd23,0x139e2291
+DD 0xf88d21fb,0x93f9be49,0x2e641e9f,0x4510812d
+DD 0x50b329c2,0x3bc7b670,0x865a16a6,0xed2e8914
+DD 0xad1d4778,0xc669d8ca,0x7bf4781c,0x1080e7ae
+DD 0x05234f41,0x6e57d0f3,0xd3ca7025,0xb8beef97
+DD 0xa75bd9fa,0xcc2f4648,0x71b2e69e,0x1ac6792c
+DD 0x0f65d1c3,0x64114e71,0xd98ceea7,0xb2f87115
+DD 0xf2cbbf79,0x99bf20cb,0x2422801d,0x4f561faf
+DD 0x5af5b740,0x318128f2,0x8c1c8824,0xe7681796
+DD 0x0c7b14fc,0x670f8b4e,0xda922b98,0xb1e6b42a
+DD 0xa4451cc5,0xcf318377,0x72ac23a1,0x19d8bc13
+DD 0x59eb727f,0x329fedcd,0x8f024d1b,0xe476d2a9
+DD 0xf1d57a46,0x9aa1e5f4,0x273c4522,0x4c48da90
+DD 0xf4f63507,0x9f82aab5,0x221f0a63,0x496b95d1
+DD 0x5cc83d3e,0x37bca28c,0x8a21025a,0xe1559de8
+DD 0xa1665384,0xca12cc36,0x778f6ce0,0x1cfbf352
+DD 0x09585bbd,0x622cc40f,0xdfb164d9,0xb4c5fb6b
+DD 0x5fd6f801,0x34a267b3,0x893fc765,0xe24b58d7
+DD 0xf7e8f038,0x9c9c6f8a,0x2101cf5c,0x4a7550ee
+DD 0x0a469e82,0x61320130,0xdcafa1e6,0xb7db3e54
+DD 0xa27896bb,0xc90c0909,0x7491a9df,0x1fe5366d
+DD 0x4b5bc505,0x202f5ab7,0x9db2fa61,0xf6c665d3
+DD 0xe365cd3c,0x8811528e,0x358cf258,0x5ef86dea
+DD 0x1ecba386,0x75bf3c34,0xc8229ce2,0xa3560350
+DD 0xb6f5abbf,0xdd81340d,0x601c94db,0x0b680b69
+DD 0xe07b0803,0x8b0f97b1,0x36923767,0x5de6a8d5
+DD 0x4845003a,0x23319f88,0x9eac3f5e,0xf5d8a0ec
+DD 0xb5eb6e80,0xde9ff132,0x630251e4,0x0876ce56
+DD 0x1dd566b9,0x76a1f90b,0xcb3c59dd,0xa048c66f
+DD 0x18f629f8,0x7382b64a,0xce1f169c,0xa56b892e
+DD 0xb0c821c1,0xdbbcbe73,0x66211ea5,0x0d558117
+DD 0x4d664f7b,0x2612d0c9,0x9b8f701f,0xf0fbefad
+DD 0xe5584742,0x8e2cd8f0,0x33b17826,0x58c5e794
+DD 0xb3d6e4fe,0xd8a27b4c,0x653fdb9a,0x0e4b4428
+DD 0x1be8ecc7,0x709c7375,0xcd01d3a3,0xa6754c11
+DD 0xe646827d,0x8d321dcf,0x30afbd19,0x5bdb22ab
+DD 0x4e788a44,0x250c15f6,0x9891b520,0xf3e52a92
+DD 0xec001cff,0x8774834d,0x3ae9239b,0x519dbc29
+DD 0x443e14c6,0x2f4a8b74,0x92d72ba2,0xf9a3b410
+DD 0xb9907a7c,0xd2e4e5ce,0x6f794518,0x040ddaaa
+DD 0x11ae7245,0x7adaedf7,0xc7474d21,0xac33d293
+DD 0x4720d1f9,0x2c544e4b,0x91c9ee9d,0xfabd712f
+DD 0xef1ed9c0,0x846a4672,0x39f7e6a4,0x52837916
+DD 0x12b0b77a,0x79c428c8,0xc459881e,0xaf2d17ac
+DD 0xba8ebf43,0xd1fa20f1,0x6c678027,0x07131f95
+DD 0xbfadf002,0xd4d96fb0,0x6944cf66,0x023050d4
+DD 0x1793f83b,0x7ce76789,0xc17ac75f,0xaa0e58ed
+DD 0xea3d9681,0x81490933,0x3cd4a9e5,0x57a03657
+DD 0x42039eb8,0x2977010a,0x94eaa1dc,0xff9e3e6e
+DD 0x148d3d04,0x7ff9a2b6,0xc2640260,0xa9109dd2
+DD 0xbcb3353d,0xd7c7aa8f,0x6a5a0a59,0x012e95eb
+DD 0x411d5b87,0x2a69c435,0x97f464e3,0xfc80fb51
+DD 0xe92353be,0x8257cc0c,0x3fca6cda,0x54bef368
+
+mul_table_1272:
+DD 0x00000000,0xdd66cbbb,0xbf21e187,0x62472a3c
+DD 0x7bafb5ff,0xa6c97e44,0xc48e5478,0x19e89fc3
+DD 0xf75f6bfe,0x2a39a045,0x487e8a79,0x951841c2
+DD 0x8cf0de01,0x519615ba,0x33d13f86,0xeeb7f43d
+DD 0xeb52a10d,0x36346ab6,0x5473408a,0x89158b31
+DD 0x90fd14f2,0x4d9bdf49,0x2fdcf575,0xf2ba3ece
+DD 0x1c0dcaf3,0xc16b0148,0xa32c2b74,0x7e4ae0cf
+DD 0x67a27f0c,0xbac4b4b7,0xd8839e8b,0x05e55530
+DD 0xd34934eb,0x0e2fff50,0x6c68d56c,0xb10e1ed7
+DD 0xa8e68114,0x75804aaf,0x17c76093,0xcaa1ab28
+DD 0x24165f15,0xf97094ae,0x9b37be92,0x46517529
+DD 0x5fb9eaea,0x82df2151,0xe0980b6d,0x3dfec0d6
+DD 0x381b95e6,0xe57d5e5d,0x873a7461,0x5a5cbfda
+DD 0x43b42019,0x9ed2eba2,0xfc95c19e,0x21f30a25
+DD 0xcf44fe18,0x122235a3,0x70651f9f,0xad03d424
+DD 0xb4eb4be7,0x698d805c,0x0bcaaa60,0xd6ac61db
+DD 0xa37e1f27,0x7e18d49c,0x1c5ffea0,0xc139351b
+DD 0xd8d1aad8,0x05b76163,0x67f04b5f,0xba9680e4
+DD 0x542174d9,0x8947bf62,0xeb00955e,0x36665ee5
+DD 0x2f8ec126,0xf2e80a9d,0x90af20a1,0x4dc9eb1a
+DD 0x482cbe2a,0x954a7591,0xf70d5fad,0x2a6b9416
+DD 0x33830bd5,0xeee5c06e,0x8ca2ea52,0x51c421e9
+DD 0xbf73d5d4,0x62151e6f,0x00523453,0xdd34ffe8
+DD 0xc4dc602b,0x19baab90,0x7bfd81ac,0xa69b4a17
+DD 0x70372bcc,0xad51e077,0xcf16ca4b,0x127001f0
+DD 0x0b989e33,0xd6fe5588,0xb4b97fb4,0x69dfb40f
+DD 0x87684032,0x5a0e8b89,0x3849a1b5,0xe52f6a0e
+DD 0xfcc7f5cd,0x21a13e76,0x43e6144a,0x9e80dff1
+DD 0x9b658ac1,0x4603417a,0x24446b46,0xf922a0fd
+DD 0xe0ca3f3e,0x3dacf485,0x5febdeb9,0x828d1502
+DD 0x6c3ae13f,0xb15c2a84,0xd31b00b8,0x0e7dcb03
+DD 0x179554c0,0xcaf39f7b,0xa8b4b547,0x75d27efc
+DD 0x431048bf,0x9e768304,0xfc31a938,0x21576283
+DD 0x38bffd40,0xe5d936fb,0x879e1cc7,0x5af8d77c
+DD 0xb44f2341,0x6929e8fa,0x0b6ec2c6,0xd608097d
+DD 0xcfe096be,0x12865d05,0x70c17739,0xada7bc82
+DD 0xa842e9b2,0x75242209,0x17630835,0xca05c38e
+DD 0xd3ed5c4d,0x0e8b97f6,0x6cccbdca,0xb1aa7671
+DD 0x5f1d824c,0x827b49f7,0xe03c63cb,0x3d5aa870
+DD 0x24b237b3,0xf9d4fc08,0x9b93d634,0x46f51d8f
+DD 0x90597c54,0x4d3fb7ef,0x2f789dd3,0xf21e5668
+DD 0xebf6c9ab,0x36900210,0x54d7282c,0x89b1e397
+DD 0x670617aa,0xba60dc11,0xd827f62d,0x05413d96
+DD 0x1ca9a255,0xc1cf69ee,0xa38843d2,0x7eee8869
+DD 0x7b0bdd59,0xa66d16e2,0xc42a3cde,0x194cf765
+DD 0x00a468a6,0xddc2a31d,0xbf858921,0x62e3429a
+DD 0x8c54b6a7,0x51327d1c,0x33755720,0xee139c9b
+DD 0xf7fb0358,0x2a9dc8e3,0x48dae2df,0x95bc2964
+DD 0xe06e5798,0x3d089c23,0x5f4fb61f,0x82297da4
+DD 0x9bc1e267,0x46a729dc,0x24e003e0,0xf986c85b
+DD 0x17313c66,0xca57f7dd,0xa810dde1,0x7576165a
+DD 0x6c9e8999,0xb1f84222,0xd3bf681e,0x0ed9a3a5
+DD 0x0b3cf695,0xd65a3d2e,0xb41d1712,0x697bdca9
+DD 0x7093436a,0xadf588d1,0xcfb2a2ed,0x12d46956
+DD 0xfc639d6b,0x210556d0,0x43427cec,0x9e24b757
+DD 0x87cc2894,0x5aaae32f,0x38edc913,0xe58b02a8
+DD 0x33276373,0xee41a8c8,0x8c0682f4,0x5160494f
+DD 0x4888d68c,0x95ee1d37,0xf7a9370b,0x2acffcb0
+DD 0xc478088d,0x191ec336,0x7b59e90a,0xa63f22b1
+DD 0xbfd7bd72,0x62b176c9,0x00f65cf5,0xdd90974e
+DD 0xd875c27e,0x051309c5,0x675423f9,0xba32e842
+DD 0xa3da7781,0x7ebcbc3a,0x1cfb9606,0xc19d5dbd
+DD 0x2f2aa980,0xf24c623b,0x900b4807,0x4d6d83bc
+DD 0x54851c7f,0x89e3d7c4,0xeba4fdf8,0x36c23643
+
+%macro slversion 4
+global %1_slver_%2%3%4		; export the name that embeds core, ver and snum
+global %1_slver			; export the plain alias as well
+%1_slver:
+%1_slver_%2%3%4:		; both labels refer to the same four bytes below
+	dw 0x%4			; 16-bit word taken from the 4th argument (snum)
+	db 0x%3, 0x%2		; then one byte each from the 3rd (ver) and 2nd (core)
+%endmacro
+;;;       func            core, ver, snum   (macro arguments 1 to 4, in order)
+slversion crc32_iscsi_zero_00, 00,   02,  0014
+%ifidn __OUTPUT_FORMAT__, elf64
+; inform linker that this doesn't require executable stack
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/common/crc32c_ppc.c b/src/common/crc32c_ppc.c
new file mode 100644
index 00000000..52fd1c4e
--- /dev/null
+++ b/src/common/crc32c_ppc.c
@@ -0,0 +1,148 @@
+/* Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#define CRC_TABLE
+#define FAST_ZERO_TABLE
+
+#include "acconfig.h"
+#include "include/int_types.h"
+#include "crc32c_ppc_constants.h"
+#include "reverse.h"
+
+#include <stdlib.h>
+#include <strings.h>
+
+#define VMX_ALIGN	16
+#define VMX_ALIGN_MASK	(VMX_ALIGN-1)
+
+#ifdef HAVE_PPC64LE
+#ifdef REFLECT
+static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
+                                unsigned long len)
+{ /* Scalar path: one crc_table lookup per byte, low byte of crc first. */
+  for (; len != 0; --len, ++p)
+    crc = (crc >> 8) ^ crc_table[(crc ^ *p) & 0xff];
+  return crc;
+}
+#else
+static unsigned int crc32_align(unsigned int crc, unsigned char const *p,
+                                unsigned long len)
+{ /* Scalar path: one crc_table lookup per byte, top byte of crc first. */
+  for (; len != 0; --len, ++p)
+    crc = (crc << 8) ^ crc_table[((crc >> 24) ^ *p) & 0xff];
+  return crc;
+}
+#endif
+
+static inline unsigned long polynomial_multiply(unsigned int a, unsigned int b) {
+        vector unsigned int va = {a, 0, 0, 0}; /* a in element 0, rest zero */
+        vector unsigned int vb = {b, 0, 0, 0}; /* b in element 0, rest zero */
+        vector unsigned long vt;               /* receives the asm output */
+        /* Run vpmsumw on va and vb; presumably the carry-less a*b -- confirm. */
+        __asm__("vpmsumw %0,%1,%2" : "=v"(vt) : "v"(va), "v"(vb));
+        /* Only element 0 of the result is returned to the caller. */
+        return vt[0];
+}
+
+unsigned int barrett_reduction(unsigned long val);
+
+static inline unsigned int gf_multiply(unsigned int a, unsigned int b) {
+        return barrett_reduction(polynomial_multiply(a, b)); /* multiply, then reduce */
+}
+
+unsigned int append_zeros(unsigned int crc, unsigned long length) {
+        /*
+         * Consume length one bit at a time, lowest bit first; every set bit k
+         * multiplies crc by the table entry crc_zero[k] through gf_multiply().
+         */
+        unsigned long bit;
+
+        for (bit = 0; length != 0; length >>= 1, bit++)
+                if (length & 1)
+                        crc = gf_multiply(crc, crc_zero[bit]);
+
+        return crc;
+}
+
+
+unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p,
+                            unsigned long len);
+
+/* CRC len bytes at data: crc32_align() on the edges, __crc32_vpmsum() in bulk. */
+static uint32_t crc32_vpmsum(uint32_t crc, unsigned char const *data,
+                             unsigned len)
+{
+  /* Distance of data past the previous VMX_ALIGN boundary; 0 if aligned. */
+  const unsigned long misalign = (unsigned long)data & VMX_ALIGN_MASK;
+
+#ifdef CRC_XOR
+  crc ^= 0xffffffff;
+#endif
+
+  if (len >= VMX_ALIGN + VMX_ALIGN_MASK) {
+    /* Split the buffer into an unaligned head, a VMX_ALIGN-multiple body
+     * handed to __crc32_vpmsum(), and a leftover tail. */
+    unsigned int head = misalign ? VMX_ALIGN - misalign : 0;
+    unsigned int body;
+    unsigned int tail;
+
+    if (head)
+      crc = crc32_align(crc, data, head);
+    data += head;
+    len -= head;
+    body = len & ~VMX_ALIGN_MASK;
+    tail = len & VMX_ALIGN_MASK;
+    crc = __crc32_vpmsum(crc, data, body);
+    if (tail)
+      crc = crc32_align(crc, data + body, tail);
+  } else {
+    /* Too short to guarantee one full aligned block: go byte by byte. */
+    crc = crc32_align(crc, data, len);
+  }
+
+#ifdef CRC_XOR
+  crc ^= 0xffffffff;
+#endif
+  return crc;
+}
+
+/* This wrapper function works around the fact that crc32_vpmsum 
+ * does not gracefully handle the case where the data pointer is NULL.
+ */
+uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
+{
+  /* A real buffer goes straight to the vector-assisted path. */
+  if (data)
+    return crc32_vpmsum(crc, data, len);
+
+  /*
+   * NULL buffer: there is nothing to read, so append_zeros() accounts for
+   * len instead.  With REFLECT the value is bit-reversed on the way in and out.
+   */
+#ifdef REFLECT
+  crc = reverse_bits(crc);
+#endif
+  crc = append_zeros(crc, len);
+#ifdef REFLECT
+  crc = reverse_bits(crc);
+#endif
+  return crc;
+}
+
+#else /* HAVE_PPC64LE */
+
+/* This symbol has to exist on non-ppc architectures (and on legacy
+ * ppc systems using power7 or below) in order to compile properly
+ * there, even though it won't be called.
+ */
+uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *data, unsigned len)
+{
+  return 0;  /* placeholder build: all three arguments are ignored */
+}
+
+#endif /* HAVE_PPC64LE */
diff --git a/src/common/crc32c_ppc.h b/src/common/crc32c_ppc.h
new file mode 100644
index 00000000..18021638
--- /dev/null
+++ b/src/common/crc32c_ppc.h
@@ -0,0 +1,22 @@
+/* Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef CEPH_COMMON_CRC32C_PPC_H
+#define CEPH_COMMON_CRC32C_PPC_H
+
+#include <stdint.h>	/* uint32_t, so the header no longer relies on include order */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Update crc over len bytes at buffer; buffer may be NULL. Non-ppc64le stub returns 0. */
+extern uint32_t ceph_crc32c_ppc(uint32_t crc, unsigned char const *buffer, unsigned len);
+#ifdef __cplusplus
+}
+#endif
+#endif /* CEPH_COMMON_CRC32C_PPC_H */
diff --git a/src/common/crc32c_ppc_asm.S b/src/common/crc32c_ppc_asm.S
new file mode 100644
index 00000000..1dc6dd1c
--- /dev/null
+++ b/src/common/crc32c_ppc_asm.S
@@ -0,0 +1,771 @@
+/*
+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
+ * 16 bytes.
+ *
+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
+ * chunks in order to mask the latency of the vpmsum instructions. If we
+ * have more than 32 kB of data to checksum we repeat this step multiple
+ * times, passing in the previous 1024 bits.
+ *
+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
+ * 32 bits of 0s to the end - this matches what a CRC does. We just
+ * calculate constants that land the data in these 32 bits.
+ *
+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
+ * for n = CRC using POWER8 instructions. We use x = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ * Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <ppc-asm.h>
+#include "common/ppc-opcode.h"
+
+#undef toc
+
+#ifndef r1
+#define r1 1
+#endif
+
+#ifndef r2
+#define r2 2
+#endif
+
+	.section	.rodata
+.balign 16
+
+.byteswap_constant:
+	/* byte reverse permute constant */
+	.octa 0x0F0E0D0C0B0A09080706050403020100
+
+#define __ASSEMBLY__
+#include "crc32c_ppc_constants.h"
+
+	.text
+
+#if defined(__BIG_ENDIAN__) && defined(REFLECT)
+#define BYTESWAP_DATA
+#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
+#define BYTESWAP_DATA
+#else
+#undef BYTESWAP_DATA
+#endif
+
+#define off16		r25	/* r25-r31 are loaded with 16..112 and used as lvx/stvx index offsets */
+#define off32		r26
+#define off48		r27
+#define off64		r28
+#define off80		r29
+#define off96		r30
+#define off112		r31
+
+#define const1		v24	/* const1/const2 hold entries loaded from the constant tables */
+#define const2		v25
+
+#define byteswap	v26	/* permute control loaded from .byteswap_constant */
+#define	mask_32bit	v27	/* ones in the rightmost 4 bytes, zero elsewhere */
+#define	mask_64bit	v28	/* ones in the rightmost 8 bytes, zero elsewhere */
+#define zeroes		v29	/* all-zero vector */
+
+#ifdef BYTESWAP_DATA
+#define VPERM(A, B, C, D) vperm	A, B, C, D
+#else
+#define VPERM(A, B, C, D)	/* expands to nothing when no byte swap is needed */
+#endif
+
+/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
+FUNC_START(__crc32_vpmsum)
+	std	r31,-8(r1)	/* save r25-r31 below r1; they are reused as off16..off112 */
+	std	r30,-16(r1)
+	std	r29,-24(r1)
+	std	r28,-32(r1)
+	std	r27,-40(r1)
+	std	r26,-48(r1)
+	std	r25,-56(r1)
+
+	li	off16,16
+	li	off32,32
+	li	off48,48
+	li	off64,64
+	li	off80,80
+	li	off96,96
+	li	off112,112
+	li	r0,0		/* r0 = 0: nothing carried over in v16-v23 yet */
+
+	/* Enough room for saving 10 non volatile VMX registers */
+	subi	r6,r1,56+10*16
+	subi	r7,r1,56+2*16	/* r7 = r6 + 128, base for the last two slots */
+
+	stvx	v20,0,r6
+	stvx	v21,off16,r6
+	stvx	v22,off32,r6
+	stvx	v23,off48,r6
+	stvx	v24,off64,r6
+	stvx	v25,off80,r6
+	stvx	v26,off96,r6
+	stvx	v27,off112,r6
+	stvx	v28,0,r7
+	stvx	v29,off16,r7
+
+	mr	r10,r3		/* keep the incoming crc; .Lzero hands it back unchanged */
+
+	vxor	zeroes,zeroes,zeroes
+	vspltisw v0,-1		/* v0 = all ones, source for the two masks */
+
+	vsldoi	mask_32bit,zeroes,v0,4
+	vsldoi	mask_64bit,zeroes,v0,8
+
+	/* Get the initial value into v8 */
+	vxor	v8,v8,v8
+	MTVRD(v8, r3)
+#ifdef REFLECT
+	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
+#else
+	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
+#endif
+
+#ifdef BYTESWAP_DATA
+	addis	r3,r2,.byteswap_constant@toc@ha
+	addi	r3,r3,.byteswap_constant@toc@l
+
+	lvx	byteswap,0,r3
+	addi	r3,r3,16
+#endif
+
+	cmpdi	r5,256		/* r5 = len: under 256 bytes takes the unrolled short path */
+	blt	.Lshort
+
+	rldicr	r6,r5,0,56	/* r6 = len with its low 7 bits cleared (multiple of 128) */
+
+	/* Checksum in blocks of MAX_SIZE */
+1:	lis	r7,MAX_SIZE@h
+	ori	r7,r7,MAX_SIZE@l
+	mr	r9,r7		/* r9 = MAX_SIZE */
+	cmpd	r6,r7
+	bgt	2f
+	mr	r7,r6		/* r7 = min(r6, MAX_SIZE): bytes handled in this pass */
+2:	subf	r6,r7,r6	/* r6 = bytes left for later passes */
+
+	/* our main loop does 128 bytes at a time */
+	srdi	r7,r7,7
+
+	/*
+	 * Work out the offset into the constants table to start at. Each
+	 * constant is 16 bytes, and it is used against 128 bytes of input
+	 * data - 128 / 16 = 8
+	 */
+	sldi	r8,r7,4
+	srdi	r9,r9,3
+	subf	r8,r8,r9	/* r8 = MAX_SIZE/8 - 16 * (number of 128-byte groups) */
+
+	/* We reduce our final 128 bytes in a separate step */
+	addi	r7,r7,-1
+	mtctr	r7
+
+	addis	r3,r2,.constants@toc@ha
+	addi	r3,r3,.constants@toc@l
+
+	/* Find the start of our constants */
+	add	r3,r3,r8
+
+	/* zero v0-v7 which will contain our checksums */
+	vxor	v0,v0,v0
+	vxor	v1,v1,v1
+	vxor	v2,v2,v2
+	vxor	v3,v3,v3
+	vxor	v4,v4,v4
+	vxor	v5,v5,v5
+	vxor	v6,v6,v6
+	vxor	v7,v7,v7
+
+	lvx	const1,0,r3
+
+	/*
+	 * If we are looping back to consume more data we use the values
+	 * already in v16-v23.
+	 */
+	cmpdi	r0,1		/* r0 is 0 on entry and set to 1 at the end of each pass */
+	beq	2f
+
+	/* First warm up pass */
+	lvx	v16,0,r4
+	lvx	v17,off16,r4
+	VPERM(v16,v16,v16,byteswap)
+	VPERM(v17,v17,v17,byteswap)
+	lvx	v18,off32,r4
+	lvx	v19,off48,r4
+	VPERM(v18,v18,v18,byteswap)
+	VPERM(v19,v19,v19,byteswap)
+	lvx	v20,off64,r4
+	lvx	v21,off80,r4
+	VPERM(v20,v20,v20,byteswap)
+	VPERM(v21,v21,v21,byteswap)
+	lvx	v22,off96,r4
+	lvx	v23,off112,r4
+	VPERM(v22,v22,v22,byteswap)
+	VPERM(v23,v23,v23,byteswap)
+	addi	r4,r4,8*16
+
+	/* xor in initial value */
+	vxor	v16,v16,v8
+
+2:	bdz	.Lfirst_warm_up_done	/* counter ran out: skip the pipelined loop */
+
+	addi	r3,r3,16
+	lvx	const2,0,r3	/* const2 = the table entry after const1 */
+
+	/* Second warm up pass */
+	VPMSUMD(v8,v16,const1)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0		/* r2 | 0 -> r2, no effect; presumably a scheduling hint -- confirm */
+
+	VPMSUMD(v9,v17,const1)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v10,v18,const1)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v11,v19,const1)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdz	.Lfirst_cool_down	/* counter ran out: no main-loop iterations needed */
+
+	/*
+	 * main loop. We modulo schedule it such that it takes three iterations
+	 * to complete - first iteration load, second iteration vpmsum, third
+	 * iteration xor.
+	 */
+	.balign	16
+4:	lvx	const1,0,r3	/* same address const2 was last loaded from */
+	addi	r3,r3,16
+	ori	r2,r2,0
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const2)
+	lvx	v16,0,r4
+	VPERM(v16,v16,v16,byteswap)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const2)
+	lvx	v17,off16,r4
+	VPERM(v17,v17,v17,byteswap)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const2)
+	lvx	v18,off32,r4
+	VPERM(v18,v18,v18,byteswap)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const2)
+	lvx	v19,off48,r4
+	VPERM(v19,v19,v19,byteswap)
+	lvx	const2,0,r3	/* const2 = next entry; the last four multiplies use const1 */
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	lvx	v20,off64,r4
+	VPERM(v20,v20,v20,byteswap)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	lvx	v21,off80,r4
+	VPERM(v21,v21,v21,byteswap)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	lvx	v22,off96,r4
+	VPERM(v22,v22,v22,byteswap)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	lvx	v23,off112,r4
+	VPERM(v23,v23,v23,byteswap)
+
+	addi	r4,r4,8*16
+
+	bdnz	4b
+
+.Lfirst_cool_down:
+	/* First cool down pass: fold pending products, multiply the last loaded data */
+	lvx	const1,0,r3
+	addi	r3,r3,16
+
+	vxor	v0,v0,v8
+	VPMSUMD(v8,v16,const1)
+	ori	r2,r2,0
+
+	vxor	v1,v1,v9
+	VPMSUMD(v9,v17,const1)
+	ori	r2,r2,0
+
+	vxor	v2,v2,v10
+	VPMSUMD(v10,v18,const1)
+	ori	r2,r2,0
+
+	vxor	v3,v3,v11
+	VPMSUMD(v11,v19,const1)
+	ori	r2,r2,0
+
+	vxor	v4,v4,v12
+	VPMSUMD(v12,v20,const1)
+	ori	r2,r2,0
+
+	vxor	v5,v5,v13
+	VPMSUMD(v13,v21,const1)
+	ori	r2,r2,0
+
+	vxor	v6,v6,v14
+	VPMSUMD(v14,v22,const1)
+	ori	r2,r2,0
+
+	vxor	v7,v7,v15
+	VPMSUMD(v15,v23,const1)
+	ori	r2,r2,0
+
+.Lsecond_cool_down:
+	/* Second cool down pass: fold the final products into v0-v7 */
+	vxor	v0,v0,v8
+	vxor	v1,v1,v9
+	vxor	v2,v2,v10
+	vxor	v3,v3,v11
+	vxor	v4,v4,v12
+	vxor	v5,v5,v13
+	vxor	v6,v6,v14
+	vxor	v7,v7,v15
+
+#ifdef REFLECT
+	/*
+	 * vpmsumd produces a 96 bit result in the least significant bits
+	 * of the register. Since we are bit reflected we have to shift it
+	 * left 32 bits so it occupies the least significant bits in the
+	 * bit reflected domain.
+	 */
+	vsldoi	v0,v0,zeroes,4
+	vsldoi	v1,v1,zeroes,4
+	vsldoi	v2,v2,zeroes,4
+	vsldoi	v3,v3,zeroes,4
+	vsldoi	v4,v4,zeroes,4
+	vsldoi	v5,v5,zeroes,4
+	vsldoi	v6,v6,zeroes,4
+	vsldoi	v7,v7,zeroes,4
+#endif
+
+	/* xor with last 1024 bits */
+	lvx	v8,0,r4
+	lvx	v9,off16,r4
+	VPERM(v8,v8,v8,byteswap)
+	VPERM(v9,v9,v9,byteswap)
+	lvx	v10,off32,r4
+	lvx	v11,off48,r4
+	VPERM(v10,v10,v10,byteswap)
+	VPERM(v11,v11,v11,byteswap)
+	lvx	v12,off64,r4
+	lvx	v13,off80,r4
+	VPERM(v12,v12,v12,byteswap)
+	VPERM(v13,v13,v13,byteswap)
+	lvx	v14,off96,r4
+	lvx	v15,off112,r4
+	VPERM(v14,v14,v14,byteswap)
+	VPERM(v15,v15,v15,byteswap)
+
+	addi	r4,r4,8*16
+
+	vxor	v16,v0,v8
+	vxor	v17,v1,v9
+	vxor	v18,v2,v10
+	vxor	v19,v3,v11
+	vxor	v20,v4,v12
+	vxor	v21,v5,v13
+	vxor	v22,v6,v14
+	vxor	v23,v7,v15
+
+	li	r0,1		/* a later pass starts from v16-v23 instead of reloading */
+	cmpdi	r6,0		/* any bytes left for another pass? (tested before the add) */
+	addi	r6,r6,128
+	bne	1b
+
+	/* Work out how many bytes we have left */
+	andi.	r5,r5,127	/* r5 = len mod 128 */
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,128	/* r6 = 128 - r5 */
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks are in the tail */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	/*
+	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
+	 * 32 bits to include the trailing 32 bits of zeros
+	 */
+	lvx	v0,0,r3
+	lvx	v1,off16,r3
+	lvx	v2,off32,r3
+	lvx	v3,off48,r3
+	lvx	v4,off64,r3
+	lvx	v5,off80,r3
+	lvx	v6,off96,r3
+	lvx	v7,off112,r3
+	addi	r3,r3,8*16
+
+	VPMSUMW(v0,v16,v0)
+	VPMSUMW(v1,v17,v1)
+	VPMSUMW(v2,v18,v2)
+	VPMSUMW(v3,v19,v3)
+	VPMSUMW(v4,v20,v4)
+	VPMSUMW(v5,v21,v5)
+	VPMSUMW(v6,v22,v6)
+	VPMSUMW(v7,v23,v7)
+
+	/* Now reduce the tail (0 - 112 bytes) */
+	cmpdi	r7,0
+	beq	1f
+
+	lvx	v16,0,r4	/* each chunk: load data and constant, multiply, xor into v0 */
+	lvx	v17,0,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off32,r4
+	lvx	v17,off32,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off64,r4
+	lvx	v17,off64,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+	bdz	1f
+
+	lvx	v16,off96,r4
+	lvx	v17,off96,r3
+	VPERM(v16,v16,v16,byteswap)
+	VPMSUMW(v16,v16,v17)
+	vxor	v0,v0,v16
+
+	/* Now xor all the parallel chunks together */
+1:	vxor	v0,v0,v1
+	vxor	v2,v2,v3
+	vxor	v4,v4,v5
+	vxor	v6,v6,v7
+
+	vxor	v0,v0,v2
+	vxor	v4,v4,v6
+
+	vxor	v0,v0,v4
+
+.Lbarrett_reduction:
+	/* Barrett constants */
+	addis	r3,r2,.barrett_constants@toc@ha
+	addi	r3,r3,.barrett_constants@toc@l
+
+	lvx	const1,0,r3
+	lvx	const2,off16,r3
+
+	vsldoi	v1,v0,v0,8	/* v1 = v0 with its two 64 bit halves swapped */
+	vxor	v0,v0,v1		/* xor two 64 bit results together */
+
+#ifdef REFLECT
+	/* shift left one bit */
+	vspltisb v1,1
+	vsl	v0,v0,v1
+#endif
+
+	vand	v0,v0,mask_64bit
+
+#ifndef REFLECT
+	/*
+	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v1,v0,const1)	/* ma */
+	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v1,v1,const2)	/* qn */
+	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
+#else
+	/*
+	 * The reflected version of Barrett reduction. Instead of bit
+	 * reflecting our data (which is expensive to do), we bit reflect our
+	 * constants and our algorithm, which means the intermediate data in
+	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
+	 * the algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
+	VPMSUMD(v1,v1,const1)		/* ma */
+	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
+	VPMSUMD(v1,v1,const2)		/* qn */
+	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in
+	 * the high 32 bits. We just need to shift it left 4 bytes
+	 * V0 [ 0 1 X 3 ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
+#endif
+
+	/* Get it into r3 */
+	MFVRD(r3, v0)
+
+.Lout:
+	subi	r6,r1,56+10*16	/* same two save-area addresses the prologue computed */
+	subi	r7,r1,56+2*16
+
+	lvx	v20,0,r6	/* reload v20-v29 from where the prologue stored them */
+	lvx	v21,off16,r6
+	lvx	v22,off32,r6
+	lvx	v23,off48,r6
+	lvx	v24,off64,r6
+	lvx	v25,off80,r6
+	lvx	v26,off96,r6
+	lvx	v27,off112,r6
+	lvx	v28,0,r7
+	lvx	v29,off16,r7
+
+	ld	r31,-8(r1)	/* reload r25-r31 last: they still serve as offN above */
+	ld	r30,-16(r1)
+	ld	r29,-24(r1)
+	ld	r28,-32(r1)
+	ld	r27,-40(r1)
+	ld	r26,-48(r1)
+	ld	r25,-56(r1)
+
+	blr
+
+.Lfirst_warm_up_done:
+	lvx	const1,0,r3	/* reached from the bdz after the first warm up pass */
+	addi	r3,r3,16
+
+	VPMSUMD(v8,v16,const1)	/* multiply the loaded v16-v23 without loading more input */
+	VPMSUMD(v9,v17,const1)
+	VPMSUMD(v10,v18,const1)
+	VPMSUMD(v11,v19,const1)
+	VPMSUMD(v12,v20,const1)
+	VPMSUMD(v13,v21,const1)
+	VPMSUMD(v14,v22,const1)
+	VPMSUMD(v15,v23,const1)
+
+	b	.Lsecond_cool_down
+
+.Lshort:
+	cmpdi	r5,0		/* zero length: nothing to fold */
+	beq	.Lzero
+
+	addis	r3,r2,.short_constants@toc@ha
+	addi	r3,r3,.short_constants@toc@l
+
+	/* Calculate where in the constant table we need to start */
+	subfic	r6,r5,256	/* r6 = 256 - len */
+	add	r3,r3,r6
+
+	/* How many 16 byte chunks? */
+	srdi	r7,r5,4
+	mtctr	r7
+
+	vxor	v19,v19,v19	/* v19 and v20 are the two xor accumulators used below */
+	vxor	v20,v20,v20
+
+	lvx	v0,0,r4
+	lvx	v16,0,r3
+	VPERM(v0,v0,v16,byteswap)
+	vxor	v0,v0,v8	/* xor in initial value */
+	VPMSUMW(v0,v0,v16)
+	bdz	.Lv0
+
+	lvx	v1,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v1,v1,v17,byteswap)
+	VPMSUMW(v1,v1,v17)
+	bdz	.Lv1
+
+	lvx	v2,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v2,v2,v16,byteswap)
+	VPMSUMW(v2,v2,v16)
+	bdz	.Lv2
+
+	lvx	v3,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v3,v3,v17,byteswap)
+	VPMSUMW(v3,v3,v17)
+	bdz	.Lv3
+
+	lvx	v4,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v4,v4,v16,byteswap)
+	VPMSUMW(v4,v4,v16)
+	bdz	.Lv4
+
+	lvx	v5,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v5,v5,v17,byteswap)
+	VPMSUMW(v5,v5,v17)
+	bdz	.Lv5
+
+	lvx	v6,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v6,v6,v16,byteswap)
+	VPMSUMW(v6,v6,v16)
+	bdz	.Lv6
+
+	lvx	v7,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v7,v7,v17,byteswap)
+	VPMSUMW(v7,v7,v17)
+	bdz	.Lv7
+
+	addi	r3,r3,128	/* advance both pointers; offsets restart at 0 */
+	addi	r4,r4,128
+
+	lvx	v8,0,r4
+	lvx	v16,0,r3
+	VPERM(v8,v8,v16,byteswap)
+	VPMSUMW(v8,v8,v16)
+	bdz	.Lv8
+
+	lvx	v9,off16,r4
+	lvx	v17,off16,r3
+	VPERM(v9,v9,v17,byteswap)
+	VPMSUMW(v9,v9,v17)
+	bdz	.Lv9
+
+	lvx	v10,off32,r4
+	lvx	v16,off32,r3
+	VPERM(v10,v10,v16,byteswap)
+	VPMSUMW(v10,v10,v16)
+	bdz	.Lv10
+
+	lvx	v11,off48,r4
+	lvx	v17,off48,r3
+	VPERM(v11,v11,v17,byteswap)
+	VPMSUMW(v11,v11,v17)
+	bdz	.Lv11
+
+	lvx	v12,off64,r4
+	lvx	v16,off64,r3
+	VPERM(v12,v12,v16,byteswap)
+	VPMSUMW(v12,v12,v16)
+	bdz	.Lv12
+
+	lvx	v13,off80,r4
+	lvx	v17,off80,r3
+	VPERM(v13,v13,v17,byteswap)
+	VPMSUMW(v13,v13,v17)
+	bdz	.Lv13
+
+	lvx	v14,off96,r4
+	lvx	v16,off96,r3
+	VPERM(v14,v14,v16,byteswap)
+	VPMSUMW(v14,v14,v16)
+	bdz	.Lv14
+
+	lvx	v15,off112,r4
+	lvx	v17,off112,r3
+	VPERM(v15,v15,v17,byteswap)
+	VPMSUMW(v15,v15,v17)
+
+.Lv15:	vxor	v19,v19,v15	/* entry label N folds products vN down to v0 by fallthrough */
+.Lv14:	vxor	v20,v20,v14
+.Lv13:	vxor	v19,v19,v13
+.Lv12:	vxor	v20,v20,v12
+.Lv11:	vxor	v19,v19,v11
+.Lv10:	vxor	v20,v20,v10
+.Lv9:	vxor	v19,v19,v9
+.Lv8:	vxor	v20,v20,v8
+.Lv7:	vxor	v19,v19,v7
+.Lv6:	vxor	v20,v20,v6
+.Lv5:	vxor	v19,v19,v5
+.Lv4:	vxor	v20,v20,v4
+.Lv3:	vxor	v19,v19,v3
+.Lv2:	vxor	v20,v20,v2
+.Lv1:	vxor	v19,v19,v1
+.Lv0:	vxor	v20,v20,v0
+
+	vxor	v0,v19,v20	/* merge the two accumulators */
+
+	b	.Lbarrett_reduction
+
+.Lzero:
+	mr	r3,r10		/* r10 holds the crc saved in the prologue: return it as is */
+	b	.Lout
+
+FUNC_END(__crc32_vpmsum)
diff --git a/src/common/crc32c_ppc_constants.h b/src/common/crc32c_ppc_constants.h
new file mode 100644
index 00000000..12a1e1d5
--- /dev/null
+++ b/src/common/crc32c_ppc_constants.h
@@ -0,0 +1,979 @@
+/* Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#define CRC 0x1edc6f41
+#define REFLECT
+
+#ifndef __ASSEMBLY__
+#ifdef CRC_TABLE
+static const unsigned int crc_table[] = {	/* 256 entries (64 rows of 4); entry 128 is 0x82f63b78, the bit-reversed form of CRC (0x1edc6f41) -- presumably indexed by byte value, confirm against the user of this table */
+	0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+	0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+	0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+	0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+	0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+	0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+	0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+	0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+	0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+	0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+	0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+	0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+	0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+	0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+	0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+	0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+	0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+	0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+	0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+	0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+	0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+	0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+	0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+	0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+	0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+	0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+	0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+	0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+	0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+	0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+	0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+	0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+	0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+	0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+	0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+	0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+	0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+	0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+	0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+	0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+	0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+	0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+	0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+	0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+	0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+	0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+	0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+	0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+	0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+	0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+	0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+	0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+	0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+	0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+	0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+	0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+	0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+	0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+	0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+	0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+	0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+	0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+	0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+	0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,};
+
+#endif
+
+#ifdef FAST_ZERO_TABLE
+/* fast zero table: 64 entries that repeat with period 31; the visible leading terms are successive squarings (0x2 = x, 0x4 = x^2, 0x10 = x^4, 0x100 = x^8, 0x10000 = x^16, 0x1edc6f41 = x^32 mod p) -- presumably x^(2^k) mod p(x) throughout, confirm against the generator */
+unsigned int crc_zero[] = {	/* NOTE(review): unlike crc_table this is neither static nor const; defining FAST_ZERO_TABLE in more than one translation unit would give duplicate definitions */
+	0x100,
+	0x10000,
+	0x1edc6f41,
+	0x3aab4576,
+	0x18571d18,
+	0x59a3508a,
+	0xaa97d41d,
+	0xe78dbf1d,
+	0x4ef6a711,
+	0x2506c32e,
+	0x68d4e827,
+	0x546ea6b0,
+	0x465cebac,
+	0x26a86214,
+	0x964aa2fd,
+	0x3b4c5747,
+	0x6702ee7f,
+	0xd086629f,
+	0xf1f2043c,
+	0xc761a1ca,
+	0xa8964e9a,
+	0x90cab2ce,
+	0xc6e3583d,
+	0x3344e0be,
+	0x7d53914b,
+	0x3d953297,
+	0xfcf2eda0,
+	0x42f878a5,
+	0x2,
+	0x4,
+	0x10,
+	0x100,
+	0x10000,
+	0x1edc6f41,
+	0x3aab4576,
+	0x18571d18,
+	0x59a3508a,
+	0xaa97d41d,
+	0xe78dbf1d,
+	0x4ef6a711,
+	0x2506c32e,
+	0x68d4e827,
+	0x546ea6b0,
+	0x465cebac,
+	0x26a86214,
+	0x964aa2fd,
+	0x3b4c5747,
+	0x6702ee7f,
+	0xd086629f,
+	0xf1f2043c,
+	0xc761a1ca,
+	0xa8964e9a,
+	0x90cab2ce,
+	0xc6e3583d,
+	0x3344e0be,
+	0x7d53914b,
+	0x3d953297,
+	0xfcf2eda0,
+	0x42f878a5,
+	0x2,
+	0x4,
+	0x10,
+	0x100,
+	0x10000
+};
+#endif
+
+#else
+#define MAX_SIZE	32768
+.constants:
+
+	/* Reduce 262144 kbits to 1024 bits */
+	/* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */
+	.octa 0x00000000b6ca9e20000000009c37c408
+
+	/* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */
+	.octa 0x00000000350249a800000001b51df26c
+
+	/* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */
+	.octa 0x00000001862dac54000000000724b9d0
+
+	/* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */
+	.octa 0x00000001d87fb48c00000001c00532fe
+
+	/* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */
+	.octa 0x00000001f39b699e00000000f05a9362
+
+	/* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */
+	.octa 0x0000000101da11b400000001e1007970
+
+	/* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */
+	.octa 0x00000001cab571e000000000a57366ee
+
+	/* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */
+	.octa 0x00000000c7020cfe0000000192011284
+
+	/* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */
+	.octa 0x00000000cdaed1ae0000000162716d9a
+
+	/* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */
+	.octa 0x00000001e804effc00000000cd97ecde
+
+	/* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */
+	.octa 0x0000000077c3ea3a0000000058812bc0
+
+	/* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */
+	.octa 0x0000000068df31b40000000088b8c12e
+
+	/* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */
+	.octa 0x00000000b059b6c200000001230b234c
+
+	/* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */
+	.octa 0x0000000145fb8ed800000001120b416e
+
+	/* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */
+	.octa 0x00000000cbc0916800000001974aecb0
+
+	/* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */
+	.octa 0x000000005ceeedc2000000008ee3f226
+
+	/* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */
+	.octa 0x0000000047d74e8600000001089aba9a
+
+	/* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */
+	.octa 0x00000001407e9e220000000065113872
+
+	/* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */
+	.octa 0x00000001da967bda000000005c07ec10
+
+	/* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */
+	.octa 0x000000006c8983680000000187590924
+
+	/* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */
+	.octa 0x00000000f2d14c9800000000e35da7c6
+
+	/* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */
+	.octa 0x00000001993c6ad4000000000415855a
+
+	/* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */
+	.octa 0x000000014683d1ac0000000073617758
+
+	/* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */
+	.octa 0x00000001a7c93e6c0000000176021d28
+
+	/* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */
+	.octa 0x000000010211e90a00000001c358fd0a
+
+	/* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */
+	.octa 0x000000001119403e00000001ff7a2c18
+
+	/* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */
+	.octa 0x000000001c3261aa00000000f2d9f7e4
+
+	/* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */
+	.octa 0x000000014e37a634000000016cf1f9c8
+
+	/* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */
+	.octa 0x0000000073786c0c000000010af9279a
+
+	/* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */
+	.octa 0x000000011dc037f80000000004f101e8
+
+	/* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */
+	.octa 0x0000000031433dfc0000000070bcf184
+
+	/* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */
+	.octa 0x000000009cde8348000000000a8de642
+
+	/* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */
+	.octa 0x0000000038d3c2a60000000062ea130c
+
+	/* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */
+	.octa 0x000000011b25f26000000001eb31cbb2
+
+	/* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */
+	.octa 0x000000001629e6f00000000170783448
+
+	/* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */
+	.octa 0x0000000160838b4c00000001a684b4c6
+
+	/* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */
+	.octa 0x000000007a44011c00000000253ca5b4
+
+	/* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */
+	.octa 0x00000000226f417a0000000057b4b1e2
+
+	/* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */
+	.octa 0x0000000045eb2eb400000000b6bd084c
+
+	/* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */
+	.octa 0x000000014459d70c0000000123c2d592
+
+	/* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */
+	.octa 0x00000001d406ed8200000000159dafce
+
+	/* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */
+	.octa 0x0000000160c8e1a80000000127e1a64e
+
+	/* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */
+	.octa 0x0000000027ba80980000000056860754
+
+	/* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */
+	.octa 0x000000006d92d01800000001e661aae8
+
+	/* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */
+	.octa 0x000000012ed7e3f200000000f82c6166
+
+	/* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */
+	.octa 0x000000002dc8778800000000c4f9c7ae
+
+	/* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */
+	.octa 0x0000000018240bb80000000074203d20
+
+	/* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */
+	.octa 0x000000001ad381580000000198173052
+
+	/* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */
+	.octa 0x00000001396b78f200000001ce8aba54
+
+	/* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */
+	.octa 0x000000011a68133400000001850d5d94
+
+	/* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */
+	.octa 0x000000012104732e00000001d609239c
+
+	/* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */
+	.octa 0x00000000a140d90c000000001595f048
+
+	/* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */
+	.octa 0x00000001b7215eda0000000042ccee08
+
+	/* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */
+	.octa 0x00000001aaf1df3c000000010a389d74
+
+	/* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */
+	.octa 0x0000000029d15b8a000000012a840da6
+
+	/* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */
+	.octa 0x00000000f1a96922000000001d181c0c
+
+	/* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */
+	.octa 0x00000001ac80d03c0000000068b7d1f6
+
+	/* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */
+	.octa 0x000000000f11d56a000000005b0f14fc
+
+	/* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */
+	.octa 0x00000001f1c022a20000000179e9e730
+
+	/* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */
+	.octa 0x0000000173d00ae200000001ce1368d6
+
+	/* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */
+	.octa 0x00000001d4ffe4ac0000000112c3a84c
+
+	/* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */
+	.octa 0x000000016edc5ae400000000de940fee
+
+	/* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */
+	.octa 0x00000001f1a0214000000000fe896b7e
+
+	/* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */
+	.octa 0x00000000ca0b28a000000001f797431c
+
+	/* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */
+	.octa 0x00000001928e30a20000000053e989ba
+
+	/* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */
+	.octa 0x0000000097b1b002000000003920cd16
+
+	/* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */
+	.octa 0x00000000b15bf90600000001e6f579b8
+
+	/* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */
+	.octa 0x00000000411c5d52000000007493cb0a
+
+	/* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */
+	.octa 0x00000001c36f330000000001bdd376d8
+
+	/* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */
+	.octa 0x00000001119227e0000000016badfee6
+
+	/* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */
+	.octa 0x00000000114d47020000000071de5c58
+
+	/* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */
+	.octa 0x00000000458b5b9800000000453f317c
+
+	/* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */
+	.octa 0x000000012e31fb8e0000000121675cce
+
+	/* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */
+	.octa 0x000000005cf619d800000001f409ee92
+
+	/* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */
+	.octa 0x0000000063f4d8b200000000f36b9c88
+
+	/* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */
+	.octa 0x000000004138dc8a0000000036b398f4
+
+	/* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */
+	.octa 0x00000001d29ee8e000000001748f9adc
+
+	/* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */
+	.octa 0x000000006a08ace800000001be94ec00
+
+	/* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */
+	.octa 0x0000000127d4201000000000b74370d6
+
+	/* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */
+	.octa 0x0000000019d76b6200000001174d0b98
+
+	/* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */
+	.octa 0x00000001b1471f6e00000000befc06a4
+
+	/* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */
+	.octa 0x00000001f64c19cc00000001ae125288
+
+	/* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */
+	.octa 0x00000000003c0ea00000000095c19b34
+
+	/* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */
+	.octa 0x000000014d73abf600000001a78496f2
+
+	/* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */
+	.octa 0x00000001620eb84400000001ac5390a0
+
+	/* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */
+	.octa 0x0000000147655048000000002a80ed6e
+
+	/* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */
+	.octa 0x0000000067b5077e00000001fa9b0128
+
+	/* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */
+	.octa 0x0000000010ffe20600000001ea94929e
+
+	/* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */
+	.octa 0x000000000fee8f1e0000000125f4305c
+
+	/* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */
+	.octa 0x00000001da26fbae00000001471e2002
+
+	/* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */
+	.octa 0x00000001b3a8bd880000000132d2253a
+
+	/* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */
+	.octa 0x00000000e8f3898e00000000f26b3592
+
+	/* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */
+	.octa 0x00000000b0d0d28c00000000bc8b67b0
+
+	/* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */
+	.octa 0x0000000030f2a798000000013a826ef2
+
+	/* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */
+	.octa 0x000000000fba10020000000081482c84
+
+	/* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */
+	.octa 0x00000000bdb9bd7200000000e77307c2
+
+	/* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */
+	.octa 0x0000000075d3bf5a00000000d4a07ec8
+
+	/* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */
+	.octa 0x00000000ef1f98a00000000017102100
+
+	/* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */
+	.octa 0x00000000689c760200000000db406486
+
+	/* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */
+	.octa 0x000000016d5fa5fe0000000192db7f88
+
+	/* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */
+	.octa 0x00000001d0d2b9ca000000018bf67b1e
+
+	/* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */
+	.octa 0x0000000041e7b470000000007c09163e
+
+	/* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */
+	.octa 0x00000001cbb6495e000000000adac060
+
+	/* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */
+	.octa 0x000000010052a0b000000000bd8316ae
+
+	/* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */
+	.octa 0x00000001d8effb5c000000019f09ab54
+
+	/* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */
+	.octa 0x00000001d969853c0000000125155542
+
+	/* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */
+	.octa 0x00000000523ccce2000000018fdb5882
+
+	/* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */
+	.octa 0x000000001e2436bc00000000e794b3f4
+
+	/* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */
+	.octa 0x00000000ddd1c3a2000000016f9bb022
+
+	/* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */
+	.octa 0x0000000019fcfe3800000000290c9978
+
+	/* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */
+	.octa 0x00000001ce95db640000000083c0f350
+
+	/* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */
+	.octa 0x00000000af5828060000000173ea6628
+
+	/* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */
+	.octa 0x00000001006388f600000001c8b4e00a
+
+	/* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */
+	.octa 0x0000000179eca00a00000000de95d6aa
+
+	/* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */
+	.octa 0x0000000122410a6a000000010b7f7248
+
+	/* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */
+	.octa 0x000000004288e87c00000001326e3a06
+
+	/* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */
+	.octa 0x000000016c5490da00000000bb62c2e6
+
+	/* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */
+	.octa 0x00000000d1c71f6e0000000156a4b2c2
+
+	/* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */
+	.octa 0x00000001b4ce08a6000000011dfe763a
+
+	/* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */
+	.octa 0x00000001466ba60c000000007bcca8e2
+
+	/* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */
+	.octa 0x00000001f6c488a40000000186118faa
+
+	/* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */
+	.octa 0x000000013bfb06820000000111a65a88
+
+	/* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */
+	.octa 0x00000000690e9e54000000003565e1c4
+
+	/* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */
+	.octa 0x00000000281346b6000000012ed02a82
+
+	/* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */
+	.octa 0x000000015646402400000000c486ecfc
+
+	/* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */
+	.octa 0x000000016063a8dc0000000001b951b2
+
+	/* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */
+	.octa 0x0000000116a663620000000048143916
+
+	/* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */
+	.octa 0x000000017e8aa4d200000001dc2ae124
+
+	/* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */
+	.octa 0x00000001728eb10c00000001416c58d6
+
+	/* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */
+	.octa 0x00000001b08fd7fa00000000a479744a
+
+	/* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */
+	.octa 0x00000001092a16e80000000096ca3a26
+
+	/* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */
+	.octa 0x00000000a505637c00000000ff223d4e
+
+	/* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */
+	.octa 0x00000000d94869b2000000010e84da42
+
+	/* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */
+	.octa 0x00000001c8b203ae00000001b61ba3d0
+
+	/* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */
+	.octa 0x000000005704aea000000000680f2de8
+
+	/* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */
+	.octa 0x000000012e295fa2000000008772a9a8
+
+	/* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */
+	.octa 0x000000011d0908bc0000000155f295bc
+
+	/* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */
+	.octa 0x0000000193ed97ea00000000595f9282
+
+	/* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */
+	.octa 0x000000013a0f1c520000000164b1c25a
+
+	/* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */
+	.octa 0x000000010c2c40c000000000fbd67c50
+
+	/* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */
+	.octa 0x00000000ff6fac3e0000000096076268
+
+	/* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */
+	.octa 0x000000017b3609c000000001d288e4cc
+
+	/* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */
+	.octa 0x0000000088c8c92200000001eaac1bdc
+
+	/* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */
+	.octa 0x00000001751baae600000001f1ea39e2
+
+	/* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */
+	.octa 0x000000010795297200000001eb6506fc
+
+	/* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */
+	.octa 0x0000000162b00abe000000010f806ffe
+
+	/* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */
+	.octa 0x000000000d7b404c000000010408481e
+
+	/* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */
+	.octa 0x00000000763b13d40000000188260534
+
+	/* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */
+	.octa 0x00000000f6dc22d80000000058fc73e0
+
+	/* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */
+	.octa 0x000000007daae06000000000391c59b8
+
+	/* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */
+	.octa 0x000000013359ab7c000000018b638400
+
+	/* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */
+	.octa 0x000000008add438a000000011738f5c4
+
+	/* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */
+	.octa 0x00000001edbefdea000000008cf7c6da
+
+	/* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */
+	.octa 0x000000004104e0f800000001ef97fb16
+
+	/* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */
+	.octa 0x00000000b48a82220000000102130e20
+
+	/* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */
+	.octa 0x00000001bcb4684400000000db968898
+
+	/* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */
+	.octa 0x000000013293ce0a00000000b5047b5e
+
+	/* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */
+	.octa 0x00000001710d0844000000010b90fdb2
+
+	/* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */
+	.octa 0x0000000117907f6e000000004834a32e
+
+	/* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */
+	.octa 0x0000000087ddf93e0000000059c8f2b0
+
+	/* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */
+	.octa 0x000000005970e9b00000000122cec508
+
+	/* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */
+	.octa 0x0000000185b2b7d0000000000a330cda
+
+	/* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */
+	.octa 0x00000001dcee0efc000000014a47148c
+
+	/* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */
+	.octa 0x0000000030da27220000000042c61cb8
+
+	/* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */
+	.octa 0x000000012f925a180000000012fe6960
+
+	/* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */
+	.octa 0x00000000dd2e357c00000000dbda2c20
+
+	/* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */
+	.octa 0x00000000071c80de000000011122410c
+
+	/* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */
+	.octa 0x000000011513140a00000000977b2070
+
+	/* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */
+	.octa 0x00000001df876e8e000000014050438e
+
+	/* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */
+	.octa 0x000000015f81d6ce0000000147c840e8
+
+	/* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */
+	.octa 0x000000019dd94dbe00000001cc7c88ce
+
+	/* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */
+	.octa 0x00000001373d206e00000001476b35a4
+
+	/* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */
+	.octa 0x00000000668ccade000000013d52d508
+
+	/* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */
+	.octa 0x00000001b192d268000000008e4be32e
+
+	/* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */
+	.octa 0x00000000e30f3a7800000000024120fe
+
+	/* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */
+	.octa 0x000000010ef1f7bc00000000ddecddb4
+
+	/* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */
+	.octa 0x00000001f5ac738000000000d4d403bc
+
+	/* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */
+	.octa 0x000000011822ea7000000001734b89aa
+
+	/* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */
+	.octa 0x00000000c3a33848000000010e7a58d6
+
+	/* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */
+	.octa 0x00000001bd151c2400000001f9f04e9c
+
+	/* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */
+	.octa 0x0000000056002d7600000000b692225e
+
+	/* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */
+	.octa 0x000000014657c4f4000000019b8d3f3e
+
+	/* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */
+	.octa 0x0000000113742d7c00000001a874f11e
+
+	/* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */
+	.octa 0x000000019c5920ba000000010d5a4254
+
+	/* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */
+	.octa 0x000000005216d2d600000000bbb2f5d6
+
+	/* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */
+	.octa 0x0000000136f5ad8a0000000179cc0e36
+
+	/* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */
+	.octa 0x000000018b07beb600000001dca1da4a
+
+	/* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */
+	.octa 0x00000000db1e93b000000000feb1a192
+
+	/* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */
+	.octa 0x000000000b96fa3a00000000d1eeedd6
+
+	/* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */
+	.octa 0x00000001d9968af0000000008fad9bb4
+
+	/* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */
+	.octa 0x000000000e4a77a200000001884938e4
+
+	/* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */
+	.octa 0x00000000508c2ac800000001bc2e9bc0
+
+	/* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */
+	.octa 0x0000000021572a8000000001f9658a68
+
+	/* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */
+	.octa 0x00000001b859daf2000000001b9224fc
+
+	/* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */
+	.octa 0x000000016f7884740000000055b2fb84
+
+	/* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */
+	.octa 0x00000001b438810e000000018b090348
+
+	/* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */
+	.octa 0x0000000095ddc6f2000000011ccbd5ea
+
+	/* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */
+	.octa 0x00000001d977c20c0000000007ae47f8
+
+	/* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */
+	.octa 0x00000000ebedb99a0000000172acbec0
+
+	/* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */
+	.octa 0x00000001df9e9e9200000001c6e3ff20
+
+	/* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */
+	.octa 0x00000001a4a3f95200000000e1b38744
+
+	/* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */
+	.octa 0x00000000e2f5122000000000791585b2
+
+	/* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */
+	.octa 0x000000004aa01f3e00000000ac53b894
+
+	/* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */
+	.octa 0x00000000b3e90a5800000001ed5f2cf4
+
+	/* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */
+	.octa 0x000000000c9ca2aa00000001df48b2e0
+
+	/* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */
+	.octa 0x000000015168231600000000049c1c62
+
+	/* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */
+	.octa 0x0000000036fce78c000000017c460c12
+
+	/* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */
+	.octa 0x000000009037dc10000000015be4da7e
+
+	/* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */
+	.octa 0x00000000d3298582000000010f38f668
+
+	/* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */
+	.octa 0x00000001b42e8ad60000000039f40a00
+
+	/* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */
+	.octa 0x00000000142a983800000000bd4c10c4
+
+	/* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */
+	.octa 0x0000000109c7f1900000000042db1d98
+
+	/* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */
+	.octa 0x0000000056ff931000000001c905bae6
+
+	/* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */
+	.octa 0x00000001594513aa00000000069d40ea
+
+	/* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */
+	.octa 0x00000001e3b5b1e8000000008e4fbad0
+
+	/* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */
+	.octa 0x000000011dd5fc080000000047bedd46
+
+	/* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */
+	.octa 0x00000001675f0cc20000000026396bf8
+
+	/* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */
+	.octa 0x00000000d1c8dd4400000000379beb92
+
+	/* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */
+	.octa 0x0000000115ebd3d8000000000abae54a
+
+	/* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */
+	.octa 0x00000001ecbd0dac0000000007e6a128
+
+	/* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */
+	.octa 0x00000000cdf67af2000000000ade29d2
+
+	/* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */
+	.octa 0x000000004c01ff4c00000000f974c45c
+
+	/* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */
+	.octa 0x00000000f2d8657e00000000e77ac60a
+
+	/* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */
+	.octa 0x000000006bae74c40000000145895816
+
+	/* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */
+	.octa 0x0000000152af8aa00000000038e362be
+
+	/* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */
+	.octa 0x0000000004663802000000007f991a64
+
+	/* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */
+	.octa 0x00000001ab2f5afc00000000fa366d3a
+
+	/* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */
+	.octa 0x0000000074a4ebd400000001a2bb34f0
+
+	/* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */
+	.octa 0x00000001d7ab3a4c0000000028a9981e
+
+	/* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */
+	.octa 0x00000001a8da60c600000001dbc672be
+
+	/* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */
+	.octa 0x000000013cf6382000000000b04d77f6
+
+	/* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */
+	.octa 0x00000000bec12e1e0000000124400d96
+
+	/* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */
+	.octa 0x00000001c6368010000000014ca4b414
+
+	/* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */
+	.octa 0x00000001e6e78758000000012fe2c938
+
+	/* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */
+	.octa 0x000000008d7f2b3c00000001faed01e6
+
+	/* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */
+	.octa 0x000000016b4a156e000000007e80ecfe
+
+	/* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */
+	.octa 0x00000001c63cfeb60000000098daee94
+
+	/* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */
+	.octa 0x000000015f902670000000010a04edea
+
+	/* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */
+	.octa 0x00000001cd5de11e00000001c00b4524
+
+	/* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */
+	.octa 0x000000001acaec540000000170296550
+
+	/* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */
+	.octa 0x000000002bd0ca780000000181afaa48
+
+	/* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */
+	.octa 0x0000000032d63d5c0000000185a31ffa
+
+	/* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */
+	.octa 0x000000001c6d4e4c000000002469f608
+
+	/* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */
+	.octa 0x0000000106a60b92000000006980102a
+
+	/* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */
+	.octa 0x00000000d3855e120000000111ea9ca8
+
+	/* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */
+	.octa 0x00000000e312563600000001bd1d29ce
+
+	/* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */
+	.octa 0x000000009e8f7ea400000001b34b9580
+
+	/* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */
+	.octa 0x00000001c82e562c000000003076054e
+
+	/* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */
+	.octa 0x00000000ca9f09ce000000012a608ea4
+
+	/* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */
+	.octa 0x00000000c63764e600000000784d05fe
+
+	/* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */
+	.octa 0x0000000168d2e49e000000016ef0d82a
+
+	/* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */
+	.octa 0x00000000e986c1480000000075bda454
+
+	/* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */
+	.octa 0x00000000cfb65894000000003dc0a1c4
+
+	/* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */
+	.octa 0x0000000111cadee400000000e9a5d8be
+
+	/* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */
+	.octa 0x0000000171fb63ce00000001609bc4b4
+
+.short_constants:
+
+	/* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */
+	/* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */
+	.octa 0x7fec2963e5bf80485cf015c388e56f72
+
+	/* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */
+	.octa 0x38e888d4844752a9963a18920246e2e6
+
+	/* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */
+	.octa 0x42316c00730206ad419a441956993a31
+
+	/* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */
+	.octa 0x543d5c543e65ddf9924752ba2b830011
+
+	/* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */
+	.octa 0x78e87aaf56767c9255bd7f9518e4a304
+
+	/* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
+	.octa 0x8f68fcec1903da7f6d76739fe0553f1e
+
+	/* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
+	.octa 0x3f4840246791d588c133722b1fe0b5c3
+
+	/* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
+	.octa 0x34c96751b04de25a64b67ee0e55ef1f3
+
+	/* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
+	.octa 0x156c8e180b4a395b069db049b8fdb1e7
+
+	/* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
+	.octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e
+
+	/* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
+	.octa 0x041d37768cd75659817cdc5119b29a35
+
+	/* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
+	.octa 0x3a0777818cfaa9651ce9d94b36c41f1c
+
+	/* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
+	.octa 0x0e148e8252377a554f256efcb82be955
+
+	/* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
+	.octa 0x9c25531d19e65ddeec1631edb2dea967
+
+	/* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
+	.octa 0x790606ff9957c0a65d27e147510ac59a
+
+	/* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
+	.octa 0x82f63b786ea2d55ca66805eb18b8ea18
+
+
+.barrett_constants:
+	/* 33 bit reflected Barrett constant m = (4^32)/n */
+	.octa 0x000000000000000000000000dea713f1	/* x^64 div p(x)` */
+	/* 33 bit reflected Barrett constant n */
+	.octa 0x00000000000000000000000105ec76f1	/* p(x)`: 33-bit bit-reversal of 0x11edc6f41 (x^32 + CRC) */
+#endif
diff --git a/src/common/crc32c_ppc_fast_zero_asm.S b/src/common/crc32c_ppc_fast_zero_asm.S
new file mode 100644
index 00000000..a53df1de
--- /dev/null
+++ b/src/common/crc32c_ppc_fast_zero_asm.S
@@ -0,0 +1,77 @@
+/*
+ * Use the fixed point version of Barrett reduction to compute a mod n
+ * over GF(2) for given n using POWER8 instructions. We use k = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ *  a) the GNU General Public License as published by the Free Software
+ *     Foundation; either version 2 of the License, or (at your option)
+ *     any later version, or
+ *  b) the Apache License, Version 2.0
+ */
+#include <ppc-asm.h>
+#include "common/ppc-opcode.h"
+
+#undef toc
+
+#ifndef r1
+#define r1 1
+#endif
+
+#ifndef r2
+#define r2 2
+#endif
+
+	.section	.data
+.balign 16
+
+/*
+ * Constant pair read by barrett_reduction below: m is loaded from
+ * offset 0 and n from offset 16. The .balign 16 above keeps both
+ * quadwords 16-byte aligned.
+ */
+.barrett_fz_constants:
+	/* Barrett constant m - (4^32)/n */
+	.octa 0x0000000000000000000000011f91caf6	/* x^64 div p(x) */
+	/* Barrett constant n */
+	.octa 0x0000000000000000000000011edc6f41
+
+.text
+/* unsigned int barrett_reduction(unsigned long val) */
+/*
+ * In:      r3 = val, the 64-bit value a to be reduced.
+ * Out:     r3 = a xor q*n, moved back out of v0.
+ * Scratch: r4 (address of .barrett_fz_constants), r5 (byte offset 16),
+ *          v0-v4.
+ */
+FUNC_START(barrett_reduction)
+	/* r4 = address of the constant pair, built relative to r2 (@toc) */
+	addis	r4,r2,.barrett_fz_constants@toc@ha
+	addi	r4,r4,.barrett_fz_constants@toc@l
+
+	li	r5,16		/* byte offset of n from m */
+	vxor	v1,v1,v1	/* zero v1 */
+
+	/* Get a into v0 */
+	MTVRD(v0, r3)
+	vsldoi	v0,v1,v0,8	/* shift into bottom 64 bits, this is a */
+
+	/* Load constants */
+	lvx	v2,0,r4		/* m */
+	lvx	v3,r5,r4	/* n */
+
+	/*
+	 * Now for the actual algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v4,v0,v2)	/* ma */
+	vsldoi	v4,v1,v4,8	/* q = floor(ma/(2^64)) */
+	VPMSUMD(v4,v4,v3)	/* qn */
+	vxor	v0,v0,v4	/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,v1,8	/* shift result into top 64 bits of v0 */
+	MFVRD(r3, v0)
+
+	blr
+FUNC_END(barrett_reduction)
+	
diff --git a/src/common/darwin_errno.cc b/src/common/darwin_errno.cc
new file mode 100644
index 00000000..4409abcd
--- /dev/null
+++ b/src/common/darwin_errno.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "include/types.h"
+#include "include/compat.h"
+
+
+// Converts Linux (Ceph wire) errno values to Darwin host values.
+//
+// r is a negated Linux errno; the result is the matching negated host
+// errno. Linux codes that have no Darwin counterpart collapse to -EPERM
+// (see the TODO markers). Values that are not listed below are returned
+// unchanged.
+__s32 ceph_to_hostos_errno(__s32 r)
+{
+  // Linux EAGAIN is 11, but on Darwin 11 is EDEADLK and EAGAIN is 35.
+  // It is the only code in the 1..34 range whose number differs, so it
+  // is handled ahead of the range check below.
+  if (r == -11) {
+    return -EAGAIN;
+  }
+  if (r < -34) {
+    switch (r) {
+      case -35:
+        return -EDEADLK;
+      case -36:
+        return -ENAMETOOLONG;
+      case -37:
+        return -ENOLCK;
+      case -38:
+        return -ENOSYS;
+      case -39:
+        return -ENOTEMPTY;
+      case -40:
+        return -ELOOP;
+      case -42:
+        return -ENOMSG;
+      case -43:
+        return -EIDRM;
+      case -44:
+        return -EPERM; //TODO ECHRNG
+      case -45:
+        return -EPERM; //TODO EL2NSYNC
+      case -46:
+        return -EPERM; //TODO EL3HLT
+      case -47:
+        return -EPERM; //TODO EL3RST
+      case -48:
+        return -EPERM; //TODO ELNRNG
+      case -49:
+        return -EPERM; //TODO EUNATCH
+      case -51:
+        return -EPERM; //TODO EL2HLT;
+      case -52:
+        return -EPERM; //TODO EBADE
+      case -53:
+        return -EPERM; //TODO EBADR
+      case -54:
+        return -EPERM; //TODO EXFULL
+      case -55:
+        return -EPERM; //TODO ENOANO
+      case -56:
+        return -EPERM; //TODO EBADRQC
+      case -57:
+        return -EPERM; //TODO EBADSLT
+      case -59:
+        return -EPERM; //TODO EBFONT
+      case -60:
+        return -ENOSTR;
+      case -61:
+        return -ENODATA;
+      case -62:
+        return -ETIME;
+      case -63:
+        return -ENOSR;
+      case -64:
+        return -EPERM; //TODO ENONET
+      case -65:
+        return -EPERM; //TODO ENOPKG
+      case -66:
+        return -EREMOTE;
+      case -67:
+        return -ENOLINK;
+      case -68:
+        return -EPERM; //TODO EADV
+      case -69:
+        return -EPERM; //TODO ESRMNT
+      case -70:
+        return -EPERM; //TODO ECOMM
+      case -71:
+        return -EPROTO;
+      case -72:
+        return -EMULTIHOP;
+      case -73:
+        return -EPERM; //TODO EDOTDOT
+      case -74:
+        return -EBADMSG;
+      case -75:
+        return -EOVERFLOW;
+      case -76:
+        return -EPERM; //TODO ENOTUNIQ
+      case -77:
+        return -EPERM; //TODO EBADFD
+      case -78:
+        return -EPERM; //TODO EREMCHG
+      case -79:
+        return -EPERM; //TODO ELIBACC
+      case -80:
+        return -EPERM; //TODO ELIBBAD
+      case -81:
+        return -EPERM; //TODO ELIBSCN
+      case -82:
+        return -EPERM; //TODO ELIBMAX
+      case -83:
+        return -EPERM; // TODO ELIBEXEC
+      case -84:
+        return -EILSEQ;
+      case -85:
+        return -EINTR;
+      case -86:
+        return -EPERM; //ESTRPIPE;
+      case -87:
+        return -EUSERS;
+      case -88:
+        return -ENOTSOCK;
+      case -89:
+        return -EDESTADDRREQ;
+      case -90:
+        return -EMSGSIZE;
+      case -91:
+        return -EPROTOTYPE;
+      case -92:
+        return -ENOPROTOOPT;
+      case -93:
+        return -EPROTONOSUPPORT;
+      case -94:
+        return -ESOCKTNOSUPPORT;
+      case -95:
+        return -EOPNOTSUPP;
+      case -96:
+        return -EPFNOSUPPORT;
+      case -97:
+        return -EAFNOSUPPORT;
+      case -98:
+        return -EADDRINUSE;
+      case -99:
+        return -EADDRNOTAVAIL;
+      case -100:
+        return -ENETDOWN;
+      case -101:
+        return -ENETUNREACH;
+      case -102:
+        return -ENETRESET;
+      case -103:
+        return -ECONNABORTED;
+      case -104:
+        return -ECONNRESET;
+      case -105:
+        return -ENOBUFS;
+      case -106:
+        return -EISCONN;
+      case -107:
+        return -ENOTCONN;
+      case -108:
+        return -ESHUTDOWN;
+      case -109:
+        return -ETOOMANYREFS;
+      case -110:
+        return -ETIMEDOUT;
+      case -111:
+        return -ECONNREFUSED;
+      case -112:
+        return -EHOSTDOWN;
+      case -113:
+        return -EHOSTUNREACH;
+      case -114:
+        return -EALREADY;
+      case -115:
+        return -EINPROGRESS;
+      case -116:
+        return -ESTALE;
+      case -117:
+        return -EPERM; //TODO EUCLEAN
+      case -118:
+        return -EPERM; //TODO ENOTNAM
+      case -119:
+        return -EPERM; //TODO ENAVAIL
+      case -120:
+        return -EPERM; //TODO EISNAM
+      case -121:
+        return -EREMOTEIO;
+      case -122:
+        return -EDQUOT;
+      case -123:
+        return -EPERM; //TODO ENOMEDIUM
+      case -124:
+        return -EPERM; //TODO EMEDIUMTYPE - not used
+      case -125:
+        return -ECANCELED;
+      case -126:
+        return -EPERM; //TODO ENOKEY
+      case -127:
+        return -EPERM; //TODO EKEYEXPIRED
+      case -128:
+        return -EPERM; //TODO EKEYREVOKED
+      case -129:
+        return -EPERM; //TODO EKEYREJECTED
+      case -130:
+        return -EOWNERDEAD;
+      case -131:
+        return -ENOTRECOVERABLE;
+      case -132:
+        return -EPERM; //TODO ERFKILL
+      case -133:
+        return -EPERM; //TODO EHWPOISON
+
+      default: {
+        break;
+      }
+    }
+  }
+  return r; // otherwise return original value
+}
+
+// converts Host OS errno values to linux/Ceph values
+// XXX Currently not worked out
+// Identity placeholder: r is handed back untouched, so no host-specific
+// code is translated in this direction yet.
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  return r;
+}
+
+
diff --git a/src/common/debug.h b/src/common/debug.h
new file mode 100644
index 00000000..1d4c7470
--- /dev/null
+++ b/src/common/debug.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_DEBUG_H
+#define CEPH_DEBUG_H
+
+#include "common/dout.h"
+
+/* Global version of the stuff in common/dout.h
+ *
+ * Every macro below forwards to its l-prefixed counterpart and supplies
+ * dout_context as the leading argument. dout_context is not defined
+ * here; it has to be visible wherever one of these macros is expanded.
+ */
+
+#define dout(v) ldout((dout_context), (v))
+
+#define pdout(v, p) lpdout((dout_context), (v), (p))
+
+#define dlog_p(sub, v) ldlog_p1((dout_context), (sub), (v))
+
+#define generic_dout(v) lgeneric_dout((dout_context), (v))
+
+#define derr lderr((dout_context))
+
+#define generic_derr lgeneric_derr((dout_context))
+
+#endif
diff --git a/src/common/deleter.h b/src/common/deleter.h
new file mode 100644
index 00000000..767ef4b1
--- /dev/null
+++ b/src/common/deleter.h
@@ -0,0 +1,258 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_COMMON_DELETER_H
+#define CEPH_COMMON_DELETER_H
+
+#include <atomic>
+
+/// \addtogroup memory-module
+/// @{
+
+/// Provides a mechanism for managing the lifetime of a buffer.
+///
+/// A \c deleter is an object that is used to inform the consumer
+/// of some buffer (not referenced by the deleter itself) how to
+/// delete the buffer.  This can be by calling an arbitrary function
+/// or destroying an object carried by the deleter.  Examples of
+/// a deleter's encapsulated actions are:
+///
+///  - calling \c std::free(p) on some captured pointer, p
+///  - calling \c delete \c p on some captured pointer, p
+///  - decrementing a reference count somewhere
+///
+/// A deleter performs its action from its destructor.
+class deleter final {
+ public:
+  /// \cond internal
+  struct impl;
+  struct raw_object_tag {};
+  /// \endcond
+ private:
+  // One of three things: nullptr (empty deleter), a pointer to a
+  // reference-counted impl node, or -- when bit 0 is set -- a tagged raw
+  // pointer that the destructor hands straight to std::free().
+  impl* _impl = nullptr;
+ public:
+  /// Constructs an empty deleter that does nothing in its destructor.
+  deleter() = default;
+  deleter(const deleter&) = delete;
+  /// Moves a deleter.
+  deleter(deleter&& x) noexcept : _impl(x._impl) { x._impl = nullptr; }
+  /// \cond internal
+  /// Adopts \c i without touching its reference count.
+  explicit deleter(impl* i) : _impl(i) {}
+  /// Wraps \c object as a tagged raw pointer; no impl node is allocated.
+  deleter(raw_object_tag, void* object)
+          : _impl(from_raw_object(object)) {}
+  /// \endcond
+  /// Destroys the deleter and carries out the encapsulated action.
+  ~deleter();
+  /// Releases the current action, then takes over the one held by \c x.
+  deleter& operator=(deleter&& x);
+  deleter& operator=(const deleter&) = delete;
+  /// Performs a sharing operation.  The encapsulated action will only
+  /// be carried out after both the original deleter and the returned
+  /// deleter are both destroyed.
+  ///
+  /// \return a deleter with the same encapsulated action as this one.
+  deleter share();
+  /// Checks whether the deleter has an associated action.
+  explicit operator bool() const { return bool(_impl); }
+  /// \cond internal
+  /// Runs the current action (via the destructor) and re-seats the
+  /// deleter on \c i.
+  void reset(impl* i) {
+    this->~deleter();
+    new (this) deleter(i);
+  }
+  /// \endcond
+  /// Appends another deleter to this deleter.  When this deleter is
+  /// destroyed, both encapsulated actions will be carried out.
+  void append(deleter d);
+ private:
+  // True when bit 0 of the pointer is set, i.e. it is a tagged raw
+  // object rather than an impl node.
+  static bool is_raw_object(impl* i) {
+    auto x = reinterpret_cast<uintptr_t>(i);
+    return x & 1;
+  }
+  bool is_raw_object() const {
+    return is_raw_object(_impl);
+  }
+  // Strips the tag bit and returns the original object pointer.
+  static void* to_raw_object(impl* i) {
+    auto x = reinterpret_cast<uintptr_t>(i);
+    return reinterpret_cast<void*>(x & ~uintptr_t(1));
+  }
+  void* to_raw_object() const {
+    return to_raw_object(_impl);
+  }
+  // Tags an object pointer by setting bit 0. Static because it uses no
+  // member state and is called from a constructor initializer.
+  static impl* from_raw_object(void* object) {
+    auto x = reinterpret_cast<uintptr_t>(object);
+    return reinterpret_cast<impl*>(x | 1);
+  }
+};
+
+/// \cond internal
+// Base node of a deleter chain. Derived types perform their action in
+// their own destructor.
+struct deleter::impl {
+  // Count of deleter handles referring to this node; starts at 1.
+  std::atomic_uint refs;
+  // Rest of the chain; destroyed together with this node.
+  deleter next;
+  impl(deleter next) : refs(1), next(std::move(next)) {}
+  // Virtual because nodes are deleted through an impl pointer.
+  virtual ~impl() {}
+};
+/// \endcond
+
+/// Carries out the encapsulated action: a tagged raw pointer goes to
+/// std::free(); an impl node loses one reference and is deleted when
+/// that was the last one. An empty deleter does nothing.
+inline deleter::~deleter() {
+  if (!is_raw_object()) {
+    if (_impl != nullptr && --_impl->refs == 0) {
+      delete _impl;
+    }
+    return;
+  }
+  std::free(to_raw_object());
+}
+
+/// Move assignment: runs the destructor on the current contents, then
+/// move-constructs from \c x in place. Self-assignment is a no-op.
+inline deleter& deleter::operator=(deleter&& x) {
+  if (this == &x) {
+    return *this;
+  }
+  this->~deleter();
+  new (this) deleter(std::move(x));
+  return *this;
+}
+
+/// \cond internal
+// Chain node that stores a callable and invokes it, with no arguments,
+// from its destructor.
+template <typename Deleter>
+struct lambda_deleter_impl final : deleter::impl {
+  Deleter del;
+  lambda_deleter_impl(deleter next, Deleter&& del)
+          : impl(std::move(next)), del(std::move(del)) {}
+  ~lambda_deleter_impl() override { del(); }
+};
+
+// Chain node that simply owns an object; the object's own destructor is
+// the action, run when the node is destroyed.
+template <typename Object>
+struct object_deleter_impl final : deleter::impl {
+  Object obj;
+  object_deleter_impl(deleter next, Object&& obj)
+          : impl(std::move(next)), obj(std::move(obj)) {}
+};
+
+// Heap-allocates an object_deleter_impl that takes over obj and chains
+// next behind it. The raw pointer is returned for the caller to wrap.
+template <typename Object>
+inline
+object_deleter_impl<Object>* make_object_deleter_impl(deleter next, Object obj) {
+  using node_type = object_deleter_impl<Object>;
+  auto* node = new node_type(std::move(next), std::move(obj));
+  return node;
+}
+/// \endcond
+
+/// Makes a \ref deleter that encapsulates the action of invoking a
+/// callable, as well as running another deleter.  The callable is moved
+/// into a lambda_deleter_impl node, whose destructor calls it with no
+/// arguments.
+///
+/// \param next deleter that will become part of the new deleter's encapsulated action
+/// \param o callable whose invocation becomes part of the new deleter's encapsulated action
+/// \related deleter
+template <typename Object>
+deleter make_deleter(deleter next, Object o) {
+  return deleter(new lambda_deleter_impl<Object>(std::move(next), std::move(o)));
+}
+
+/// Makes a \ref deleter that encapsulates the action of invoking a callable.
+/// Forwards to make_deleter(deleter, Object) with an empty deleter, so the
+/// callable is moved into the new deleter and called when it is destroyed.
+///
+/// \param o callable whose invocation becomes the new deleter's encapsulated action
+/// \related deleter
+template <typename Object>
+deleter make_deleter(Object o) {
+  return make_deleter(deleter(), std::move(o));
+}
+
+/// \cond internal
+// Chain node that std::free()s a pointer from its destructor. It is what
+// a tagged raw pointer is promoted to when a real impl node is needed.
+struct free_deleter_impl final : deleter::impl {
+  void* obj;
+  free_deleter_impl(void* obj) : impl(deleter()), obj(obj) {}
+  ~free_deleter_impl() override { std::free(obj); }
+};
+/// \endcond
+
+/// Returns a second handle on the same action. An empty deleter yields
+/// an empty deleter. A tagged raw pointer is first promoted to a
+/// free_deleter_impl so that there is a reference count to bump.
+inline deleter deleter::share() {
+  impl* shared = _impl;
+  if (shared == nullptr) {
+    return deleter();
+  }
+  if (is_raw_object(shared)) {
+    shared = new free_deleter_impl(to_raw_object(shared));
+    _impl = shared;
+  }
+  shared->refs.fetch_add(1);
+  return deleter(shared);
+}
+
+// Appends 'd' to the chain of deleters. Avoids allocation if possible. For
+// performance reasons the current chain should be shorter and 'd' should be
+// longer.
+//
+// The loop walks this deleter's chain node by node; next_d always points
+// at the deleter slot that holds next_impl.
+inline void deleter::append(deleter d) {
+  if (!d._impl) {
+    return;
+  }
+  impl* next_impl = _impl;
+  deleter* next_d = this;
+  while (next_impl) {
+    // d's head node is already part of this chain: link nothing. d still
+    // holds its pointer, so its destructor runs normally on return.
+    if (next_impl == d._impl)
+      return ;
+    // A tagged raw pointer has no 'next' slot; promote it to a real node.
+    if (is_raw_object(next_impl)) {
+      next_d->_impl = next_impl = new free_deleter_impl(to_raw_object(next_impl));
+    }
+    // Node has other holders: splice in a fresh node that keeps the old
+    // node as its object and takes over the old node's 'next'.
+    if (next_impl->refs != 1) {
+      next_d->_impl = next_impl = make_object_deleter_impl(std::move(next_impl->next), deleter(next_impl));
+    }
+    next_d = &next_impl->next;
+    next_impl = next_d->_impl;
+  }
+  // End of chain reached: hand d's node over and empty d so that its
+  // destructor does not release what was just linked.
+  next_d->_impl = d._impl;
+  d._impl = nullptr;
+}
+
+/// Makes a deleter that calls \c std::free() when it is destroyed.
+/// A null \c obj produces an empty deleter instead.
+///
+/// \param obj object to free.
+/// \related deleter
+inline deleter make_free_deleter(void* obj) {
+  if (obj != nullptr) {
+    return deleter(deleter::raw_object_tag(), obj);
+  }
+  return deleter();
+}
+
+/// Makes a deleter that calls \c std::free() when it is destroyed, as well
+/// as invoking the encapsulated action of another deleter.
+///
+/// \param next deleter to invoke.
+/// \param obj object to free.
+/// \related deleter
+inline deleter make_free_deleter(deleter next, void* obj) {
+  return make_deleter(std::move(next), [obj] () mutable { std::free(obj); });
+}
+
+/// Makes a deleter that owns \c obj and destroys it when the deleter is
+/// destroyed.
+///
+/// \note \c obj is passed on with std::move (not std::forward), so it is
+/// moved from even when the caller passes an lvalue.
+/// \see make_deleter(Object)
+/// \related deleter
+template <typename T>
+inline deleter make_object_deleter(T&& obj) {
+  return deleter{make_object_deleter_impl(deleter(), std::move(obj))};
+}
+
+/// Makes a deleter that owns \c obj and chains \c d behind it.
+///
+/// \note \c obj is passed on with std::move (not std::forward), so it is
+/// moved from even when the caller passes an lvalue.
+/// \see make_deleter(deleter, Object)
+/// \related deleter
+template <typename T>
+inline deleter make_object_deleter(deleter d, T&& obj) {
+  return deleter{make_object_deleter_impl(std::move(d), std::move(obj))};
+}
+
+/// @}
+
+#endif /* CEPH_COMMON_DELETER_H */
diff --git a/src/common/dns_resolve.cc b/src/common/dns_resolve.cc
new file mode 100644
index 00000000..7d5c4841
--- /dev/null
+++ b/src/common/dns_resolve.cc
@@ -0,0 +1,371 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <arpa/inet.h>
+
+#include "include/scope_guard.h"
+#include "dns_resolve.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_
+
+
+namespace ceph {
+
+// ResolvHWrapper members are plain pass-throughs to the global resolver
+// functions of the same name; which pair exists depends on HAVE_RES_NQUERY.
+#ifdef HAVE_RES_NQUERY
+
+// Forwards to ::res_nquery() with the caller-supplied resolver state.
+int ResolvHWrapper::res_nquery(res_state s, const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return ::res_nquery(s, hostname, cls, type, buf, bufsz);
+}
+
+// Forwards to ::res_nsearch() with the caller-supplied resolver state.
+int ResolvHWrapper::res_nsearch(res_state s, const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return ::res_nsearch(s, hostname, cls, type, buf, bufsz);
+}
+
+#else
+
+// Forwards to ::res_query().
+int ResolvHWrapper::res_query(const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return ::res_query(hostname, cls, type, buf, bufsz);
+}
+
+// Forwards to ::res_search().
+int ResolvHWrapper::res_search(const char *hostname, int cls,
+    int type, u_char *buf, int bufsz) {
+  return ::res_search(hostname, cls, type, buf, bufsz);
+}
+
+#endif
+
+// Releases every pooled resolver state and the resolv.h wrapper.
+DNSResolver::~DNSResolver()
+{
+#ifdef HAVE_RES_NQUERY
+  for (res_state s : states) {
+    // Each pooled state was set up by res_ninit() in get_state(); close
+    // it so the resolver library releases what it attached to the state
+    // before the structure itself is freed.
+    res_nclose(s);
+    delete s;
+  }
+#endif
+  delete resolv_h;
+}
+
+#ifdef HAVE_RES_NQUERY
+// Hands out a resolver state through *ps: a pooled one when available,
+// otherwise a freshly res_ninit()-ed one. Returns 0 on success and
+// -EINVAL when res_ninit() fails.
+int DNSResolver::get_state(CephContext *cct, res_state *ps)
+{
+  {
+    std::lock_guard l(lock);
+    if (!states.empty()) {
+      *ps = states.front();
+      states.pop_front();
+      return 0;
+    }
+  }
+
+  // Pool was empty: build a new state without holding the lock.
+  auto *fresh = new struct __res_state;
+  fresh->options = 0;
+  if (res_ninit(fresh) < 0) {
+    delete fresh;
+    lderr(cct) << "ERROR: failed to call res_ninit()" << dendl;
+    return -EINVAL;
+  }
+  *ps = fresh;
+  return 0;
+}
+
+// Returns a state obtained from get_state() to the pool, under the lock,
+// so that a later call can reuse it.
+void DNSResolver::put_state(res_state s)
+{
+  std::lock_guard l(lock);
+  states.push_back(s);
+}
+#endif
+
+// Looks up the CNAME record of hostname.
+//
+// *found becomes true, and *cname is filled, only when a CNAME answer was
+// decoded. A failed query is logged but still returns 0 with *found left
+// false. A reply that cannot be walked returns -EINVAL or -EIO, except
+// that a dn_expand() failure inside the answer returns 0 with *found
+// false.
+int DNSResolver::resolve_cname(CephContext *cct, const string& hostname,
+    string *cname, bool *found)
+{
+  *found = false;
+
+#ifdef HAVE_RES_NQUERY
+  res_state res;
+  int r = get_state(cct, &res);
+  if (r < 0) {
+    return r;
+  }
+  // hand the state back to the pool on every exit path
+  auto put_state = make_scope_guard([res, this] {
+      this->put_state(res);
+    });
+#endif
+
+#define LARGE_ENOUGH_DNS_BUFSIZE 1024
+  unsigned char buf[LARGE_ENOUGH_DNS_BUFSIZE];
+
+#define MAX_FQDN_SIZE 255
+  char host[MAX_FQDN_SIZE + 1];
+  const char *origname = hostname.c_str();
+  unsigned char *pt, *answer;
+  unsigned char *answend;
+  int len;
+
+#ifdef HAVE_RES_NQUERY
+  len = resolv_h->res_nquery(res, origname, ns_c_in, ns_t_cname, buf, sizeof(buf));
+#else
+  {
+# ifndef HAVE_THREAD_SAFE_RES_QUERY
+    std::lock_guard l(lock);
+# endif
+    len = resolv_h->res_query(origname, ns_c_in, ns_t_cname, buf, sizeof(buf));
+  }
+#endif
+  if (len < 0) {
+    // not reported as an error to the caller: *found simply stays false
+    lderr(cct) << "res_query() failed" << dendl;
+    return 0;
+  }
+
+  // pt walks the reply, starting right after the fixed-size header
+  answer = buf;
+  pt = answer + NS_HFIXEDSZ;
+  answend = answer + len;
+
+  /* read query */
+  if ((len = dn_expand(answer, answend, pt, host, sizeof(host))) < 0) {
+    lderr(cct) << "ERROR: dn_expand() failed" << dendl;
+    return -EINVAL;
+  }
+  pt += len;
+
+  // the question's type and class (2 bytes each) must still fit
+  if (pt + 4 > answend) {
+    lderr(cct) << "ERROR: bad reply" << dendl;
+    return -EIO;
+  }
+
+  int type;
+  NS_GET16(type, pt);
+
+  if (type != ns_t_cname) {
+    lderr(cct) << "ERROR: failed response type: type=" << type <<
+      " (was expecting " << ns_t_cname << ")" << dendl;
+    return -EIO;
+  }
+
+  pt += NS_INT16SZ; /* class */
+
+  /* read answer */
+  if ((len = dn_expand(answer, answend, pt, host, sizeof(host))) < 0) {
+    return 0;
+  }
+  pt += len;
+  ldout(cct, 20) << "name=" << host << dendl;
+
+  // type, class, ttl and rdata size: 2 + 2 + 4 + 2 bytes
+  if (pt + 10 > answend) {
+    lderr(cct) << "ERROR: bad reply" << dendl;
+    return -EIO;
+  }
+
+  // the answer's type is read here but not examined afterwards
+  NS_GET16(type, pt);
+  pt += NS_INT16SZ; /* class */
+  pt += NS_INT32SZ; /* ttl */
+  pt += NS_INT16SZ; /* size */
+
+  // the record data is the (possibly compressed) canonical name
+  if ((len = dn_expand(answer, answend, pt, host, sizeof(host))) < 0) {
+    return 0;
+  }
+  ldout(cct, 20) << "cname host=" << host << dendl;
+  *cname = host;
+
+  *found = true;
+  return 0;
+}
+
+
+// Public entry point: resolves hostname into *addr.
+//
+// With HAVE_RES_NQUERY a resolver state is borrowed from the pool for the
+// duration of the call and passed to the private overload; otherwise the
+// private overload is called with a NULL state.
+int DNSResolver::resolve_ip_addr(CephContext *cct, const string& hostname,
+    entity_addr_t *addr) {
+
+#ifdef HAVE_RES_NQUERY
+  res_state res;
+  int r = get_state(cct, &res);
+  if (r < 0) {
+    return r;
+  }
+  // hand the state back to the pool when this function returns
+  auto put_state = make_scope_guard([res, this] {
+      this->put_state(res);
+    });
+  return this->resolve_ip_addr(cct, &res, hostname, addr);
+#else
+  return this->resolve_ip_addr(cct, NULL, hostname, addr);
+#endif
+
+}
+
+// Resolves hostname to one address using a single A query, or an AAAA
+// query when ms_bind_ipv6 is set.
+//
+// res is only dereferenced when HAVE_RES_NQUERY is defined; callers pass
+// NULL otherwise. The first answer record that really is an address of
+// the requested family is used, so records of another type in front of
+// it (for example a CNAME) are skipped instead of being decoded as an
+// address.
+//
+// Returns 0 and fills *addr on success. On failure returns the negative
+// value reported by the resolver library, -EINVAL for a reply that cannot
+// be decoded, or -1 when the reply carries no usable address.
+int DNSResolver::resolve_ip_addr(CephContext *cct, res_state *res, const string& hostname,
+    entity_addr_t *addr) {
+
+  u_char nsbuf[NS_PACKETSZ];
+  int len;
+  const bool ipv6 = cct->_conf->ms_bind_ipv6;
+  const int family = ipv6 ? AF_INET6 : AF_INET;
+  const int type = ipv6 ? ns_t_aaaa : ns_t_a;
+  // size of the raw address expected in the record data
+  const int addr_len = ipv6 ? NS_IN6ADDRSZ : NS_INADDRSZ;
+
+#ifdef HAVE_RES_NQUERY
+  len = resolv_h->res_nquery(*res, hostname.c_str(), ns_c_in, type, nsbuf, sizeof(nsbuf));
+#else
+  {
+# ifndef HAVE_THREAD_SAFE_RES_QUERY
+    std::lock_guard l(lock);
+# endif
+    len = resolv_h->res_query(hostname.c_str(), ns_c_in, type, nsbuf, sizeof(nsbuf));
+  }
+#endif
+  if (len < 0) {
+    lderr(cct) << "res_query() failed" << dendl;
+    return len;
+  }
+  else if (len == 0) {
+    ldout(cct, 20) << "no address found for hostname " << hostname << dendl;
+    return -1;
+  }
+
+  ns_msg handle;
+  if (ns_initparse(nsbuf, len, &handle) < 0) {
+    lderr(cct) << "error while parsing DNS reply" << dendl;
+    return -EINVAL;
+  }
+
+  const int num_answers = ns_msg_count(handle, ns_s_an);
+  for (int i = 0; i < num_answers; i++) {
+    ns_rr rr;
+    int r;
+    if ((r = ns_parserr(&handle, ns_s_an, i, &rr)) < 0) {
+      lderr(cct) << "error while parsing DNS record" << dendl;
+      return r;
+    }
+
+    // Only a record of the requested type with a full-size address may be
+    // handed to inet_ntop(), which reads addr_len bytes of record data.
+    if (ns_rr_type(rr) != type || ns_rr_rdlen(rr) != addr_len) {
+      continue;
+    }
+
+    char addr_buf[64];
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(addr_buf, 0, sizeof(addr_buf));
+    if (inet_ntop(family, ns_rr_rdata(rr), addr_buf, sizeof(addr_buf)) == NULL) {
+      lderr(cct) << "failed to convert address for hostname " << hostname
+        << dendl;
+      return -EINVAL;
+    }
+    if (!addr->parse(addr_buf)) {
+      // log the printable form; the record data itself is raw binary
+      lderr(cct) << "failed to parse address '" << addr_buf << "'" << dendl;
+      return -1;
+    }
+    return 0;
+  }
+
+  ldout(cct, 20) << "no address found for hostname " << hostname << dendl;
+  return -1;
+}
+
+// Convenience overload: same as the five-argument version with an empty
+// domain.
+int DNSResolver::resolve_srv_hosts(CephContext *cct, const string& service_name, 
+    const SRV_Protocol trans_protocol,
+    map<string, DNSResolver::Record> *srv_hosts) {
+  return this->resolve_srv_hosts(cct, service_name, trans_protocol, "", srv_hosts);
+}
+
+// Looks up the SRV records of "_<service_name>._<proto>[.<domain>]" and
+// resolves the address of every target they name.
+//
+// Each target that resolves is stored in *srv_hosts, keyed by the target
+// name with the service's domain suffix cut off, together with the
+// record's priority and an address carrying the record's port. Targets
+// that fail to resolve are left out.
+//
+// Returns 0 on success, including when no record exists. Returns the
+// negative value reported by the resolver library when the query or a
+// record parse fails, and -EINVAL when the reply cannot be decoded, when
+// a record name lacks the "_<proto>" label, or when a target lies
+// outside the service's domain.
+int DNSResolver::resolve_srv_hosts(CephContext *cct, const string& service_name,
+    const SRV_Protocol trans_protocol, const string& domain,
+    map<string, DNSResolver::Record> *srv_hosts) {
+
+#ifdef HAVE_RES_NQUERY
+  res_state res;
+  int r = get_state(cct, &res);
+  if (r < 0) {
+    return r;
+  }
+  // hand the state back to the pool on every exit path
+  auto put_state = make_scope_guard([res, this] {
+      this->put_state(res);
+    });
+#endif
+
+  u_char nsbuf[NS_PACKETSZ];
+  int num_hosts;
+
+  string proto_str = srv_protocol_to_str(trans_protocol);
+  string query_str = "_"+service_name+"._"+proto_str+(domain.empty() ? ""
+      : "."+domain);
+  int len;
+
+#ifdef HAVE_RES_NQUERY
+  len = resolv_h->res_nsearch(res, query_str.c_str(), ns_c_in, ns_t_srv, nsbuf,
+      sizeof(nsbuf));
+#else
+  {
+# ifndef HAVE_THREAD_SAFE_RES_QUERY
+    std::lock_guard l(lock);
+# endif
+    len = resolv_h->res_search(query_str.c_str(), ns_c_in, ns_t_srv, nsbuf,
+        sizeof(nsbuf));
+  }
+#endif
+  if (len < 0) {
+    lderr(cct) << "failed for service " << query_str << dendl;
+    return len;
+  }
+  else if (len == 0) {
+    ldout(cct, 20) << "No hosts found for service " << query_str << dendl;
+    return 0;
+  }
+
+  ns_msg handle;
+
+  if (ns_initparse(nsbuf, len, &handle) < 0) {
+    lderr(cct) << "Error while parsing DNS reply" << dendl;
+    return -EINVAL;
+  }
+
+  num_hosts = ns_msg_count (handle, ns_s_an);
+  if (num_hosts == 0) {
+    ldout(cct, 20) << "No hosts found for service " << query_str << dendl;
+    return 0;
+  }
+
+  ns_rr rr;
+  char full_target[NS_MAXDNAME];
+  // label that separates the service name from its domain in record names
+  const string protocol = "_" + proto_str;
+
+  for (int i = 0; i < num_hosts; i++) {
+    int r;
+    if ((r = ns_parserr(&handle, ns_s_an, i, &rr)) < 0) {
+      lderr(cct) << "Error while parsing DNS record" << dendl;
+      return r;
+    }
+
+    // SRV record data starts with priority, weight and port (16 bits
+    // each); anything else must not be decoded as such.
+    if (ns_rr_type(rr) != ns_t_srv || ns_rr_rdlen(rr) < 3 * NS_INT16SZ) {
+      continue;
+    }
+
+    string full_srv_name = ns_rr_name(rr);
+    auto proto_pos = full_srv_name.find(protocol);
+    if (proto_pos == full_srv_name.npos) {
+      // without this check npos + length would wrap to a bogus offset
+      lderr(cct) << "unexpected SRV record name: " << full_srv_name << dendl;
+      return -EINVAL;
+    }
+    string srv_domain = full_srv_name.substr(proto_pos + protocol.length());
+
+    auto rdata = ns_rr_rdata(rr);
+    uint16_t priority = ns_get16(rdata); rdata += NS_INT16SZ;
+    rdata += NS_INT16SZ;	// weight
+    uint16_t port = ns_get16(rdata); rdata += NS_INT16SZ;
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(full_target, 0, sizeof(full_target));
+    if (ns_name_uncompress(ns_msg_base(handle), ns_msg_end(handle),
+                           rdata, full_target, sizeof(full_target)) < 0) {
+      // no usable target name, so there is nothing to resolve
+      lderr(cct) << "Error while decoding SRV target" << dendl;
+      continue;
+    }
+
+    entity_addr_t addr;
+#ifdef HAVE_RES_NQUERY
+    r = this->resolve_ip_addr(cct, &res, full_target, &addr);
+#else
+    r = this->resolve_ip_addr(cct, NULL, full_target, &addr);
+#endif
+
+    if (r == 0) {
+      addr.set_port(port);
+      string target = full_target;
+      auto end = target.find(srv_domain);
+      if (end == target.npos) {
+        lderr(cct) << "resolved target not in search domain: "
+                   << target << " / " << srv_domain << dendl;
+        return -EINVAL;
+      }
+      target = target.substr(0, end);
+      (*srv_hosts)[target] = {priority, addr};
+    }
+  }
+  return 0;
+}
+
+}
diff --git a/src/common/dns_resolve.h b/src/common/dns_resolve.h
new file mode 100644
index 00000000..f5d94109
--- /dev/null
+++ b/src/common/dns_resolve.h
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 SUSE LINUX GmbH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_DNS_RESOLVE_H
+#define CEPH_DNS_RESOLVE_H
+
+#include <netinet/in.h>
+#include <resolv.h>
+
+#include "common/ceph_mutex.h"
+#include "msg/msg_types.h"		// for entity_addr_t
+
+namespace ceph {
+
+/**
+ * this class is used to facilitate the testing of
+ * resolv.h functions.
+ *
+ * Every member is virtual so that a subclass can replace it. Only one
+ * pair of functions is declared, selected by HAVE_RES_NQUERY: the
+ * res_n* variants take an explicit resolver state, the others do not.
+ */
+class ResolvHWrapper {
+  public:
+    virtual ~ResolvHWrapper() {}
+
+#ifdef HAVE_RES_NQUERY
+    virtual int res_nquery(res_state s, const char *hostname, int cls, int type, 
+        u_char *buf, int bufsz);
+
+    virtual int res_nsearch(res_state s, const char *hostname, int cls, int type, 
+        u_char *buf, int bufsz);
+#else
+    virtual int res_query(const char *hostname, int cls, int type,
+        u_char *buf, int bufsz);
+
+    virtual int res_search(const char *hostname, int cls, int type,
+        u_char *buf, int bufsz);
+#endif
+
+};
+
+
+/**
+ * @class DNSResolver
+ *
+ * This is a singleton class that exposes the functionality of DNS querying.
+ */
+class DNSResolver {
+
+  public:
+    // singleton declaration
+    static DNSResolver *get_instance()
+    {
+      static DNSResolver instance;
+      return &instance;
+    }
+    DNSResolver(DNSResolver const&) = delete;
+    void operator=(DNSResolver const&) = delete;
+
+    // this function is used by the unit test
+    // It deletes the wrapper currently installed in the singleton and
+    // installs resolv_wrapper in its place; the singleton deletes it later.
+    static DNSResolver *get_instance(ResolvHWrapper *resolv_wrapper) {
+      DNSResolver *resolv = DNSResolver::get_instance();
+      delete resolv->resolv_h;
+      resolv->resolv_h = resolv_wrapper;
+      return resolv;
+    }
+
+    enum class SRV_Protocol {
+      TCP, UDP
+    };
+
+
+    // One resolved SRV target: the record's priority and the target's
+    // address (with the record's port applied).
+    struct Record {
+      uint16_t priority;
+      entity_addr_t addr;
+    };
+
+    /**
+     * Looks up the CNAME record of a hostname.
+     *
+     * @param hostname the hostname to look up
+     * @param[out] cname the canonical name, set only when found is true
+     * @param[out] found whether a CNAME answer was decoded
+     * @returns 0 on success (also when the query itself fails, in which
+     *          case found is false), negative error code on a bad reply
+     */
+    int resolve_cname(CephContext *cct, const std::string& hostname,
+        std::string *cname, bool *found);
+
+    /**
+     * Resolves the address given a hostname.
+     *
+     * @param hostname the hostname to resolve
+     * @param[out] addr the hostname's address
+     * @returns 0 on success, negative error code on failure
+     */
+    int resolve_ip_addr(CephContext *cct, const std::string& hostname,
+        entity_addr_t *addr);
+
+    /**
+     * Returns the list of hostnames and addresses that provide a given
+     * service configured as DNS SRV records.
+     *
+     * @param service_name the service name
+     * @param trans_protocol the IP protocol used by the service (TCP or UDP)
+     * @param[out] srv_hosts the hostname to address map of available hosts
+     *             providing the service. If no host exists the map is not
+     *             changed.
+     * @returns 0 on success, negative error code on failure
+     */
+    int resolve_srv_hosts(CephContext *cct, const std::string& service_name,
+        const SRV_Protocol trans_protocol, std::map<std::string, Record> *srv_hosts);
+
+    /**
+     * Returns the list of hostnames and addresses that provide a given
+     * service configured as DNS SRV records.
+     *
+     * @param service_name the service name
+     * @param trans_protocol the IP protocol used by the service (TCP or UDP)
+     * @param domain the domain of the service
+     * @param[out] srv_hosts the hostname to address map of available hosts
+     *             providing the service. If no host exists the map is not
+     *             changed.
+     * @returns 0 on success, negative error code on failure
+     */
+    int resolve_srv_hosts(CephContext *cct, const std::string& service_name,
+        const SRV_Protocol trans_protocol, const std::string& domain,
+        std::map<std::string, Record> *srv_hosts);
+
+  private:
+    DNSResolver() { resolv_h = new ResolvHWrapper(); }
+    ~DNSResolver();
+
+    // Guards the state pool; also taken around res_query()/res_search()
+    // when HAVE_THREAD_SAFE_RES_QUERY is not defined.
+    ceph::mutex lock = ceph::make_mutex("DNSResolver::lock");
+    // Owned; replaced by get_instance(ResolvHWrapper*) in unit tests.
+    ResolvHWrapper *resolv_h;
+#ifdef HAVE_RES_NQUERY
+    // Pool of initialised resolver states available for reuse.
+    std::list<res_state> states;
+
+    // Borrow a state from the pool (creating one if it is empty) and
+    // give it back afterwards.
+    int get_state(CephContext *cct, res_state *ps);
+    void put_state(res_state s);
+#endif
+
+    /* this private function allows to reuse the res_state structure used
+     * by other function of this class
+     */
+    int resolve_ip_addr(CephContext *cct, res_state *res,
+        const std::string& hostname, entity_addr_t *addr);
+
+    // Lower-case protocol label used in SRV query names; an empty string
+    // for a value outside the enum.
+    std::string srv_protocol_to_str(SRV_Protocol proto) {
+      switch (proto) {
+        case SRV_Protocol::TCP:
+          return "tcp";
+        case SRV_Protocol::UDP:
+          return "udp";
+      }
+      return "";
+    }
+
+};
+
+}
+
+#endif
+
diff --git a/src/common/dout.cc b/src/common/dout.cc
new file mode 100644
index 00000000..4bbbfc8f
--- /dev/null
+++ b/src/common/dout.cc
@@ -0,0 +1,14 @@
+
+#include <iostream>
+
+// Write the C string straight to std::cerr and flush the stream right away.
+void dout_emergency(const char * const str)
+{
+  std::cerr << str << std::flush;
+}
+
+// std::string overload: write the text to std::cerr and flush the stream
+// right away.
+void dout_emergency(const std::string &str)
+{
+  std::cerr << str << std::flush;
+}
diff --git a/src/common/dout.h b/src/common/dout.h
new file mode 100644
index 00000000..e1bfdca7
--- /dev/null
+++ b/src/common/dout.h
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_DOUT_H
+#define CEPH_DOUT_H
+
+#include <type_traits>
+
+#include "include/ceph_assert.h"
+#ifdef WITH_SEASTAR
+#include <seastar/util/log.hh>
+#include "crimson/common/log.h"
+#include "crimson/common/config_proxy.h"
+#else
+#include "global/global_context.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "common/likely.h"
+#include "common/Clock.h"
+#include "log/Log.h"
+#endif
+
+extern void dout_emergency(const char * const str);
+extern void dout_emergency(const std::string &str);
+
+// Intentionally conflict with endl: a global object named `endl` of a
+// dedicated marker type.  Streaming that marker into a std::ostream aborts
+// with a message pointing at std::endl / dendl.
+class _bad_endl_use_dendl_t {
+public:
+  _bad_endl_use_dendl_t(int) {}
+};
+
+static const _bad_endl_use_dendl_t endl = 0;
+
+inline std::ostream& operator<<(std::ostream& out, _bad_endl_use_dendl_t)
+{
+  ceph_abort_msg("you are using the wrong endl.. use std::endl or dendl");
+  return out;
+}
+
+// Interface consumed by ldpp_dout(): supplies the CephContext, the
+// subsystem id and the prefix generator used for a log line.
+class DoutPrefixProvider {
+public:
+  // The implementations in this header write their prefix to `out` and
+  // return the same stream.
+  virtual std::ostream& gen_prefix(std::ostream& out) const = 0;
+
+  // Context whose configuration and log the message goes through.
+  virtual CephContext *get_cct() const = 0;
+
+  // Subsystem id the message is logged under.
+  virtual unsigned get_subsys() const = 0;
+
+  virtual ~DoutPrefixProvider() = default;
+};
+
+// A prefix provider with an empty prefix: gen_prefix() returns the stream
+// untouched, and the context/subsystem are the ones given at construction.
+class NoDoutPrefix : public DoutPrefixProvider {
+ public:
+  NoDoutPrefix(CephContext *cct_, unsigned subsys_)
+    : cct(cct_), subsys(subsys_) {}
+
+  std::ostream& gen_prefix(std::ostream& out) const override { return out; }
+  CephContext *get_cct() const override { return cct; }
+  unsigned get_subsys() const override { return subsys; }
+
+ private:
+  CephContext *const cct;
+  const unsigned subsys;
+};
+
+// A prefix provider with a static (const char*) prefix.  Only the pointer
+// is stored, not a copy of the text.
+class DoutPrefix : public NoDoutPrefix {
+ public:
+  DoutPrefix(CephContext *cct_, unsigned subsys_, const char *prefix_)
+    : NoDoutPrefix(cct_, subsys_), prefix(prefix_) {}
+
+  std::ostream& gen_prefix(std::ostream& out) const override {
+    out << prefix;
+    return out;
+  }
+
+ private:
+  const char *const prefix;
+};
+
+// A prefix provider that composes itself on top of another one: the wrapped
+// provider writes its prefix first, then add_prefix() appends this one's
+// part.  Context and subsystem are forwarded from the wrapped provider,
+// which is held by reference.
+class DoutPrefixPipe : public DoutPrefixProvider {
+ public:
+  DoutPrefixPipe(const DoutPrefixProvider& parent) : dpp(parent) {}
+
+  std::ostream& gen_prefix(std::ostream& out) const override final {
+    dpp.gen_prefix(out);
+    add_prefix(out);
+    return out;
+  }
+  CephContext *get_cct() const override { return dpp.get_cct(); }
+  unsigned get_subsys() const override { return dpp.get_subsys(); }
+
+  // Hook for subclasses: append this provider's own prefix text to `out`.
+  virtual void add_prefix(std::ostream& out) const = 0;
+
+ private:
+  const DoutPrefixProvider& dpp;
+};
+
+// helpers
+namespace ceph::dout {
+
+// Thin wrapper that tags a value; it converts back to T implicitly.
+template<typename T>
+struct dynamic_marker_t {
+  T value;
+  operator T() const { return value; }
+};
+
+// Wrap `t` in a dynamic_marker_t so that is_dynamic<> reports true for the
+// resulting type.
+template<typename T>
+dynamic_marker_t<T> need_dynamic(T&& t) {
+  return { std::forward<T>(t) };
+}
+
+// Trait: true_type only for dynamic_marker_t specializations.
+template<typename T>
+struct is_dynamic : std::false_type {};
+
+template<typename T>
+struct is_dynamic<dynamic_marker_t<T>> : std::true_type {};
+
+} // ceph::dout
+
+// generic macros
+
+// Expression the ldout/lsubdout/lderr macros stream into: here simply the
+// stream that dout_impl leaves in `_dout`.
+#define dout_prefix *_dout
+
+// dout_impl(cct, sub, v) and dendl_impl are two halves of one statement:
+// dout_impl opens `do { ... if (gather) {` and leaves a std::ostream* named
+// `_dout` in scope; the caller streams into it; dendl_impl ends that
+// expression, hands the message to the logger and closes the scopes with
+// `} while (0)`.
+//
+// Seastar build: the text is collected in a std::ostringstream and passed
+// to the subsystem's seastar::logger, choosing the method from the level:
+// v < 0 error, v < 1 warn, v < 5 info, v < 10 debug, otherwise trace.
+#ifdef WITH_SEASTAR
+#define dout_impl(cct, sub, v)                                          \
+  do {                                                                  \
+    if (ceph::common::local_conf()->subsys.should_gather(sub, v)) {     \
+      seastar::logger& _logger = ceph::get_logger(sub);                 \
+      const auto _lv = v;                                               \
+      std::ostringstream _out;                                          \
+      std::ostream* _dout = &_out;
+#define dendl_impl                              \
+     "";                                        \
+      const std::string _s = _out.str();        \
+      if (_lv < 0) {                            \
+        _logger.error(_s.c_str());              \
+      } else if (_lv < 1) {                     \
+        _logger.warn(_s.c_str());               \
+      } else if (_lv < 5) {                     \
+        _logger.info(_s.c_str());               \
+      } else if (_lv < 10) {                    \
+        _logger.debug(_s.c_str());              \
+      } else {                                  \
+        _logger.trace(_s.c_str());              \
+      }                                         \
+    }                                           \
+  } while (0)
+#else
+// Non-seastar build: the gather decision is made by a lambda.  When `sub`
+// or `v` has a type for which ceph::dout::is_dynamic is true, the run-time
+// should_gather(sub, v) is called; otherwise the template form
+// should_gather<sub, v>() is used.  If the message is gathered, a
+// ceph::logging::MutableEntry is created, `_dout` points at its stream, and
+// dendl_impl flushes it and submits the entry to the context's log.
+#define dout_impl(cct, sub, v)						\
+  do {									\
+  const bool should_gather = [&](const auto cctX) {			\
+    if constexpr (ceph::dout::is_dynamic<decltype(sub)>::value ||	\
+		  ceph::dout::is_dynamic<decltype(v)>::value) {		\
+      return cctX->_conf->subsys.should_gather(sub, v);			\
+    } else {								\
+      /* The parentheses are **essential** because commas in angle	\
+       * brackets are NOT ignored on macro expansion! A language's	\
+       * limitation, sorry. */						\
+      return (cctX->_conf->subsys.template should_gather<sub, v>());	\
+    }									\
+  }(cct);								\
+									\
+  if (should_gather) {							\
+    ceph::logging::MutableEntry _dout_e(v, sub);                        \
+    static_assert(std::is_convertible<decltype(&*cct), 			\
+				      CephContext* >::value,		\
+		  "provided cct must be compatible with CephContext*"); \
+    auto _dout_cct = cct;						\
+    std::ostream* _dout = &_dout_e.get_ostream();
+
+#define dendl_impl std::flush;                                          \
+    _dout_cct->_log->submit_entry(std::move(_dout_e));                  \
+  }                                                                     \
+  } while (0)
+#endif	// WITH_SEASTAR
+
+// Logging front ends.  Each expands to dout_impl(...) followed by the
+// stream expression, so a use reads `ldout(cct, 10) << "text" << dendl;`.
+//   lsubdout - explicit subsystem, given as the suffix of ceph_subsys_<sub>
+//   ldout    - subsystem taken from dout_subsys, which is not defined in
+//              this header
+//   lderr    - subsystem ceph_subsys_, level -1
+#define lsubdout(cct, sub, v)  dout_impl(cct, ceph_subsys_##sub, v) dout_prefix
+#define ldout(cct, v)  dout_impl(cct, dout_subsys, v) dout_prefix
+#define lderr(cct) dout_impl(cct, ceph_subsys_, -1) dout_prefix
+
+// ldpp_dout: context, subsystem and prefix all come from a
+// DoutPrefixProvider pointer; nothing is logged when the pointer is null.
+// The subsystem is wrapped with need_dynamic() so dout_impl takes the
+// run-time should_gather path.
+// NOTE(review): this expands to an unbraced `if`, so an `else` written
+// after an ldpp_dout statement nested in another unbraced `if` would bind
+// to the macro's `if` - confirm callers always use braces.
+#define ldpp_dout(dpp, v) 						\
+  if (decltype(auto) pdpp = (dpp); pdpp) /* workaround -Wnonnull-compare for 'this' */ \
+    dout_impl(pdpp->get_cct(), ceph::dout::need_dynamic(pdpp->get_subsys()), v) \
+      pdpp->gen_prefix(*_dout)
+
+// lgeneric_* variants stream into *_dout directly instead of going through
+// dout_prefix.
+#define lgeneric_subdout(cct, sub, v) dout_impl(cct, ceph_subsys_##sub, v) *_dout
+#define lgeneric_dout(cct, v) dout_impl(cct, ceph_subsys_, v) *_dout
+#define lgeneric_derr(cct) dout_impl(cct, ceph_subsys_, -1) *_dout
+
+// Evaluates to true when a message for subsystem `sub` at level `lvl`
+// would be gathered by the configuration reachable through `cct`.
+// Every macro argument is parenthesized, so an expression argument (for
+// example a conditional) expands with the intended precedence.
+#define ldlog_p1(cct, sub, lvl)                 \
+  ((cct)->_conf->subsys.should_gather((sub), (lvl)))
+
+#define dendl dendl_impl
+
+#endif
diff --git a/src/common/dummy.cc b/src/common/dummy.cc
new file mode 100644
index 00000000..26267142
--- /dev/null
+++ b/src/common/dummy.cc
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/*
+ * A dummy file with a .cc extension to make autotools link
+ * ceph_test_librbd_fsx with a C++ linker.  An approach w/o a physical
+ * dummy.cc recommended in 8.3.5 Libtool Convenience Libraries works,
+ * but breaks 'make tags' and friends.
+ */
diff --git a/src/common/entity_name.cc b/src/common/entity_name.cc
new file mode 100644
index 00000000..b9c33d7d
--- /dev/null
+++ b/src/common/entity_name.cc
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/entity_name.h"
+
+#include <sstream>
+
+using std::string;
+
+extern const char *ceph_entity_type_name(int type);
+
+// Table pairing each entity type constant with its textual name.  It is
+// scanned linearly by str_to_ceph_entity_type() and
+// get_valid_types_as_str(); the element count (6) has to match the
+// declaration in entity_name.h.
+const std::array<EntityName::str_to_entity_type_t, 6> EntityName::STR_TO_ENTITY_TYPE = {{
+  { CEPH_ENTITY_TYPE_AUTH, "auth" },
+  { CEPH_ENTITY_TYPE_MON, "mon" },
+  { CEPH_ENTITY_TYPE_OSD, "osd" },
+  { CEPH_ENTITY_TYPE_MDS, "mds" },
+  { CEPH_ENTITY_TYPE_MGR, "mgr" },
+  { CEPH_ENTITY_TYPE_CLIENT, "client" },
+}};
+
+// Cached "type.id" form of the name, as built by set().
+const std::string& EntityName::to_str() const
+{
+  return this->type_id;
+}
+
+// Same text as to_str(), as a C string owned by this object.
+const char* EntityName::to_cstr() const
+{
+  return to_str().c_str();
+}
+
+// Parse "<type>.<id>", splitting at the first '.'.
+// Returns false when there is no '.' or when set() rejects the type name;
+// returns true once the name has been assigned.
+bool EntityName::from_str(const string& s)
+{
+  const size_t dot = s.find('.');
+  if (dot == string::npos) {
+    return false;
+  }
+
+  // set() returns 0 on success and non-zero for an unknown type
+  return set(s.substr(0, dot), s.substr(dot + 1)) == 0;
+}
+
+// Assign both components and rebuild the cached "type.id" string.  A type
+// of 0 leaves the cached string empty.
+void EntityName::set(uint32_t type_, const std::string &id_)
+{
+  type = type_;
+  id = id_;
+
+  if (!type) {
+    type_id.clear();
+    return;
+  }
+
+  std::ostringstream name;
+  name << ceph_entity_type_name(type_) << "." << id_;
+  type_id = name.str();
+}
+
+// Assign from a textual type name.  Returns -EINVAL and leaves the object
+// untouched when the name is not one of the known entity types, otherwise
+// stores the name and returns 0.
+int EntityName::set(const std::string &type_, const std::string &id_)
+{
+  const uint32_t parsed = str_to_ceph_entity_type(type_.c_str());
+  if (parsed != CEPH_ENTITY_TYPE_ANY) {
+    set(parsed, id_);
+    return 0;
+  }
+  return -EINVAL;
+}
+
+// Replace the numeric type and keep the current id.
+void EntityName::set_type(uint32_t type_)
+{
+  set(type_, this->id);
+}
+
+// Replace the type, given by name, and keep the current id.  Returns the
+// result of set(): 0 on success, -EINVAL for an unknown type name.
+int EntityName::set_type(const char *type_)
+{
+  const int rc = set(type_, id);
+  return rc;
+}
+
+// Replace the id and keep the current type.
+void EntityName::set_id(const std::string &id_)
+{
+  set(this->type, id_);
+}
+
+// Set this name from an entity_name_t: its type is used as the entity type
+// and its number, printed in decimal, becomes the id.
+void EntityName::set_name(entity_name_t n)
+{
+  // std::to_string(long long) yields the same text as "%lld" without the
+  // fixed-size stack buffer and unbounded sprintf the code used before.
+  set(n.type(), std::to_string(static_cast<long long>(n.num())));
+}
+
+// Textual name of the entity type; same value as get_type_name().
+const char* EntityName::get_type_str() const
+{
+  return get_type_name();
+}
+
+// Textual name of the entity type, as reported by ceph_entity_type_name().
+const char *EntityName::get_type_name() const
+{
+  const char *name = ceph_entity_type_name(type);
+  return name;
+}
+
+// The id component (the part after the '.').
+const std::string &EntityName::get_id() const
+{
+  return this->id;
+}
+
+// True when the id is exactly "admin".
+bool EntityName::has_default_id() const
+{
+  return id.compare("admin") == 0;
+}
+
+// Comma-separated list ("auth, mon, ...") of every type name in
+// STR_TO_ENTITY_TYPE, in table order.
+std::string EntityName::get_valid_types_as_str()
+{
+  std::ostringstream out;
+  const char *sep = "";
+  for (const auto& entry : STR_TO_ENTITY_TYPE) {
+    out << sep << entry.str;
+    sep = ", ";
+  }
+  return out.str();
+}
+
+// Translate a type name into its CEPH_ENTITY_TYPE_* constant.  The match is
+// exact; CEPH_ENTITY_TYPE_ANY is returned when no table entry matches.
+uint32_t EntityName::str_to_ceph_entity_type(std::string_view s)
+{
+  for (const auto& entry : STR_TO_ENTITY_TYPE) {
+    if (s == entry.str) {
+      return entry.type;
+    }
+  }
+  return CEPH_ENTITY_TYPE_ANY;
+}
+
+// Order by numeric type first, then by id.
+bool operator<(const EntityName& a, const EntityName& b)
+{
+  if (a.type != b.type) {
+    return a.type < b.type;
+  }
+  return a.id < b.id;
+}
+
+// Stream the "type.id" form of the name.
+std::ostream& operator<<(std::ostream& out, const EntityName& n)
+{
+  out << n.to_str();
+  return out;
+}
diff --git a/src/common/entity_name.h b/src/common/entity_name.h
new file mode 100644
index 00000000..be194ed7
--- /dev/null
+++ b/src/common/entity_name.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ENTITY_NAME_H
+#define CEPH_COMMON_ENTITY_NAME_H
+
+#include <ifaddrs.h>
+
+#include "msg/msg_types.h"
+
+/* Represents a Ceph entity name of the form "<type>.<id>".
+ *
+ * For example, mds.0 is the name of the first metadata server, and
+ * client.admin is a client whose id is "admin".
+ */
+struct EntityName
+{
+  // Serialize the numeric type followed by the id.  The cached type_id
+  // string is not encoded.
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(type, bl);
+    encode(id, bl);
+  }
+  // Decode type and id, then go through set() so that type_id is rebuilt.
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    uint32_t type_;
+    std::string id_;
+    decode(type_, bl);
+    decode(id_, bl);
+    set(type_, id_);
+  }
+
+  // "type.id" text, as a string reference or as a C string
+  const std::string& to_str() const;
+  const char *to_cstr() const;
+  // parse "type.id"; returns false on a missing '.' or an unknown type
+  bool from_str(const std::string& s);
+  // assign by numeric type
+  void set(uint32_t type_, const std::string &id_);
+  // assign by type name; returns 0 or -EINVAL for an unknown type name
+  int set(const std::string &type_, const std::string &id_);
+  // replace one component and keep the other
+  void set_type(uint32_t type_);
+  int set_type(const char *type);
+  void set_id(const std::string &id_);
+  // assign from an entity_name_t (its type plus its number in decimal)
+  void set_name(entity_name_t n);
+
+  // textual name of the type; returns the same value as get_type_name()
+  const char* get_type_str() const;
+
+  uint32_t get_type() const { return type; }
+  bool is_osd() const { return get_type() == CEPH_ENTITY_TYPE_OSD; }
+  bool is_mgr() const { return get_type() == CEPH_ENTITY_TYPE_MGR; }
+  bool is_mds() const { return get_type() == CEPH_ENTITY_TYPE_MDS; }
+  bool is_client() const { return get_type() == CEPH_ENTITY_TYPE_CLIENT; }
+  bool is_mon() const { return get_type() == CEPH_ENTITY_TYPE_MON; }
+
+  const char * get_type_name() const;
+  const std::string &get_id() const;
+  // true when the id is "admin"
+  bool has_default_id() const;
+
+  // comma-separated list of the known type names
+  static std::string get_valid_types_as_str();
+  // type name -> CEPH_ENTITY_TYPE_* constant; CEPH_ENTITY_TYPE_ANY if unknown
+  static uint32_t str_to_ceph_entity_type(std::string_view);
+
+  friend bool operator<(const EntityName& a, const EntityName& b);
+  friend std::ostream& operator<<(std::ostream& out, const EntityName& n);
+  friend bool operator==(const EntityName& a, const EntityName& b);
+  friend bool operator!=(const EntityName& a, const EntityName& b);
+
+private:
+  // one row of the type-name lookup table
+  struct str_to_entity_type_t {
+    uint32_t type;
+    const char *str;
+  };
+  static const std::array<str_to_entity_type_t, 6> STR_TO_ENTITY_TYPE;
+
+  uint32_t type = 0;    // CEPH_ENTITY_TYPE_* value; 0 means unset
+  std::string id;       // part after the '.'
+  std::string type_id;  // cached "type.id", rebuilt by set()
+};
+
+WRITE_CLASS_ENCODER(EntityName)
+
+WRITE_EQ_OPERATORS_2(EntityName, type, id)
+
+#endif
diff --git a/src/common/environment.cc b/src/common/environment.cc
new file mode 100644
index 00000000..a71bb346
--- /dev/null
+++ b/src/common/environment.cc
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/environment.h"
+
+#include <stdlib.h>
+#include <strings.h>
+
+// Read an environment variable as a boolean.
+// Returns false when the variable is unset or when its value is, ignoring
+// case, one of "off", "no", "false" or "0".  Any other value, including the
+// empty string, yields true.
+bool get_env_bool(const char *key)
+{
+  const char *val = getenv(key);
+  if (val == nullptr) {
+    return false;
+  }
+
+  static const char *const falsy[] = { "off", "no", "false", "0" };
+  for (const char *word : falsy) {
+    if (strcasecmp(val, word) == 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Read an environment variable as an int using atoi().
+// Returns 0 when the variable is unset.
+int get_env_int(const char *key)
+{
+  const char *val = getenv(key);
+  return val ? atoi(val) : 0;
+}
diff --git a/src/common/environment.h b/src/common/environment.h
new file mode 100644
index 00000000..9967a0ba
--- /dev/null
+++ b/src/common/environment.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_ENVIRONMENT_H
+#define CEPH_COMMON_ENVIRONMENT_H
+
+extern bool get_env_bool(const char *key);
+extern int get_env_int(const char *key);
+
+#endif
diff --git a/src/common/errno.cc b/src/common/errno.cc
new file mode 100644
index 00000000..69e54254
--- /dev/null
+++ b/src/common/errno.cc
@@ -0,0 +1,28 @@
+#include "common/errno.h"
+#include "acconfig.h"
+
+#include <sstream>
+#include <string.h>
+
+std::string cpp_strerror(int err)
+{
+  char buf[128];
+  char *errmsg;
+
+  if (err < 0)
+    err = -err;
+  std::ostringstream oss;
+  buf[0] = '\0';
+
+  // strerror_r returns char * on Linux, and does not always fill buf
+#ifdef STRERROR_R_CHAR_P
+  errmsg = strerror_r(err, buf, sizeof(buf));
+#else
+  strerror_r(err, buf, sizeof(buf));
+  errmsg = buf;
+#endif
+
+  oss << "(" << err << ") " << errmsg;
+
+  return oss.str();
+}
diff --git a/src/common/errno.h b/src/common/errno.h
new file mode 100644
index 00000000..9dbd1438
--- /dev/null
+++ b/src/common/errno.h
@@ -0,0 +1,9 @@
+#ifndef CEPH_ERRNO_H
+#define CEPH_ERRNO_H
+
+#include <string>
+
+/* Return a given error code as a string */
+std::string cpp_strerror(int err);
+
+#endif
diff --git a/src/common/escape.cc b/src/common/escape.cc
new file mode 100644
index 00000000..67d68326
--- /dev/null
+++ b/src/common/escape.cc
@@ -0,0 +1,286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/escape.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <iomanip>
+#include <boost/optional.hpp>
+
+/*
+ * Some functions for escaping RGW responses
+ */
+
+/* Static string length */
+#define SSTRL(x) ((sizeof(x)/sizeof(x[0])) - 1)
+
+#define LESS_THAN_XESCAPE		"&lt;"
+#define AMPERSAND_XESCAPE		"&amp;"
+#define GREATER_THAN_XESCAPE		"&gt;"
+#define SGL_QUOTE_XESCAPE		"&apos;"
+#define DBL_QUOTE_XESCAPE		"&quot;"
+
+/* Size of the buffer escape_xml_attr() needs for the NUL-terminated
+ * string 'buf', including the terminating NUL.
+ */
+size_t escape_xml_attr_len(const char *buf)
+{
+	// start at 1: room for the null terminator
+	size_t len = 1;
+	for (const char *p = buf; *p != '\0'; ++p) {
+		const unsigned char ch = *p;
+		if (ch == '<') {
+			len += SSTRL(LESS_THAN_XESCAPE);
+		} else if (ch == '&') {
+			len += SSTRL(AMPERSAND_XESCAPE);
+		} else if (ch == '>') {
+			len += SSTRL(GREATER_THAN_XESCAPE);
+		} else if (ch == '\'') {
+			len += SSTRL(SGL_QUOTE_XESCAPE);
+		} else if (ch == '"') {
+			len += SSTRL(DBL_QUOTE_XESCAPE);
+		} else if ((ch == 0x7f) ||
+			   ((ch < 0x20) && (ch != 0x09) && (ch != 0x0a))) {
+			// control character, written as "&#xNN;" (6 bytes)
+			len += 6;
+		} else {
+			len += 1;
+		}
+	}
+	return len;
+}
+
+/* Write the XML-escaped form of the NUL-terminated string 'buf' into
+ * 'out' and NUL-terminate it.  The five markup characters become named
+ * entities; control characters other than tab and newline, plus 0x7f,
+ * become "&#xNN;".  'out' is not bounds-checked here.
+ */
+void escape_xml_attr(const char *buf, char *out)
+{
+	char *dst = out;
+	for (const char *p = buf; *p != '\0'; ++p) {
+		const unsigned char ch = *p;
+		if (ch == '<') {
+			memcpy(dst, LESS_THAN_XESCAPE, SSTRL(LESS_THAN_XESCAPE));
+			dst += SSTRL(LESS_THAN_XESCAPE);
+		} else if (ch == '&') {
+			memcpy(dst, AMPERSAND_XESCAPE, SSTRL(AMPERSAND_XESCAPE));
+			dst += SSTRL(AMPERSAND_XESCAPE);
+		} else if (ch == '>') {
+			memcpy(dst, GREATER_THAN_XESCAPE, SSTRL(GREATER_THAN_XESCAPE));
+			dst += SSTRL(GREATER_THAN_XESCAPE);
+		} else if (ch == '\'') {
+			memcpy(dst, SGL_QUOTE_XESCAPE, SSTRL(SGL_QUOTE_XESCAPE));
+			dst += SSTRL(SGL_QUOTE_XESCAPE);
+		} else if (ch == '"') {
+			memcpy(dst, DBL_QUOTE_XESCAPE, SSTRL(DBL_QUOTE_XESCAPE));
+			dst += SSTRL(DBL_QUOTE_XESCAPE);
+		} else if ((ch == 0x7f) ||
+			   ((ch < 0x20) && (ch != 0x09) && (ch != 0x0a))) {
+			// 6 characters plus the NUL that snprintf appends
+			snprintf(dst, 7, "&#x%02x;", ch);
+			dst += 6;
+		} else {
+			*dst++ = ch;
+		}
+	}
+	// null terminator
+	*dst = '\0';
+}
+
+// Scope guard for stream formatting: the constructor switches the stream
+// to hexadecimal with '0' as fill character, the destructor puts back the
+// fill character and format flags that were in effect before.
+struct hex_formatter {
+  std::ostream& out;
+  const char old_fill;
+  const std::ostream::fmtflags old_flags;
+
+  explicit hex_formatter(std::ostream& os)
+    : out(os),
+      old_fill(os.fill('0')),
+      old_flags(os.setf(std::ios_base::hex, std::ios_base::basefield))
+  {}
+  ~hex_formatter() {
+    out.flags(old_flags);
+    out.fill(old_fill);
+  }
+};
+
+// Stream the wrapped text with XML escaping applied, one character at a
+// time.  Uses the same rules as escape_xml_attr().
+std::ostream& operator<<(std::ostream& out, const xml_stream_escaper& e)
+{
+  // hex formatting is switched on lazily, at the first control character,
+  // and undone when `fmt` goes out of scope
+  boost::optional<hex_formatter> fmt;
+
+  for (unsigned char c : e.str) {
+    if (c == '<') {
+      out << LESS_THAN_XESCAPE;
+    } else if (c == '&') {
+      out << AMPERSAND_XESCAPE;
+    } else if (c == '>') {
+      out << GREATER_THAN_XESCAPE;
+    } else if (c == '\'') {
+      out << SGL_QUOTE_XESCAPE;
+    } else if (c == '"') {
+      out << DBL_QUOTE_XESCAPE;
+    } else if (((c < 0x20) && (c != 0x09) && (c != 0x0a)) || (c == 0x7f)) {
+      // Escape control characters.
+      if (!fmt) {
+        fmt.emplace(out); // enable hex formatting
+      }
+      out << "&#x" << std::setw(2) << static_cast<unsigned int>(c) << ';';
+    } else {
+      out << c;
+    }
+  }
+  return out;
+}
+
+#define DBL_QUOTE_JESCAPE "\\\""
+#define BACKSLASH_JESCAPE "\\\\"
+#define TAB_JESCAPE "\\t"
+#define NEWLINE_JESCAPE "\\n"
+
+/* Size of the buffer escape_json_attr() needs for the first 'src_len'
+ * bytes of 'buf', including the terminating NUL.
+ */
+size_t escape_json_attr_len(const char *buf, size_t src_len)
+{
+	// start at 1: room for the null terminator
+	size_t len = 1;
+	for (size_t i = 0; i < src_len; ++i) {
+		const unsigned char ch = buf[i];
+		if (ch == '"') {
+			len += SSTRL(DBL_QUOTE_JESCAPE);
+		} else if (ch == '\\') {
+			len += SSTRL(BACKSLASH_JESCAPE);
+		} else if (ch == '\t') {
+			len += SSTRL(TAB_JESCAPE);
+		} else if (ch == '\n') {
+			len += SSTRL(NEWLINE_JESCAPE);
+		} else if ((ch < 0x20) || (ch == 0x7f)) {
+			// control character, written as "\uNNNN" (6 bytes)
+			len += 6;
+		} else {
+			len += 1;
+		}
+	}
+	return len;
+}
+
+/* Write the JSON-escaped form of the first 'src_len' bytes of 'buf' into
+ * 'out' and NUL-terminate it.  Double quote, backslash, tab and newline
+ * get two-character escapes; other bytes below 0x20, plus 0x7f, become
+ * "\uNNNN".  'out' is not bounds-checked here.
+ */
+void escape_json_attr(const char *buf, size_t src_len, char *out)
+{
+	char *dst = out;
+	for (size_t i = 0; i < src_len; ++i) {
+		const unsigned char ch = buf[i];
+		if (ch == '"') {
+			// cppcheck-suppress invalidFunctionArg
+			memcpy(dst, DBL_QUOTE_JESCAPE, SSTRL(DBL_QUOTE_JESCAPE));
+			dst += SSTRL(DBL_QUOTE_JESCAPE);
+		} else if (ch == '\\') {
+			// cppcheck-suppress invalidFunctionArg
+			memcpy(dst, BACKSLASH_JESCAPE, SSTRL(BACKSLASH_JESCAPE));
+			dst += SSTRL(BACKSLASH_JESCAPE);
+		} else if (ch == '\t') {
+			// cppcheck-suppress invalidFunctionArg
+			memcpy(dst, TAB_JESCAPE, SSTRL(TAB_JESCAPE));
+			dst += SSTRL(TAB_JESCAPE);
+		} else if (ch == '\n') {
+			// cppcheck-suppress invalidFunctionArg
+			memcpy(dst, NEWLINE_JESCAPE, SSTRL(NEWLINE_JESCAPE));
+			dst += SSTRL(NEWLINE_JESCAPE);
+		} else if ((ch < 0x20) || (ch == 0x7f)) {
+			// 6 characters plus the NUL that snprintf appends
+			snprintf(dst, 7, "\\u%04x", ch);
+			dst += 6;
+		} else {
+			*dst++ = ch;
+		}
+	}
+	// null terminator
+	*dst = '\0';
+}
+
+// Stream the wrapped text with JSON escaping applied, one character at a
+// time.  Uses the same rules as escape_json_attr().
+std::ostream& operator<<(std::ostream& out, const json_stream_escaper& e)
+{
+  // hex formatting is switched on lazily, at the first control character,
+  // and undone when `fmt` goes out of scope
+  boost::optional<hex_formatter> fmt;
+
+  for (unsigned char c : e.str) {
+    if (c == '"') {
+      out << DBL_QUOTE_JESCAPE;
+    } else if (c == '\\') {
+      out << BACKSLASH_JESCAPE;
+    } else if (c == '\t') {
+      out << TAB_JESCAPE;
+    } else if (c == '\n') {
+      out << NEWLINE_JESCAPE;
+    } else if ((c < 0x20) || (c == 0x7f)) {
+      // Escape control characters.
+      if (!fmt) {
+        fmt.emplace(out); // enable hex formatting
+      }
+      out << "\\u" << std::setw(4) << static_cast<unsigned int>(c);
+    } else {
+      out << c;
+    }
+  }
+  return out;
+}
diff --git a/src/common/escape.h b/src/common/escape.h
new file mode 100644
index 00000000..2d877015
--- /dev/null
+++ b/src/common/escape.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RGW_ESCAPE_H
+#define CEPH_RGW_ESCAPE_H
+
+#include <ostream>
+#include <boost/utility/string_view.hpp>
+
+/* Returns the length of a buffer that would be needed to escape 'buf'
+ * as an XML attribute
+ */
+size_t escape_xml_attr_len(const char *buf);
+
+/* Escapes 'buf' as an XML attribute. Assumes that 'out' is at least long
+ * enough to fit the output. You can find out the required length by calling
+ * escape_xml_attr_len first.
+ */
+void escape_xml_attr(const char *buf, char *out);
+
+/* Returns the length of a buffer that would be needed to escape 'buf'
+ * as a JSON attribute
+ */
+size_t escape_json_attr_len(const char *buf, size_t src_len);
+
+/* Escapes 'buf' as a JSON attribute. Assumes that 'out' is at least long
+ * enough to fit the output. You can find out the required length by calling
+ * escape_json_attr_len first.
+ */
+void escape_json_attr(const char *buf, size_t src_len, char *out);
+
+/* Note: we escape control characters. Although the XML spec doesn't actually
+ * require this, Amazon does it in their XML responses.
+ */
+
+// stream output operators that write escaped text without making a copy
+// usage:
+//   std::string xml_input = ...;
+//   std::cout << xml_stream_escaper(xml_input) << std::endl;
+
+// Non-owning wrapper around text to be written XML-escaped; the viewed
+// characters have to stay alive until the wrapper has been streamed.
+struct xml_stream_escaper {
+  boost::string_view str;
+  xml_stream_escaper(std::string_view text)
+    : str(text.data(), text.size())
+  {}
+};
+std::ostream& operator<<(std::ostream& out, const xml_stream_escaper& e);
+
+// Non-owning wrapper around text to be written JSON-escaped; the viewed
+// characters have to stay alive until the wrapper has been streamed.
+struct json_stream_escaper {
+  boost::string_view str;
+  json_stream_escaper(std::string_view text)
+    : str(text.data(), text.size())
+  {}
+};
+std::ostream& operator<<(std::ostream& out, const json_stream_escaper& e);
+
+#endif
diff --git a/src/common/event_socket.h b/src/common/event_socket.h
new file mode 100644
index 00000000..9224f768
--- /dev/null
+++ b/src/common/event_socket.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_EVENT_SOCKET_H
+#define CEPH_COMMON_EVENT_SOCKET_H
+
+#include <unistd.h>
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#include <errno.h>
+#endif
+#include "include/event_type.h"
+
+// Small wrapper around a file descriptor used to wake up a waiter.  The
+// descriptor is either the write end of a pipe or, when HAVE_EVENTFD is
+// defined, an eventfd.  The class never closes the descriptor.
+class EventSocket {
+  int socket;
+  int type;
+
+ public:
+  EventSocket(): socket(-1), type(EVENT_SOCKET_TYPE_NONE) {}
+
+  // true once init() has stored a descriptor other than -1
+  bool is_valid() const { return socket != -1; }
+
+  // Remember `fd` and its kind `t`.  Returns 0 on success, or -EINVAL
+  // (leaving the object unchanged) when `t` is not a supported type.
+  int init(int fd, int t) {
+    bool supported = (t == EVENT_SOCKET_TYPE_PIPE);
+#ifdef HAVE_EVENTFD
+    supported = supported || (t == EVENT_SOCKET_TYPE_EVENTFD);
+#endif
+    if (!supported)
+      return -EINVAL;
+
+    socket = fd;
+    type = t;
+    return 0;
+  }
+
+  // Signal the other side: one byte 'i' on a pipe, or the 64-bit value 1
+  // on an eventfd.  Returns 0 on success, -errno when write() fails, and
+  // -1 when the stored type is neither of the two.
+  int notify() {
+    if (type == EVENT_SOCKET_TYPE_PIPE) {
+      const char token = 'i';
+      return (write(socket, &token, 1) < 0) ? -errno : 0;
+    }
+#ifdef HAVE_EVENTFD
+    if (type == EVENT_SOCKET_TYPE_EVENTFD) {
+      const uint64_t value = 1;
+      return (write(socket, &value, sizeof (value)) < 0) ? -errno : 0;
+    }
+#endif
+    return -1;
+  }
+};
+
+#endif
diff --git a/src/common/fd.cc b/src/common/fd.cc
new file mode 100644
index 00000000..931d4353
--- /dev/null
+++ b/src/common/fd.cc
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "debug.h"
+#include "errno.h"
+
+// Log, through lderr, every entry of the process's fd directory
+// ("/dev/fd" on Apple, PROCPREFIX "/proc/self/fd" elsewhere) together with
+// the target its symlink resolves to, followed by a count of the entries
+// that could be resolved.  Entries whose name starts with '.' are skipped.
+void dump_open_fds(CephContext *cct)
+{
+#ifdef __APPLE__
+  const char *fn = "/dev/fd";
+#else
+  const char *fn = PROCPREFIX "/proc/self/fd";
+#endif
+  DIR *d = opendir(fn);
+  if (!d) {
+    lderr(cct) << "dump_open_fds unable to open " << fn << dendl;
+    return;
+  }
+
+  int n = 0;
+  for (struct dirent *de = ::readdir(d); de != nullptr; de = ::readdir(d)) {
+    if (de->d_name[0] == '.')
+      continue;
+
+    char path[PATH_MAX];
+    snprintf(path, sizeof(path), "%s/%s", fn, de->d_name);
+
+    // leave one byte free: readlink() does not NUL-terminate
+    char target[PATH_MAX];
+    ssize_t len = readlink(path, target, sizeof(target) - 1);
+    if (len < 0) {
+      // capture errno before any further call can overwrite it
+      len = -errno;
+      lderr(cct) << "dump_open_fds unable to readlink " << path << ": " << cpp_strerror(len) << dendl;
+      continue;
+    }
+    target[len] = 0;
+    lderr(cct) << "dump_open_fds " << de->d_name << " -> " << target << dendl;
+    n++;
+  }
+  lderr(cct) << "dump_open_fds dumped " << n << " open files" << dendl;
+
+  closedir(d);
+}
diff --git a/src/common/fd.h b/src/common/fd.h
new file mode 100644
index 00000000..581e1b0a
--- /dev/null
+++ b/src/common/fd.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_FD_H
+#define CEPH_COMMON_FD_H
+
+class CephContext;
+
+void dump_open_fds(CephContext *cct);
+
+#endif
diff --git a/src/common/fork_function.h b/src/common/fork_function.h
new file mode 100644
index 00000000..6f533c87
--- /dev/null
+++ b/src/common/fork_function.h
@@ -0,0 +1,162 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+// Run a function in a forked child, with a timeout.
+
+#pragma once
+
+#include <functional>
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <ostream>
+
+#include "include/ceph_assert.h"
+#include "common/errno.h"
+
+// Signal handler that does nothing.  fork_function() installs it so that a
+// signal which would otherwise be ignored (SIGCHLD) can be masked and waited for.
+static void _fork_function_dummy_sighandler(int sig) {}
+
+// Run a function post-fork, with a timeout.  Function can return
+// int8_t only due to unix exit code limitations.  Returns -ETIMEDOUT
+// if timeout is reached.
+//
+// The caller forks a "forker" process and waits for it; the forker closes
+// every descriptor except stdin/stdout/stderr, forks again to run f(), and
+// supervises that second child with sigwait().
+//
+// @param timeout  seconds passed to alarm() in the forker
+// @param errstr   receives a one-line description of how the forker ended
+// @param f        function to run in the second child
+// @return the forker's exit status interpreted as int8_t, 128 + signal
+//         number if the forker was killed by a signal, -1 if waitpid()
+//         reported an unknown status, or a negative errno if the initial
+//         fork() itself failed.
+
+static inline int fork_function(
+  int timeout,
+  std::ostream& errstr,
+  std::function<int8_t(void)> f)
+{
+  // first fork the forker.
+  pid_t forker_pid = fork();
+  if (forker_pid == -1) {
+    // Without this check -1 would be treated as "we are the parent" and
+    // waitpid(-1, ...) would wait for an arbitrary child (or assert).
+    int err = errno;
+    errstr << ": fork failed: " << cpp_strerror(err) << "\n";
+    return -err;
+  }
+  if (forker_pid) {
+    // just wait
+    int status;
+    while (waitpid(forker_pid, &status, 0) == -1) {
+      ceph_assert(errno == EINTR);
+    }
+    if (WIFSIGNALED(status)) {
+      errstr << ": got signal: " << WTERMSIG(status) << "\n";
+      return 128 + WTERMSIG(status);
+    }
+    if (WIFEXITED(status)) {
+      int8_t r = WEXITSTATUS(status);
+      errstr << ": exit status: " << (int)r << "\n";
+      return r;
+    }
+    errstr << ": waitpid: unknown status returned\n";
+    return -1;
+  }
+
+  // we are forker (first child)
+
+  // close all fds
+  int maxfd = sysconf(_SC_OPEN_MAX);
+  if (maxfd == -1)
+    maxfd = 16384;
+  for (int fd = 0; fd <= maxfd; fd++) {
+    if (fd == STDIN_FILENO)
+      continue;
+    if (fd == STDOUT_FILENO)
+      continue;
+    if (fd == STDERR_FILENO)
+      continue;
+    ::close(fd);
+  }
+
+  // Declared before the first goto so that no jump to fail_exit crosses an
+  // initialization.
+  sigset_t mask, oldmask;
+  int pid;
+
+  // Restore default action for SIGTERM in case the parent process decided
+  // to ignore it.
+  if (signal(SIGTERM, SIG_DFL) == SIG_ERR) {
+    std::cerr << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Because SIGCHLD is ignored by default, setup dummy handler for it,
+  // so we can mask it.
+  if (signal(SIGCHLD, _fork_function_dummy_sighandler) == SIG_ERR) {
+    std::cerr << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Setup timeout handler.  SIGALRM is blocked below and consumed through
+  // sigwait(), so the handler body never runs here; the local no-op handler
+  // keeps this header self-contained.
+  if (signal(SIGALRM, _fork_function_dummy_sighandler) == SIG_ERR) {
+    std::cerr << ": signal failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+  // Block interesting signals.
+  sigemptyset(&mask);
+  sigaddset(&mask, SIGINT);
+  sigaddset(&mask, SIGTERM);
+  sigaddset(&mask, SIGCHLD);
+  sigaddset(&mask, SIGALRM);
+  if (sigprocmask(SIG_SETMASK, &mask, &oldmask) == -1) {
+    std::cerr << ": sigprocmask failed: "
+              << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  pid = fork();
+
+  if (pid == -1) {
+    std::cerr << ": fork failed: " << cpp_strerror(errno) << "\n";
+    goto fail_exit;
+  }
+
+  if (pid == 0) { // we are second child
+    // Restore old sigmask.
+    if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) {
+      std::cerr << ": sigprocmask failed: "
+                << cpp_strerror(errno) << "\n";
+      goto fail_exit;
+    }
+    (void)setpgid(0, 0); // Become process group leader.
+    int8_t r = f();
+    _exit((uint8_t)r);
+  }
+
+  // Parent
+  (void)alarm(timeout);
+
+  for (;;) {
+    int signo;
+    if (sigwait(&mask, &signo) == -1) {
+      std::cerr << ": sigwait failed: " << cpp_strerror(errno) << "\n";
+      goto fail_exit;
+    }
+    switch (signo) {
+    case SIGCHLD: {
+      int status;
+      pid_t reaped = waitpid(pid, &status, WNOHANG);
+      if (reaped == -1) {
+        std::cerr << ": waitpid failed: " << cpp_strerror(errno) << "\n";
+        goto fail_exit;
+      }
+      if (reaped == 0) {
+        // SIGCHLD without a terminated child (e.g. it was stopped or
+        // continued): status was not written, so keep waiting.
+        continue;
+      }
+      if (WIFEXITED(status))
+        _exit(WEXITSTATUS(status));
+      if (WIFSIGNALED(status))
+        _exit(128 + WTERMSIG(status));
+      std::cerr << ": unknown status returned\n";
+      goto fail_exit;
+    }
+    case SIGINT:
+    case SIGTERM:
+      // Pass SIGINT and SIGTERM, which are usually used to terminate
+      // a process, to the child.
+      if (::kill(pid, signo) == -1) {
+        std::cerr << ": kill failed: " << cpp_strerror(errno) << "\n";
+        goto fail_exit;
+      }
+      continue;
+    case SIGALRM:
+      std::cerr << ": timed out (" << timeout << " sec)\n";
+      if (::killpg(pid, SIGKILL) == -1) {
+        std::cerr << ": kill failed: " << cpp_strerror(errno) << "\n";
+        goto fail_exit;
+      }
+      // The exit status is truncated to 8 bits; the caller reads it back
+      // as int8_t, which restores -ETIMEDOUT.
+      _exit(-ETIMEDOUT);
+    default:
+      std::cerr << ": sigwait: invalid signal: " << signo << "\n";
+      goto fail_exit;
+    }
+  }
+  return 0;
+fail_exit:
+  _exit(EXIT_FAILURE);
+}
diff --git a/src/common/freebsd_errno.cc b/src/common/freebsd_errno.cc
new file mode 100644
index 00000000..259ce7be
--- /dev/null
+++ b/src/common/freebsd_errno.cc
@@ -0,0 +1,219 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "include/types.h"
+#include "include/compat.h"
+
+// Both macros expand to an array designator "[a] = b": a is the index
+// (the errno being translated from) and b the stored value (the errno
+// being translated to).  H2C = host to Ceph, C2H = Ceph to host.
+#define H2C_ERRNO(a,b) [a] = b
+#define C2H_ERRNO(a,b) [a] = b
+
+// Build a table with the Linux (Ceph) error as index
+// and the FreeBSD (host) error as value.
+// Use the fact that the array is initialised per default on all 0's
+// And we do not translate for 0's, but return the original value.
+// Linux errors with no counterpart listed here are mapped to EPERM (see TODOs).
+static const __s32 ceph_to_hostos_conv[256] = {
+//       Linux errno  FreeBSD errno
+       C2H_ERRNO(11,  EAGAIN),	
+       C2H_ERRNO(35,  EDEADLK),	
+       C2H_ERRNO(36,  ENAMETOOLONG),	
+       C2H_ERRNO(37,  ENOLCK),	
+       C2H_ERRNO(38,  ENOSYS),	
+       C2H_ERRNO(39,  ENOTEMPTY),	
+       C2H_ERRNO(40,  ELOOP),	
+       C2H_ERRNO(42,  ENOMSG),	
+       C2H_ERRNO(43,  EIDRM),	
+       C2H_ERRNO(44,  EPERM),	 //TODO ECHRNG   /* Channel number out of range */
+       C2H_ERRNO(45,  EPERM),	 //TODO EL2NSYNC /* Level 2 not synchronized */
+       C2H_ERRNO(46,  EPERM),	 //TODO EL3HLT   /* Level 3 halted */
+       C2H_ERRNO(47,  EPERM),	 //TODO EL3RST   /* Level 3 reset */
+       C2H_ERRNO(48,  EPERM),	 //TODO ELNRNG   /* Link number out of range */
+       C2H_ERRNO(49,  EPERM),	 //TODO EUNATCH  /* Protocol driver not attached */
+       C2H_ERRNO(50,  EPERM),	 //TODO ENOCSI   /* No CSI structure available */
+       C2H_ERRNO(51,  EPERM),	 //TODO EL2HLT   /* Level 2 halted */
+       C2H_ERRNO(52,  EPERM),	 //TODO EBADE    /* Invalid exchange */
+       C2H_ERRNO(53,  EPERM),	 //TODO EBADR    /* Invalid request descriptor */
+       C2H_ERRNO(54,  EPERM),	 //TODO EXFULL   /* Exchange full */
+       C2H_ERRNO(55,  EPERM),	 //TODO ENOANO   /* No anode */
+       C2H_ERRNO(56,  EPERM),	 //TODO EBADRQC  /* Invalid request code */
+       C2H_ERRNO(57,  EPERM),	 //TODO EBADSLT  /* Invalid slot */
+       C2H_ERRNO(59,  EPERM),	 //TODO EBFONT   /* Bad font file format */
+       C2H_ERRNO(60,  ENOSTR),	
+       C2H_ERRNO(61,  ENODATA),	
+       C2H_ERRNO(62,  ETIME),	
+       C2H_ERRNO(63,  ENOSR),	
+       C2H_ERRNO(64,  EPERM),	 //TODO ENONET
+       C2H_ERRNO(65,  EPERM),	 //TODO ENOPKG
+       C2H_ERRNO(66,  EREMOTE),	
+       C2H_ERRNO(67,  ENOLINK),	
+       C2H_ERRNO(68,  EPERM),	 //TODO EADV
+       C2H_ERRNO(69,  EPERM),	 //TODO ESRMNT
+       C2H_ERRNO(70,  EPERM),	 //TODO ECOMM
+       C2H_ERRNO(71,  EPROTO),	
+       C2H_ERRNO(72,  EMULTIHOP),	
+       C2H_ERRNO(73,  EPERM),	 //TODO EDOTDOT
+       C2H_ERRNO(74,  EBADMSG),	
+       C2H_ERRNO(75,  EOVERFLOW),	
+       C2H_ERRNO(76,  EPERM),	 //TODO ENOTUNIQ
+       C2H_ERRNO(77,  EPERM),	 //TODO EBADFD
+       C2H_ERRNO(78,  EPERM),	 //TODO EREMCHG
+       C2H_ERRNO(79,  EPERM),	 //TODO ELIBACC
+       C2H_ERRNO(80,  EPERM),	 //TODO ELIBBAD
+       C2H_ERRNO(81,  EPERM),	 //TODO ELIBSCN
+       C2H_ERRNO(82,  EPERM),	 //TODO ELIBMAX
+       C2H_ERRNO(83,  EPERM),	 //TODO ELIBEXEC
+       C2H_ERRNO(84,  EILSEQ),	
+       C2H_ERRNO(85,  EINTR),	 /* not quite, since this is a syscall restart */
+       C2H_ERRNO(86,  EPERM),	 //ESTRPIPE;
+       C2H_ERRNO(87,  EUSERS),	
+       C2H_ERRNO(88,  ENOTSOCK),	
+       C2H_ERRNO(89,  EDESTADDRREQ),	
+       C2H_ERRNO(90,  EMSGSIZE),	
+       C2H_ERRNO(91,  EPROTOTYPE),	
+       C2H_ERRNO(92,  ENOPROTOOPT),	
+       C2H_ERRNO(93,  EPROTONOSUPPORT),	
+       C2H_ERRNO(94,  ESOCKTNOSUPPORT),	
+       C2H_ERRNO(95,  EOPNOTSUPP),	
+       C2H_ERRNO(96,  EPFNOSUPPORT),	
+       C2H_ERRNO(97,  EAFNOSUPPORT),	
+       C2H_ERRNO(98,  EADDRINUSE),	
+       C2H_ERRNO(99,  EADDRNOTAVAIL),	
+       C2H_ERRNO(100, ENETDOWN),	
+       C2H_ERRNO(101, ENETUNREACH),	
+       C2H_ERRNO(102, ENETRESET),	
+       C2H_ERRNO(103, ECONNABORTED),	
+       C2H_ERRNO(104, ECONNRESET),	
+       C2H_ERRNO(105, ENOBUFS),	
+       C2H_ERRNO(106, EISCONN),	
+       C2H_ERRNO(107, ENOTCONN),	
+       C2H_ERRNO(108, ESHUTDOWN),	
+       C2H_ERRNO(109, ETOOMANYREFS),	
+       C2H_ERRNO(110, ETIMEDOUT),	
+       C2H_ERRNO(111, ECONNREFUSED),	
+       C2H_ERRNO(112, EHOSTDOWN),	
+       C2H_ERRNO(113, EHOSTUNREACH),	
+       C2H_ERRNO(114, EALREADY),	
+       C2H_ERRNO(115, EINPROGRESS),	
+       C2H_ERRNO(116, ESTALE),	
+       C2H_ERRNO(117, EPERM),	 //TODO EUCLEAN
+       C2H_ERRNO(118, EPERM),	 //TODO ENOTNAM
+       C2H_ERRNO(119, EPERM),	 //TODO ENAVAIL
+       C2H_ERRNO(120, EPERM),	 //TODO EISNAM
+       C2H_ERRNO(121, EREMOTEIO),	
+       C2H_ERRNO(122, EDQUOT),	
+       C2H_ERRNO(123, EPERM),	 //TODO ENOMEDIUM
+       C2H_ERRNO(124, EPERM),	 //TODO EMEDIUMTYPE - not used
+       C2H_ERRNO(125, ECANCELED),	
+       C2H_ERRNO(126, EPERM),	 //TODO ENOKEY
+       C2H_ERRNO(127, EPERM),	 //TODO EKEYEXPIRED
+       C2H_ERRNO(128, EPERM),	 //TODO EKEYREVOKED
+       C2H_ERRNO(129, EPERM),	 //TODO EKEYREJECTED
+       C2H_ERRNO(130, EOWNERDEAD),	
+       C2H_ERRNO(131, ENOTRECOVERABLE),	
+       C2H_ERRNO(132, EPERM),	 //TODO ERFKILL
+       C2H_ERRNO(133, EPERM),	 //TODO EHWPOISON
+    };
+
+// Build a table with the FreeBSD error as index
+// and the Linux error as value
+// Use the fact that the array is initialised per default on all 0's
+// And we do not translate for 0's, but return the original value.
+// Host errors without a Linux counterpart are mapped to EPERM.
+// ENOLCK and ENOSYS map to Linux 37 and 38 so that a value survives a
+// round trip through ceph_to_hostos_conv, which maps 37/38 back to them.
+static const __s32 hostos_to_ceph_conv[256] = {
+        //        FreeBSD errno Linux errno
+        H2C_ERRNO(EDEADLK,	35),   	/* Resource deadlock avoided */
+        H2C_ERRNO(EAGAIN,	11),   	/* Resource temporarily unavailable */
+        H2C_ERRNO(EINPROGRESS,	115),	/* Operation now in progress */
+        H2C_ERRNO(EALREADY,	114),	/* Operation already in progress */
+        H2C_ERRNO(ENOTSOCK,	88),	/* Socket operation on non-socket */
+        H2C_ERRNO(EDESTADDRREQ,	89),	/* Destination address required */
+        H2C_ERRNO(EMSGSIZE,	90),	/* Message too long */
+        H2C_ERRNO(EPROTOTYPE,	91),	/* Protocol wrong type for socket */
+        H2C_ERRNO(ENOPROTOOPT,	92),	/* Protocol not available */
+        H2C_ERRNO(EPROTONOSUPPORT, 93),	/* Protocol not supported */
+        H2C_ERRNO(ESOCKTNOSUPPORT, 94),	/* Socket type not supported */
+        H2C_ERRNO(EOPNOTSUPP,	95),	/* Operation not supported */
+        H2C_ERRNO(EPFNOSUPPORT,	96),	/* Protocol family not supported */
+        H2C_ERRNO(EAFNOSUPPORT,	97),	/* Address family not supported by protocol family */
+        H2C_ERRNO(EADDRINUSE,	98),	/* Address already in use */
+        H2C_ERRNO(EADDRNOTAVAIL, 99),	/* Can't assign requested address */
+        H2C_ERRNO(ENETDOWN,	100),	/* Network is down */
+        H2C_ERRNO(ENETUNREACH,	101),	/* Network is unreachable */
+        H2C_ERRNO(ENETRESET,	102),	/* Network dropped connection on reset */
+        H2C_ERRNO(ECONNABORTED,	103),	/* Software caused connection abort */
+        H2C_ERRNO(ECONNRESET,	104),	/* Connection reset by peer */
+        H2C_ERRNO(ENOBUFS,	105),	/* No buffer space available */
+        H2C_ERRNO(EISCONN,	106),	/* Socket is already connected */
+        H2C_ERRNO(ENOTCONN,	107),	/* Socket is not connected */
+        H2C_ERRNO(ESHUTDOWN,	108),	/* Can't send after socket shutdown */
+        H2C_ERRNO(ETOOMANYREFS,	109),	/* Too many references: can't splice */
+        H2C_ERRNO(ETIMEDOUT,	110),	/* Operation timed out */
+        H2C_ERRNO(ECONNREFUSED,	111),	/* Connection refused */
+        H2C_ERRNO(ELOOP,	40),	/* Too many levels of symbolic links */
+        H2C_ERRNO(ENAMETOOLONG,	36),	/* File name too long */
+        H2C_ERRNO(EHOSTDOWN,	112),	/* Host is down */
+        H2C_ERRNO(EHOSTUNREACH,	113),	/* No route to host */
+        H2C_ERRNO(ENOTEMPTY,	39),	/* Directory not empty */
+        H2C_ERRNO(EPROCLIM,	EPERM),	/* Too many processes */
+        H2C_ERRNO(EUSERS,	87),	/* Too many users */
+        H2C_ERRNO(EDQUOT,	122),	/* Disc quota exceeded */
+        H2C_ERRNO(ESTALE,	116),	/* Stale NFS file handle */
+        H2C_ERRNO(EREMOTE,	66),	/* Too many levels of remote in path */
+        H2C_ERRNO(EBADRPC,	EPERM),	/* RPC struct is bad */
+        H2C_ERRNO(ERPCMISMATCH,	EPERM),	/* RPC version wrong */
+        H2C_ERRNO(EPROGUNAVAIL,	EPERM),	/* RPC prog. not avail */
+        H2C_ERRNO(EPROGMISMATCH, EPERM),/* Program version wrong */
+        H2C_ERRNO(EPROCUNAVAIL,	EPERM),	/* Bad procedure for program */
+        H2C_ERRNO(ENOLCK,	37),	/* No locks available */
+        H2C_ERRNO(ENOSYS,	38),	/* Function not implemented */
+        H2C_ERRNO(EFTYPE,	EPERM),	/* Inappropriate file type or format */
+        H2C_ERRNO(EAUTH,	EPERM),	/* Authentication error */
+        H2C_ERRNO(ENEEDAUTH,	EPERM),	/* Need authenticator */
+        H2C_ERRNO(EIDRM,	43),	/* Identifier removed */
+        H2C_ERRNO(ENOMSG,	42),	/* No message of desired type */
+        H2C_ERRNO(EOVERFLOW,	75),	/* Value too large to be stored in data type */
+        H2C_ERRNO(ECANCELED,	125),	/* Operation canceled */
+        H2C_ERRNO(EILSEQ,	84),	/* Illegal byte sequence */
+        H2C_ERRNO(ENOATTR,	61),	/* Attribute not found */
+        H2C_ERRNO(EDOOFUS,	EPERM),	/* Programming error */
+        H2C_ERRNO(EBADMSG,	74),	/* Bad message */
+        H2C_ERRNO(EMULTIHOP,	72),	/* Multihop attempted */
+        H2C_ERRNO(ENOLINK,	67),	/* Link has been severed */
+        H2C_ERRNO(EPROTO,	71),	/* Protocol error */
+        H2C_ERRNO(ENOTCAPABLE,	EPERM),	/* Capabilities insufficient */
+        H2C_ERRNO(ECAPMODE,	EPERM),	/* Not permitted in capability mode */
+        H2C_ERRNO(ENOTRECOVERABLE, 131),/* State not recoverable */
+        H2C_ERRNO(EOWNERDEAD,	130),	/* Previous owner died */
+        };
+
+// converts from linux errno values to host values
+//
+// The sign of r is preserved.  Values with no entry in
+// ceph_to_hostos_conv (including 0 and any magnitude >= 256) are
+// returned unchanged.
+__s32 ceph_to_hostos_errno(__s32 r)
+{
+  const bool negative = (r < 0);
+  // Take the magnitude in unsigned arithmetic: std::abs() of the most
+  // negative value overflows and would produce a negative table index.
+  const unsigned err = negative ? 0u - static_cast<unsigned>(r)
+                                : static_cast<unsigned>(r);
+  if (err >= 256 || ceph_to_hostos_conv[err] == 0) {
+    return r;
+  }
+  const __s32 host = ceph_to_hostos_conv[err];
+  return negative ? -host : host;
+}
+
+// converts Host OS errno values to linux/Ceph values
+//
+// The sign of r is preserved.  Values with no entry in
+// hostos_to_ceph_conv (including 0 and any magnitude >= 256) are
+// returned unchanged.
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  const bool negative = (r < 0);
+  // Take the magnitude in unsigned arithmetic: std::abs() of the most
+  // negative value overflows and would produce a negative table index.
+  const unsigned err = negative ? 0u - static_cast<unsigned>(r)
+                                : static_cast<unsigned>(r);
+  if (err >= 256 || hostos_to_ceph_conv[err] == 0) {
+    return r;
+  }
+  const __s32 ceph = hostos_to_ceph_conv[err];
+  return negative ? -ceph : ceph;
+}
diff --git a/src/common/fs_types.cc b/src/common/fs_types.cc
new file mode 100644
index 00000000..6ad4b24c
--- /dev/null
+++ b/src/common/fs_types.cc
@@ -0,0 +1,136 @@
+
+#include "include/fs_types.h"
+#include "common/Formatter.h"
+#include "include/ceph_features.h"
+
+// Dump a legacy ceph_file_layout.  The three striping fields are always
+// written; cas_hash, object_stripe_unit and pg_pool only when non-zero.
+void dump(const ceph_file_layout& l, Formatter *f)
+{
+  // always present
+  f->dump_unsigned("stripe_unit", l.fl_stripe_unit);
+  f->dump_unsigned("stripe_count", l.fl_stripe_count);
+  f->dump_unsigned("object_size", l.fl_object_size);
+
+  // optional: omitted while still zero
+  if (l.fl_cas_hash) {
+    f->dump_unsigned("cas_hash", l.fl_cas_hash);
+  }
+  if (l.fl_object_stripe_unit) {
+    f->dump_unsigned("object_stripe_unit", l.fl_object_stripe_unit);
+  }
+  if (l.fl_pg_pool) {
+    f->dump_unsigned("pg_pool", l.fl_pg_pool);
+  }
+}
+
+// Dump a ceph_dir_layout: writes its dl_dir_hash field as "dir_hash".
+void dump(const ceph_dir_layout& l, Formatter *f)
+{
+  f->dump_unsigned("dir_hash", l.dl_dir_hash);
+}
+
+
+// file_layout_t
+
+// Check the striping parameters for internal consistency.
+bool file_layout_t::is_valid() const
+{
+  // stripe unit and object size must both be non-zero and have no bits
+  // set below CEPH_MIN_STRIPE_UNIT
+  const auto granule_mask = CEPH_MIN_STRIPE_UNIT - 1;
+  const bool unit_ok = stripe_unit != 0 && (stripe_unit & granule_mask) == 0;
+  if (!unit_ok)
+    return false;
+  const bool size_ok = object_size != 0 && (object_size & granule_mask) == 0;
+  if (!size_ok)
+    return false;
+
+  // an object must hold a whole number of stripe units (at least one)
+  if (object_size < stripe_unit || object_size % stripe_unit != 0)
+    return false;
+
+  // and there has to be at least one stripe
+  return stripe_count != 0;
+}
+
+// Populate this layout from the legacy wire structure.  The legacy form
+// carries no namespace, so pool_ns is always reset.
+void file_layout_t::from_legacy(const ceph_file_layout& fl)
+{
+  pool_ns.clear();
+  stripe_unit = fl.fl_stripe_unit;
+  stripe_count = fl.fl_stripe_count;
+  object_size = fl.fl_object_size;
+  pool_id = (int32_t)fl.fl_pg_pool;
+
+  // in the legacy encoding, a zeroed structure was the default and
+  // would have pool 0 instead of -1.
+  const bool zeroed =
+    pool_id == 0 && stripe_unit == 0 && stripe_count == 0 && object_size == 0;
+  if (zeroed) {
+    pool_id = -1;
+  }
+}
+
+// Fill the legacy wire structure from this layout.  Fields with no
+// counterpart here are zeroed; pool_ns is not representable and is dropped.
+void file_layout_t::to_legacy(ceph_file_layout *fl) const
+{
+  fl->fl_stripe_unit = stripe_unit;
+  fl->fl_stripe_count = stripe_count;
+  fl->fl_object_size = object_size;
+
+  fl->fl_cas_hash = 0;
+  fl->fl_object_stripe_unit = 0;
+  fl->fl_unused = 0;
+
+  // in the legacy encoding, pool 0 was undefined, so a negative
+  // (unset) pool id is written as 0.
+  fl->fl_pg_pool = (pool_id >= 0) ? pool_id : 0;
+}
+
+// Encode the layout.  Peers advertising CEPH_FEATURE_FS_FILE_LAYOUT_V2 get
+// the versioned encoding; everyone else gets the legacy structure.
+void file_layout_t::encode(bufferlist& bl, uint64_t features) const
+{
+  using ceph::encode;
+  const bool peer_has_v2 = (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) != 0;
+  if (peer_has_v2) {
+    ENCODE_START(2, 2, bl);
+    encode(stripe_unit, bl);
+    encode(stripe_count, bl);
+    encode(object_size, bl);
+    encode(pool_id, bl);
+    encode(pool_ns, bl);
+    ENCODE_FINISH(bl);
+    return;
+  }
+
+  // legacy peer: fall back to the old fixed structure
+  ceph_file_layout fl;
+  ceph_assert((stripe_unit & 0xff) == 0);  // first byte must be 0
+  to_legacy(&fl);
+  encode(fl, bl);
+}
+
+// Decode either encoding.  The first byte selects the format: zero means
+// the legacy structure, anything else the versioned encoding.
+void file_layout_t::decode(bufferlist::const_iterator& p)
+{
+  using ceph::decode;
+  if (*p != 0) {
+    DECODE_START(2, p);
+    decode(stripe_unit, p);
+    decode(stripe_count, p);
+    decode(object_size, p);
+    decode(pool_id, p);
+    decode(pool_ns, p);
+    DECODE_FINISH(p);
+    return;
+  }
+
+  // leading zero byte: legacy structure
+  ceph_file_layout fl;
+  decode(fl, p);
+  from_legacy(fl);
+}
+
+// Dump all five layout fields.  pool_id goes through dump_int (it is
+// signed; from_legacy() sets it to -1 for a zeroed legacy layout), the
+// striping fields through dump_unsigned.
+void file_layout_t::dump(Formatter *f) const
+{
+  f->dump_unsigned("stripe_unit", stripe_unit);
+  f->dump_unsigned("stripe_count", stripe_count);
+  f->dump_unsigned("object_size", object_size);
+  f->dump_int("pool_id", pool_id);
+  f->dump_string("pool_ns", pool_ns);
+}
+
+// Append two heap-allocated sample layouts to o: a default-constructed
+// one and one with every field set.
+void file_layout_t::generate_test_instances(list<file_layout_t*>& o)
+{
+  // default-constructed instance
+  o.push_back(new file_layout_t);
+
+  // fully populated instance
+  o.push_back(new file_layout_t);
+  file_layout_t *custom = o.back();
+  custom->stripe_unit = 4096;
+  custom->stripe_count = 16;
+  custom->object_size = 1048576;
+  custom->pool_id = 3;
+  custom->pool_ns = "myns";
+}
+
+// Stream a layout: dump it into a JSONFormatter and flush that formatter
+// to out.  Returns out to allow chaining.
+ostream& operator<<(ostream& out, const file_layout_t &layout)
+{
+  JSONFormatter f;
+  layout.dump(&f);
+  f.flush(out);
+  return out;
+}
+
diff --git a/src/common/function_signature.h b/src/common/function_signature.h
new file mode 100644
index 00000000..6d2a34ee
--- /dev/null
+++ b/src/common/function_signature.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Copied from:
+ * https://github.com/exclipy/inline_variant_visitor/blob/master/function_signature.hpp
+ * which apparently copied it from
+ * http://stackoverflow.com/questions/4771417/how-to-get-the-signature-of-a-c-bind-expression
+ */
+
+#ifndef FUNCTION_SIGNATURE_H
+#define FUNCTION_SIGNATURE_H
+
+#include <boost/mpl/pop_front.hpp>
+#include <boost/mpl/push_front.hpp>
+#include <boost/function_types/function_type.hpp>
+#include <boost/function_types/result_type.hpp>
+#include <boost/function_types/parameter_types.hpp>
+
+// Build a plain function type from F by taking F's result type and
+// parameter types and dropping the first parameter-type entry (for a
+// pointer to member function that entry is presumably the class type).
+// The result is exposed as ::type.
+template <typename F>
+struct signature_of_member
+{
+    using result_type = typename boost::function_types::result_type<F>::type;
+    using parameter_types = typename boost::function_types::parameter_types<F>::type;
+    using base = typename boost::mpl::pop_front<parameter_types>::type;
+    using L = typename boost::mpl::push_front<base, result_type>::type;
+    using type = typename boost::function_types::function_type<L>::type;
+};
+
+// Dispatch helper for signature_of.  Primary template: F is not a class
+// type and is handed directly to boost::function_types::function_type.
+template <typename F, bool is_class>
+struct signature_of_impl
+{
+    using type = typename boost::function_types::function_type<F>::type;
+};
+
+// Class types (function objects): derive the signature from the type of
+// F::operator().
+template <typename F>
+struct signature_of_impl<F, true>
+{
+    using type = typename signature_of_member<decltype(&F::operator())>::type;
+};
+
+// Entry point: signature_of<F>::type is the function type of F, selecting
+// the class or non-class implementation via boost::is_class.
+template <typename F>
+struct signature_of
+{
+    using type = typename signature_of_impl<F, boost::is_class<F>::value>::type;
+};
+
+#endif
diff --git a/src/common/hex.cc b/src/common/hex.cc
new file mode 100644
index 00000000..a02e0fd6
--- /dev/null
+++ b/src/common/hex.cc
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/hex.h"
+
+// Format the first `len` bytes at `s` as two-digit lowercase hex values,
+// each followed by a space, into `buf`.  An extra space is inserted before
+// every 8th byte and a space plus newline before every 16th.
+//
+// `buf` has room for `dest_len` bytes including the terminator.  Output
+// that does not fit is truncated; nothing is ever written past
+// buf[dest_len-1], and whenever dest_len > 0 the result is NUL-terminated
+// (an empty string if len <= 0).  Nothing is written if buf is null or
+// dest_len <= 0.
+void hex2str(const char *s, int len, char *buf, int dest_len)
+{
+  if (buf == nullptr || dest_len <= 0)
+    return;
+  buf[0] = '\0';
+
+  int pos = 0;
+  for (int i = 0; i < len && pos < dest_len - 1; i++) {
+    const char *sep = "";
+    if (i && !(i%16))
+      sep = " \n";
+    else if (i && !(i%8))
+      sep = " ";
+
+    const int room = dest_len - pos;
+    const int r = snprintf(&buf[pos], room, "%s%.2x ", sep, (int)(unsigned char)s[i]);
+    // snprintf() reports the length it wanted to write; on truncation (or
+    // error) stop instead of advancing pos beyond the buffer.
+    if (r < 0 || r >= room)
+      break;
+    pos += r;
+  }
+}
+
+// Return the hex2str() rendering of the `len` bytes at `s` as a string.
+// `msg` is currently unused.  An empty string is returned when len <= 0.
+std::string hexdump(const std::string &msg, const char *s, int len)
+{
+  if (len <= 0)
+    return std::string();
+
+  // Four characters per input byte cover "xx " plus the group and line
+  // separators hex2str() inserts.  The extra zero-filled byte beyond
+  // buf_len guarantees the buffer is terminated whatever hex2str() wrote.
+  const int buf_len = len*4;
+  std::string buf(static_cast<size_t>(buf_len) + 1, '\0');
+  hex2str(s, len, &buf[0], buf_len);
+  return std::string(buf.c_str());
+}
diff --git a/src/common/hex.h b/src/common/hex.h
new file mode 100644
index 00000000..f3c15097
--- /dev/null
+++ b/src/common/hex.h
@@ -0,0 +1,25 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_HEX_H
+#define CEPH_COMMON_HEX_H
+
+#include <string>
+
+// Format `len` bytes at `s` as space-separated two-digit hex values into
+// `buf`, which has room for `dest_len` bytes.
+extern void hex2str(const char *s, int len, char *buf, int dest_len);
+
+// Return the hex rendering of `len` bytes at `s` as a string.  The first
+// parameter is a const reference to match the definition in hex.cc; a
+// by-value std::string here would declare a different overload that is
+// never defined.
+extern std::string hexdump(const std::string &msg, const char *s, int len);
+
+#endif
diff --git a/src/common/histogram.cc b/src/common/histogram.cc
new file mode 100644
index 00000000..b8a71b67
--- /dev/null
+++ b/src/common/histogram.cc
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/histogram.h"
+#include "common/Formatter.h"
+
+// -- pow2_hist_t --
+// Dump every bin as a "count" entry inside a "histogram" array section,
+// followed by the histogram's upper bound.
+void pow2_hist_t::dump(Formatter *f) const
+{
+  f->open_array_section("histogram");
+  for (const int32_t count : h) {
+    f->dump_int("count", count);
+  }
+  f->close_section();
+  f->dump_int("upper_bound", upper_bound());
+}
+
+// Encode the histogram: a versioned envelope (ENCODE_START(1, 1, ...))
+// whose only payload is the bin vector h.
+void pow2_hist_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(h, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Decode the counterpart of encode(): reads the bin vector h from inside
+// the versioned envelope.
+void pow2_hist_t::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(h, p);
+  DECODE_FINISH(p);
+}
+
+// Append two heap-allocated sample histograms to ls: an empty one and one
+// whose bins are 1, 3, 0, 2.
+void pow2_hist_t::generate_test_instances(std::list<pow2_hist_t*>& ls)
+{
+  // empty histogram
+  ls.push_back(new pow2_hist_t);
+
+  // histogram with four bins
+  ls.push_back(new pow2_hist_t);
+  pow2_hist_t *filled = ls.back();
+  for (const int32_t count : {1, 3, 0, 2}) {
+    filled->h.push_back(count);
+  }
+}
+
+// Shift every bin count right by `bits`, then drop trailing bins that
+// have become zero.
+void pow2_hist_t::decay(int bits)
+{
+  for (int32_t& count : h) {
+    count >>= bits;
+  }
+  _contract();
+}
diff --git a/src/common/histogram.h b/src/common/histogram.h
new file mode 100644
index 00000000..139450b7
--- /dev/null
+++ b/src/common/histogram.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ * Copyright 2013 Inktank
+ */
+
+#ifndef CEPH_HISTOGRAM_H
+#define CEPH_HISTOGRAM_H
+
+#include <list>
+
+#include "include/encoding.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+/**
+ * power of 2 histogram
+ *
+ * Bins live in the vector h.  Every mutating operation trims trailing
+ * zero bins, so an empty histogram has no bins at all.
+ */
+struct pow2_hist_t {
+  /**
+   * histogram
+   *
+   * bin size is 2^index
+   * value is count of elements that are <= the current bin but > the previous bin.
+   */
+  std::vector<int32_t> h;
+
+private:
+  /// grow to at least s bins (new bins start at 0); never shrinks
+  void _expand_to(unsigned s) {
+    if (s > h.size())
+      h.resize(s, 0);
+  }
+  /// drop useless trailing 0's
+  void _contract() {
+    unsigned p = h.size();
+    while (p > 0 && h[p-1] == 0)
+      --p;
+    h.resize(p);
+  }
+
+public:
+  /// remove all bins
+  void clear() {
+    h.clear();
+  }
+  /// true if there are no bins
+  bool empty() const {
+    return h.empty();
+  }
+  /// overwrite the count of one bin, growing the histogram if needed
+  void set_bin(int bin, int32_t count) {
+    _expand_to(bin + 1);
+    h[bin] = count;
+    _contract();
+  }
+
+  /// count one sample of value v in bin cbits(v)
+  void add(int32_t v) {
+    int bin = cbits(v);
+    _expand_to(bin + 1);
+    h[bin]++;
+    _contract();
+  }
+
+  bool operator==(const pow2_hist_t &r) const {
+    return h == r.h;
+  }
+
+  /// get a value's position in the histogram.
+  ///
+  /// positions are represented as values in the range [0..1000000]
+  /// (millionths on the unit interval).
+  ///
+  /// @param v [in] value (non-negative)
+  /// @param lower [out] pointer to lower-bound (0..1000000)
+  /// @param upper [out] pointer to the upper bound (0..1000000)
+  /// @return 0 on success; -1 if v is negative (outputs are then untouched)
+  ///
+  /// If the histogram holds no samples both outputs are set to 0.
+  int get_position_micro(int32_t v, uint64_t *lower, uint64_t *upper) {
+    if (v < 0)
+      return -1;
+    unsigned bin = cbits(v);
+    uint64_t lower_sum = 0, upper_sum = 0, total = 0;
+    for (unsigned i=0; i<h.size(); ++i) {
+      if (i <= bin)
+        upper_sum += h[i];
+      if (i < bin)
+        lower_sum += h[i];
+      total += h[i];
+    }
+    if (total > 0) {
+      *lower = lower_sum * 1000000 / total;
+      *upper = upper_sum * 1000000 / total;
+    } else {
+      // nothing recorded: report a defined position instead of leaving
+      // the caller's variables in whatever state they were in
+      *lower = 0;
+      *upper = 0;
+    }
+    return 0;
+  }
+
+  /// add another histogram's counts bin by bin
+  void add(const pow2_hist_t& o) {
+    _expand_to(o.h.size());
+    for (unsigned p = 0; p < o.h.size(); ++p)
+      h[p] += o.h[p];
+    _contract();
+  }
+  /// subtract another histogram's counts bin by bin
+  void sub(const pow2_hist_t& o) {
+    _expand_to(o.h.size());
+    for (unsigned p = 0; p < o.h.size(); ++p)
+      h[p] -= o.h[p];
+    _contract();
+  }
+
+  /// 2^(number of bins), saturated at the largest int32_t value
+  int32_t upper_bound() const {
+    // 1 << n only fits in an int32_t for n <= 30; beyond that the shift
+    // would reach (or pass) the sign bit, so saturate instead.
+    if (h.size() >= 31)
+      return 0x7fffffff;
+    return 1 << h.size();
+  }
+
+  /// decay histogram by N bits (default 1, for a halflife)
+  void decay(int bits = 1);
+
+  void dump(Formatter *f) const;
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::const_iterator &bl);
+  static void generate_test_instances(std::list<pow2_hist_t*>& o);
+};
+WRITE_CLASS_ENCODER(pow2_hist_t)
+
+#endif /* CEPH_HISTOGRAM_H */
diff --git a/src/common/hobject.cc b/src/common/hobject.cc
new file mode 100644
index 00000000..e5da4a35
--- /dev/null
+++ b/src/common/hobject.cc
@@ -0,0 +1,597 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "hobject.h"
+#include "common/Formatter.h"
+
+static void append_escaped(const string &in, string *out)
+{
+  // Copy `in` onto the end of `*out`, replacing '%', '.' and '_' with the
+  // two-character codes "%p", "%e" and "%u"; other characters pass through.
+  for (const char c : in) {
+    char code = 0;
+    switch (c) {
+    case '%': code = 'p'; break;
+    case '.': code = 'e'; break;
+    case '_': code = 'u'; break;
+    }
+    if (code)
+      out->append(1, '%').push_back(code);
+    else
+      out->push_back(c);
+  }
+}
+
+set<string> hobject_t::get_prefixes(  // builds "<pool hex>.<leading hash nibbles>" strings
+  uint32_t bits,
+  uint32_t mask,
+  int64_t pool)
+{
+  uint32_t len = bits;
+  while (len % 4 /* nibbles */) len++;  // round up to a whole number of hex digits
+
+  set<uint32_t> from;
+  if (bits < 32)
+    from.insert(mask & ~((uint32_t)(~0) << bits));  // keep only the low `bits` bits of mask
+  else if (bits == 32)
+    from.insert(mask);
+  else
+    ceph_abort();  // bits > 32 is rejected
+
+
+  set<uint32_t> to;  // scratch set, swapped with `from` after every pass
+  for (uint32_t i = bits; i < len; ++i) {  // bits in [bits, len) are unconstrained: emit both values
+    for (set<uint32_t>::iterator j = from.begin();
+	 j != from.end();
+	 ++j) {
+      to.insert(*j | (1U << i));
+      to.insert(*j);
+    }
+    to.swap(from);
+    to.clear();
+  }
+
+  char buf[20];
+  char *t = buf;
+  uint64_t poolid(pool);
+  t += snprintf(t, sizeof(buf), "%.*llX", 16, (long long unsigned)poolid);  // pool as 16 hex digits
+  *(t++) = '.';
+  string poolstr(buf, t - buf);
+  set<string> ret;
+  for (set<uint32_t>::iterator i = from.begin();
+       i != from.end();
+       ++i) {
+    uint32_t revhash(hobject_t::_reverse_nibbles(*i));
+    snprintf(buf, sizeof(buf), "%.*X", (int)(sizeof(revhash))*2, revhash);  // 8 hex digits
+    ret.insert(poolstr + string(buf, len/4));  // keep only the first len/4 of them
+  }
+  return ret;
+}
+
+string hobject_t::to_str() const  // "<pool>.<hash>.<snap>.<name>.<key>.<namespace>"
+{
+  string out;
+
+  char snap_with_hash[1000];
+  char *t = snap_with_hash;
+  const char *end = t + sizeof(snap_with_hash);
+
+  uint64_t poolid(pool);
+  t += snprintf(t, end - t, "%.*llX", 16, (long long unsigned)poolid);  // 16 hex digits
+
+  uint32_t revhash(get_nibblewise_key_u32());
+  t += snprintf(t, end - t, ".%.*X", 8, revhash);  // 8 hex digits
+
+  if (snap == CEPH_NOSNAP)
+    t += snprintf(t, end - t, ".head");
+  else if (snap == CEPH_SNAPDIR)
+    t += snprintf(t, end - t, ".snapdir");
+  else
+    t += snprintf(t, end - t, ".%llx", (long long unsigned)snap);
+
+  out.append(snap_with_hash, t);
+
+  out.push_back('.');
+  append_escaped(oid.name, &out);  // the three text fields are escaped, so '.' only appears as a separator
+  out.push_back('.');
+  append_escaped(get_key(), &out);
+  out.push_back('.');
+  append_escaped(nspace, &out);
+
+  return out;
+}
+
+void hobject_t::encode(bufferlist& bl) const  // field order must match decode()
+{
+  ENCODE_START(4, 3, bl);  // presumably struct version 4, compat version 3 -- confirm against the macro
+  encode(key, bl);
+  encode(oid, bl);
+  encode(snap, bl);
+  encode(hash, bl);
+  encode(max, bl);
+  encode(nspace, bl);
+  encode(pool, bl);
+  ceph_assert(!max || (*this == hobject_t(hobject_t::get_max())));  // a max object must be the canonical one
+  ENCODE_FINISH(bl);
+}
+
+void hobject_t::decode(bufferlist::const_iterator& bl)  // reads what encode() wrote, plus older layouts
+{
+  DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+  if (struct_v >= 1)
+    decode(key, bl);
+  decode(oid, bl);
+  decode(snap, bl);
+  decode(hash, bl);
+  if (struct_v >= 2)
+    decode(max, bl);
+  else
+    max = false;
+  if (struct_v >= 4) {  // nspace and pool are only read from version 4 on
+    decode(nspace, bl);
+    decode(pool, bl);
+    // for compat with hammer, which did not handle the transition
+    // from pool -1 -> pool INT64_MIN for MIN properly.  this object
+    // name looks a bit like a pgmeta object for the meta collection,
+    // but those do not ever exist (and is_pgmeta() pool >= 0).
+    if (pool == -1 &&
+	snap == 0 &&
+	hash == 0 &&
+	!max &&
+	oid.name.empty()) {
+      pool = INT64_MIN;
+      ceph_assert(is_min());
+    }
+
+    // for compatibility with some earlier versions which might have
+    // encoded a non-canonical max object
+    if (max) {
+      *this = hobject_t::get_max();
+    }
+  }
+  DECODE_FINISH(bl);
+  build_hash_cache();  // recompute the keys derived from hash
+}
+
+// Populate this object from the JSON object form written by dump().
+// Unknown members are ignored; members that are absent keep their value.
+void hobject_t::decode(json_spirit::Value& v)
+{
+  using namespace json_spirit;
+  for (Pair& p : v.get_obj()) {
+    if (p.name_ == "oid")
+      oid.name = p.value_.get_str();
+    else if (p.name_ == "key")
+      key = p.value_.get_str();
+    else if (p.name_ == "snapid")
+      snap = p.value_.get_uint64();
+    else if (p.name_ == "hash")
+      hash = p.value_.get_int();
+    else if (p.name_ == "max")
+      max = p.value_.get_int();
+    else if (p.name_ == "pool")
+      pool = p.value_.get_int64();  // pool is int64_t; get_int() would truncate it
+    else if (p.name_ == "namespace")
+      nspace = p.value_.get_str();
+  }
+  build_hash_cache();  // hash may have changed; refresh the keys derived from it
+}
+
+void hobject_t::dump(Formatter *f) const  // field names match decode(json_spirit::Value&)
+{
+  f->dump_string("oid", oid.name);
+  f->dump_string("key", key);
+  f->dump_int("snapid", snap);
+  f->dump_int("hash", hash);
+  f->dump_int("max", (int)max);  // the flag is emitted as an integer
+  f->dump_int("pool", pool);
+  f->dump_string("namespace", nspace);
+}
+
+void hobject_t::generate_test_instances(list<hobject_t*>& o)  // appends new'd sample objects to o
+{
+  o.push_back(new hobject_t);  // default-constructed
+  o.push_back(new hobject_t);
+  o.back()->max = true;  // default-constructed with the max flag set
+  o.push_back(new hobject_t(object_t("oname"), string(), 1, 234, -1, ""));
+  o.push_back(new hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+	67, 0, "n1"));
+  o.push_back(new hobject_t(object_t("oname3"), string("oname3"),
+	CEPH_SNAPDIR, 910, 1, "n2"));  // key equal to the name
+}
+
+// Append `in` to *out, writing '%', ':', '/' and chars < 32 or >= 127 as %xx (lowercase hex).
+static void append_out_escaped(const string &in, string *out)
+{
+  for (const char c : in) {
+    char esc[4] = "%";  // holds '%' + two hex digits + NUL when c needs escaping
+    if (c != '%' && c != ':' && c != '/' && c >= 32 && c < 127) {
+      out->push_back(c);
+      continue;
+    }
+    snprintf(esc + 1, sizeof(esc) - 1, "%02x", (int)(unsigned char)c);
+    out->append(esc);
+  }
+}
+
+// Inverse of append_out_escaped(): copy up to the first ':' or NUL into *out,
+// decoding "%xx"; returns where it stopped (at the '%' of a malformed escape).
+static const char *decode_out_escaped(const char *in, string *out)
+{
+  while (*in && *in != ':') {
+    if (*in == '%') {
+      // both hex digits must be present; stopping here keeps us inside the string
+      if (!isxdigit((unsigned char)in[1]) || !isxdigit((unsigned char)in[2]))
+	return in;
+      const char buf[3] = { in[1], in[2], 0 };
+      out->push_back((char)strtol(buf, NULL, 16));
+      in += 2;
+    } else {
+      out->push_back(*in);
+    }
+    ++in;
+  }
+  return in;
+}
+
+ostream& operator<<(ostream& out, const hobject_t& o)  // "MIN", "MAX" or pool:hash:nspace:key:name:snap
+{
+  if (o == hobject_t())
+    return out << "MIN";
+  if (o.is_max())
+    return out << "MAX";
+  out << o.pool << ':';
+  out << std::hex;
+  out.width(8);
+  out.fill('0');
+  out << o.get_bitwise_key_u32(); // << '~' << o.get_hash();
+  out.width(0);
+  out.fill(' ');
+  out << std::dec;  // put the stream back to decimal with a space fill
+  out << ':';
+  string v;
+  append_out_escaped(o.nspace, &v);  // escaping keeps ':' usable as the field separator
+  v.push_back(':');
+  append_out_escaped(o.get_key(), &v);
+  v.push_back(':');
+  append_out_escaped(o.oid.name, &v);
+  out << v << ':' << o.snap;
+  return out;
+}
+
+bool hobject_t::parse(const string &s)  // inverse of operator<<; *this is only modified on success
+{
+  if (s == "MIN") {
+    *this = hobject_t();
+    return true;
+  }
+  if (s == "MAX") {
+    *this = hobject_t::get_max();
+    return true;
+  }
+
+  const char *start = s.c_str();
+  long long po;
+  unsigned h;
+  int r = sscanf(start, "%lld:%x:", &po, &h);  // decimal pool, then hex hash
+  if (r != 2)
+    return false;
+  for (; *start && *start != ':'; ++start) ;  // step over the pool field
+  for (++start; *start && isxdigit(*start); ++start) ;  // step over the hash digits
+  if (*start != ':')
+    return false;
+
+  string ns, k, name;  // namespace, key and name, each terminated by ':'
+  const char *p = decode_out_escaped(start + 1, &ns);
+  if (*p != ':')
+    return false;
+  p = decode_out_escaped(p + 1, &k);
+  if (*p != ':')
+    return false;
+  p = decode_out_escaped(p + 1, &name);
+  if (*p != ':')
+    return false;
+  start = p + 1;
+
+  unsigned long long sn;
+  if (strncmp(start, "head", 4) == 0) {  // "head" stands for CEPH_NOSNAP
+    sn = CEPH_NOSNAP;
+    start += 4;
+    if (*start != 0)
+      return false;
+  } else {
+    r = sscanf(start, "%llx", &sn);  // otherwise the snap id is hex
+    if (r != 1)
+      return false;
+    for (++start; *start && isxdigit(*start); ++start) ;
+    if (*start)  // nothing may follow the snap id
+      return false;
+  }
+
+  max = false;
+  pool = po;
+  set_hash(_reverse_bits(h));  // the text form carries the bit-reversed hash
+  nspace = ns;
+  oid.name = name;
+  set_key(k);
+  snap = sn;
+  return true;
+}
+
+int cmp(const hobject_t& l, const hobject_t& r)  // three-way compare: -1, 0 or 1
+{
+  if (l.max < r.max)  // order: max, pool, bitwise key, namespace, key, oid, snap
+    return -1;
+  if (l.max > r.max)
+    return 1;
+  if (l.pool < r.pool)
+    return -1;
+  if (l.pool > r.pool)
+    return 1;
+  if (l.get_bitwise_key() < r.get_bitwise_key())
+    return -1;
+  if (l.get_bitwise_key() > r.get_bitwise_key())
+    return 1;
+  if (l.nspace < r.nspace)
+    return -1;
+  if (l.nspace > r.nspace)
+    return 1;
+  if (!(l.get_key().empty() && r.get_key().empty())) {  // skipped when neither side has an explicit key
+    if (l.get_effective_key() < r.get_effective_key()) {
+      return -1;
+    }
+    if (l.get_effective_key() > r.get_effective_key()) {
+      return 1;
+    }
+  }
+  if (l.oid < r.oid)
+    return -1;
+  if (l.oid > r.oid)
+    return 1;
+  if (l.snap < r.snap)
+    return -1;
+  if (l.snap > r.snap)
+    return 1;
+  return 0;
+}
+
+
+
+// This is compatible with decode for hobject_t prior to
+// version 5.
+void ghobject_t::encode(bufferlist& bl) const
+{
+  // when changing this, remember to update encoded_size() too.
+  ENCODE_START(6, 3, bl);
+  encode(hobj.key, bl);  // the hobject_t fields come first, in hobject_t::encode() order
+  encode(hobj.oid, bl);
+  encode(hobj.snap, bl);
+  encode(hobj.hash, bl);
+  encode(hobj.max, bl);
+  encode(hobj.nspace, bl);
+  encode(hobj.pool, bl);
+  encode(generation, bl);  // ghobject_t's own fields follow
+  encode(shard_id, bl);
+  encode(max, bl);
+  ENCODE_FINISH(bl);
+}
+
+size_t ghobject_t::encoded_size() const  // byte count matching the fields written by encode()
+{
+  // this is not in order of encoding or appearance, but rather
+  // in order of known constants first, so it can be (mostly) computed
+  // at compile time.
+  //  - encoding header + 3 string lengths
+  size_t r = sizeof(ceph_le32) + 2 * sizeof(__u8) + 3 * sizeof(__u32);
+
+  // hobj.snap
+  r += sizeof(uint64_t);
+
+  // hobj.hash
+  r += sizeof(uint32_t);
+
+  // hobj.max
+  r += sizeof(bool);
+
+  // hobj.pool
+  r += sizeof(uint64_t);
+
+  // generation (a ghobject_t member)
+  r += sizeof(uint64_t);
+
+  // shard_id (a ghobject_t member)
+  r += sizeof(int8_t);
+
+  // max
+  r += sizeof(bool);
+
+  // hobj.key
+  r += hobj.key.size();
+
+  // hobj.oid
+  r += hobj.oid.name.size();
+
+  // hobj.nspace
+  r += hobj.nspace.size();
+
+  return r;
+}
+
+void ghobject_t::decode(bufferlist::const_iterator& bl)  // reads what encode() wrote, plus older layouts
+{
+  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
+  if (struct_v >= 1)
+    decode(hobj.key, bl);
+  decode(hobj.oid, bl);
+  decode(hobj.snap, bl);
+  decode(hobj.hash, bl);
+  if (struct_v >= 2)
+    decode(hobj.max, bl);
+  else
+    hobj.max = false;
+  if (struct_v >= 4) {  // nspace and pool are only read from version 4 on
+    decode(hobj.nspace, bl);
+    decode(hobj.pool, bl);
+    // for compat with hammer, which did not handle the transition from
+    // pool -1 -> pool INT64_MIN for MIN properly (see hobject_t::decode()).
+    if (hobj.pool == -1 &&
+	hobj.snap == 0 &&
+	hobj.hash == 0 &&
+	!hobj.max &&
+	hobj.oid.name.empty()) {
+      hobj.pool = INT64_MIN;
+      ceph_assert(hobj.is_min());
+    }
+  }
+  if (struct_v >= 5) {
+    decode(generation, bl);
+    decode(shard_id, bl);
+  } else {  // older encodings carry neither field
+    generation = ghobject_t::NO_GEN;
+    shard_id = shard_id_t::NO_SHARD;
+  }
+  if (struct_v >= 6) {
+    decode(max, bl);
+  } else {
+    max = false;
+  }
+  DECODE_FINISH(bl);
+  hobj.build_hash_cache();  // recompute the keys derived from hobj.hash
+}
+
+void ghobject_t::decode(json_spirit::Value& v)  // JSON form: hobject_t fields plus generation/shard_id/max
+{
+  hobj.decode(v);  // the embedded hobject_t reads its own members from the same object
+  using namespace json_spirit;
+  Object& o = v.get_obj();
+  for (Object::size_type i=0; i<o.size(); i++) {
+    Pair& p = o[i];
+    if (p.name_ == "generation")
+      generation = p.value_.get_uint64();
+    else if (p.name_ == "shard_id")
+      shard_id.id = p.value_.get_int();
+    else if (p.name_ == "max")  // hobj.decode(v) above stores this same member into hobj.max
+      max = p.value_.get_int();
+  }
+}
+
+void ghobject_t::dump(Formatter *f) const  // hobject_t fields first, then our own
+{
+  hobj.dump(f);
+  if (generation != NO_GEN)  // generation and shard_id are omitted when unset
+    f->dump_int("generation", generation);
+  if (shard_id != shard_id_t::NO_SHARD)
+    f->dump_int("shard_id", shard_id);
+  f->dump_int("max", (int)max);  // hobj.dump() already wrote a "max" entry for hobj.max
+}
+
+void ghobject_t::generate_test_instances(list<ghobject_t*>& o)  // appends new'd sample objects to o
+{
+  o.push_back(new ghobject_t);  // default-constructed
+  o.push_back(new ghobject_t);
+  o.back()->hobj.max = true;  // only the embedded hobject_t's max flag is set here
+  o.push_back(new ghobject_t(hobject_t(object_t("oname"), string(), 1, 234, -1, "")));
+
+  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+        67, 0, "n1"), 1, shard_id_t(0)));  // same object, generation 1, shards 0..2
+  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+        67, 0, "n1"), 1, shard_id_t(1)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname2"), string("okey"), CEPH_NOSNAP,
+        67, 0, "n1"), 1, shard_id_t(2)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 1, shard_id_t(0)));  // same object, generations 1..3 on shard 0
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 2, shard_id_t(0)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 3, shard_id_t(0)));
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 3, shard_id_t(1)));  // generation 3 on shards 1 and 2
+  o.push_back(new ghobject_t(hobject_t(object_t("oname3"), string("oname3"),
+        CEPH_SNAPDIR, 910, 1, "n2"), 3, shard_id_t(2)));
+}
+
+ostream& operator<<(ostream& out, const ghobject_t& o)  // "GHMIN", "GHMAX" or [shard]#hobject#[generation]
+{
+  if (o == ghobject_t())
+    return out << "GHMIN";
+  if (o.is_max())
+    return out << "GHMAX";
+  if (o.shard_id != shard_id_t::NO_SHARD)  // the shard is written in hex, and only when set
+    out << std::hex << o.shard_id << std::dec;
+  out << '#' << o.hobj << '#';
+  if (o.generation != ghobject_t::NO_GEN)  // the generation is written in hex, and only when set
+    out << std::hex << (unsigned long long)(o.generation) << std::dec;
+  return out;
+}
+
+// Parse the form written by operator<<: "GHMIN", "GHMAX", or
+// "[shard]#<hobject>#[generation]" (hex); on failure *this is unchanged.
+bool ghobject_t::parse(const string& s)
+{
+  if (s == "GHMIN") {
+    *this = ghobject_t();
+    return true;
+  }
+  if (s == "GHMAX") {
+    *this = ghobject_t::get_max();
+    return true;
+  }
+
+  // look for shard# prefix
+  const char *start = s.c_str();
+  const char *p;
+  int sh = shard_id_t::NO_SHARD;
+  for (p = start; *p && isxdigit((unsigned char)*p); ++p) ;
+  if (*p != '#')  // the (possibly empty) shard must end in '#'
+    return false;
+  if (p > start) {
+    int r = sscanf(s.c_str(), "%x", &sh);
+    if (r < 1)
+      return false;
+  }
+  start = p + 1;
+
+  // look for #generation suffix
+  long long unsigned g = NO_GEN;
+  const char *last = start + strlen(start) - 1;
+  p = last;
+  while (p >= start && isxdigit((unsigned char)*p))
+    p--;
+  // a second '#' is required; stepping back onto the shard's would make inner's length negative
+  if (p < start || *p != '#')
+    return false;
+  if (p < last)
+    sscanf(p + 1, "%llx", &g);
+
+  string inner(start, p - start);
+  hobject_t h;
+  if (!h.parse(inner)) {
+    return false;
+  }
+
+  shard_id = shard_id_t(sh);
+  hobj = h;
+  generation = g;
+  max = false;
+  return true;
+}
+
+int cmp(const ghobject_t& l, const ghobject_t& r)  // three-way compare: negative, 0 or positive
+{
+  if (l.max < r.max)  // order: max, shard_id, hobj, generation
+    return -1;
+  if (l.max > r.max)
+    return 1;
+  if (l.shard_id < r.shard_id)
+    return -1;
+  if (l.shard_id > r.shard_id)
+    return 1;
+  int ret = cmp(l.hobj, r.hobj);  // defer to the hobject_t ordering
+  if (ret != 0)
+    return ret;
+  if (l.generation < r.generation)
+    return -1;
+  if (l.generation > r.generation)
+    return 1;
+  return 0;
+}
diff --git a/src/common/hobject.h b/src/common/hobject.h
new file mode 100644
index 00000000..e0ec42f4
--- /dev/null
+++ b/src/common/hobject.h
@@ -0,0 +1,512 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __CEPH_OS_HOBJECT_H
+#define __CEPH_OS_HOBJECT_H
+
+#include "include/types.h"
+#include "include/cmp.h"
+
+#include "json_spirit/json_spirit_value.h"
+#include "include/ceph_assert.h"   // spirit clobbers it!
+
+#include "reverse.h"
+
+namespace ceph {
+  class Formatter;
+}
+
+#ifndef UINT64_MAX
+#define UINT64_MAX (18446744073709551615ULL)
+#endif
+#ifndef INT64_MIN
+#define INT64_MIN ((int64_t)0x8000000000000000ll)
+#endif
+
+struct hobject_t {
+public:
+  static const int64_t POOL_META = -1;
+  static const int64_t POOL_TEMP_START = -2; // and then negative
+
+  static bool is_temp_pool(int64_t pool) {
+    return pool <= POOL_TEMP_START;
+  }
+  static int64_t get_temp_pool(int64_t pool) {
+    return POOL_TEMP_START - pool;
+  }
+  static bool is_meta_pool(int64_t pool) {
+    return pool == POOL_META;
+  }
+
+public:
+  object_t oid;
+  snapid_t snap;
+private:
+  uint32_t hash;
+  bool max;
+  uint32_t nibblewise_key_cache;
+  uint32_t hash_reverse_bits;
+public:
+  int64_t pool;
+  string nspace;
+
+private:
+  string key;
+
+  class hobject_t_max {};
+
+public:
+  const string &get_key() const {
+    return key;
+  }
+
+  void set_key(const std::string &key_) {
+    if (key_ == oid.name)
+      key.clear();
+    else
+      key = key_;
+  }
+
+  string to_str() const;
+  
+  uint32_t get_hash() const { 
+    return hash;
+  }
+  void set_hash(uint32_t value) { 
+    hash = value;
+    build_hash_cache();
+  }
+
+  static bool match_hash(uint32_t to_check, uint32_t bits, uint32_t match) {  // do the low `bits` bits agree?
+    return !((match ^ to_check) & (bits < 32 ? ~(~uint32_t(0) << bits) : ~uint32_t(0)));  // shifting by >= 32 is undefined, so 32+ compares every bit
+  }
+  bool match(uint32_t bits, uint32_t match) const {
+    return match_hash(hash, bits, match);
+  }
+
+  bool is_temp() const {
+    return is_temp_pool(pool) && pool != INT64_MIN;
+  }
+  bool is_meta() const {
+    return is_meta_pool(pool);
+  }
+  int64_t get_logical_pool() const {
+    if (is_temp_pool(pool))
+      return get_temp_pool(pool);  // it's reversible
+    else
+      return pool;
+  }
+
+  hobject_t() : snap(0), hash(0), max(false), pool(INT64_MIN) {
+    build_hash_cache();
+  }
+
+  hobject_t(const hobject_t &rhs) = default;
+  hobject_t(hobject_t &&rhs) = default;
+  hobject_t(hobject_t_max &&singleton) : hobject_t() {
+    max = true;
+  }
+  hobject_t &operator=(const hobject_t &rhs) = default;
+  hobject_t &operator=(hobject_t &&rhs) = default;
+  hobject_t &operator=(hobject_t_max &&singleton) {
+    *this = hobject_t();
+    max = true;
+    return *this;
+  }
+
+  // maximum sorted value.
+  static hobject_t_max get_max() {
+    return hobject_t_max();
+  }
+
+  hobject_t(object_t oid, const string& key, snapid_t snap, uint32_t hash,
+	    int64_t pool, string nspace)
+    : oid(oid), snap(snap), hash(hash), max(false),
+      pool(pool), nspace(nspace),
+      key(oid.name == key ? string() : key) {
+    build_hash_cache();
+  }
+
+  hobject_t(const sobject_t &soid, const string &key, uint32_t hash,
+	    int64_t pool, string nspace)
+    : oid(soid.oid), snap(soid.snap), hash(hash), max(false),
+      pool(pool), nspace(nspace),
+      key(soid.oid.name == key ? string() : key) {
+    build_hash_cache();
+  }
+
+  /// @return min hobject_t ret s.t. ret.hash == this->hash
+  hobject_t get_boundary() const {
+    if (is_max())
+      return *this;
+    hobject_t ret;
+    ret.set_hash(hash);
+    ret.pool = pool;
+    return ret;
+  }
+
+  hobject_t get_object_boundary() const {
+    if (is_max())
+      return *this;
+    hobject_t ret = *this;
+    ret.snap = 0;
+    return ret;
+  }
+
+  /// @return head version of this hobject_t
+  hobject_t get_head() const {
+    hobject_t ret(*this);
+    ret.snap = CEPH_NOSNAP;
+    return ret;
+  }
+
+  /// @return snapdir version of this hobject_t
+  hobject_t get_snapdir() const {
+    hobject_t ret(*this);
+    ret.snap = CEPH_SNAPDIR;
+    return ret;
+  }
+
+  /// @return true if object is snapdir
+  bool is_snapdir() const {
+    return snap == CEPH_SNAPDIR;
+  }
+
+  /// @return true if object is head
+  bool is_head() const {
+    return snap == CEPH_NOSNAP;
+  }
+
+  /// @return true if object is neither head nor snapdir nor max
+  bool is_snap() const {
+    return !is_max() && !is_head() && !is_snapdir();
+  }
+
+  /// @return true iff the object should have a snapset in its attrs
+  bool has_snapset() const {
+    return is_head() || is_snapdir();
+  }
+
+  /* Do not use when a particular hash function is needed */
+  explicit hobject_t(const sobject_t &o) :
+    oid(o.oid), snap(o.snap), max(false), pool(POOL_META) {
+    set_hash(std::hash<sobject_t>()(o));
+  }
+
+  bool is_max() const {
+    ceph_assert(!max || (*this == hobject_t(hobject_t::get_max())));
+    return max;
+  }
+  bool is_min() const {
+    // this needs to match how it's constructed
+    return snap == 0 &&
+	   hash == 0 &&
+	   !max &&
+	   pool == INT64_MIN;
+  }
+
+  static uint32_t _reverse_bits(uint32_t v) {
+    return reverse_bits(v);
+  }
+  static uint32_t _reverse_nibbles(uint32_t retval) {
+    return reverse_nibbles(retval);
+  }
+
+  /**
+   * Returns set S of strings such that for any object
+   * h where h.match(bits, mask), there is some string
+   * s \f$\in\f$ S such that s is a prefix of h.to_str().
+   * Furthermore, for any s \f$\in\f$ S, s is a prefix of
+   * h.to_str() implies that h.match(bits, mask).
+   */
+  static set<string> get_prefixes(
+    uint32_t bits,
+    uint32_t mask,
+    int64_t pool);
+
+  // filestore nibble-based key
+  uint32_t get_nibblewise_key_u32() const {
+    ceph_assert(!max);
+    return nibblewise_key_cache;
+  }
+  uint64_t get_nibblewise_key() const {
+    return max ? 0x100000000ull : nibblewise_key_cache;
+  }
+
+  // newer bit-reversed key
+  uint32_t get_bitwise_key_u32() const {
+    ceph_assert(!max);
+    return hash_reverse_bits;
+  }
+  uint64_t get_bitwise_key() const {
+    return max ? 0x100000000ull : hash_reverse_bits;
+  }
+
+  // please remember to update set_bitwise_key_u32() also
+  // once you change build_hash_cache()
+  void build_hash_cache() {
+    nibblewise_key_cache = _reverse_nibbles(hash);
+    hash_reverse_bits = _reverse_bits(hash);
+  }
+  void set_bitwise_key_u32(uint32_t value) {
+    hash = _reverse_bits(value);
+    // below is identical to build_hash_cache() and shall be
+    // updated correspondingly if you change build_hash_cache() 
+    nibblewise_key_cache = _reverse_nibbles(hash);
+    hash_reverse_bits = value;
+  }
+
+  const string& get_effective_key() const {
+    if (key.length())
+      return key;
+    return oid.name;
+  }
+
+  hobject_t make_temp_hobject(const string& name) const {
+    return hobject_t(object_t(name), "", CEPH_NOSNAP,
+		     hash,
+		     get_temp_pool(pool),
+		     "");
+  }
+
+  void swap(hobject_t &o) {
+    hobject_t temp(o);
+    o = (*this);
+    (*this) = temp;
+  }
+
+  const string &get_namespace() const {
+    return nspace;
+  }
+
+  bool parse(const string& s);
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void decode(json_spirit::Value& v);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<hobject_t*>& o);
+  friend int cmp(const hobject_t& l, const hobject_t& r);
+  friend bool operator>(const hobject_t& l, const hobject_t& r) {
+    return cmp(l, r) > 0;
+  }
+  friend bool operator>=(const hobject_t& l, const hobject_t& r) {
+    return cmp(l, r) >= 0;
+  }
+  friend bool operator<(const hobject_t& l, const hobject_t& r) {
+    return cmp(l, r) < 0;
+  }
+  friend bool operator<=(const hobject_t& l, const hobject_t& r) {
+    return cmp(l, r) <= 0;
+  }
+  friend bool operator==(const hobject_t&, const hobject_t&);
+  friend bool operator!=(const hobject_t&, const hobject_t&);
+  friend struct ghobject_t;
+};
+WRITE_CLASS_ENCODER(hobject_t)
+
+namespace std {
+  template<> struct hash<hobject_t> {
+    size_t operator()(const hobject_t &r) const {
+      static rjhash<uint64_t> RJ;
+      return RJ(r.get_hash() ^ r.snap);
+    }
+  };
+} // namespace std
+
+ostream& operator<<(ostream& out, const hobject_t& o);
+
+WRITE_EQ_OPERATORS_7(hobject_t, hash, oid, get_key(), snap, pool, max, nspace)
+
+template <typename T>
+struct always_false {
+  using value = std::false_type;
+};
+
+template <typename T>
+inline bool operator==(const hobject_t &lhs, const T&) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return lhs.is_max();
+}
+template <typename T>
+inline bool operator==(const T&, const hobject_t &rhs) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return rhs.is_max();
+}
+template <typename T>
+inline bool operator!=(const hobject_t &lhs, const T&) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return !lhs.is_max();
+}
+template <typename T>
+inline bool operator!=(const T&, const hobject_t &rhs) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return !rhs.is_max();
+}
+
+extern int cmp(const hobject_t& l, const hobject_t& r);
+template <typename T>
+static inline int cmp(const hobject_t &l, const T&) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return l.is_max() ? 0 : -1;
+}
+template <typename T>
+static inline int cmp(const T&, const hobject_t&r) {
+  static_assert(always_false<T>::value::value, "Do not compare to get_max()");
+  return r.is_max() ? 0 : 1;
+}
+
+
+
+typedef version_t gen_t;
+
+struct ghobject_t {
+  hobject_t hobj;
+  gen_t generation;
+  shard_id_t shard_id;
+  bool max;
+
+public:
+  static const gen_t NO_GEN = UINT64_MAX;
+
+  ghobject_t()
+    : generation(NO_GEN),
+      shard_id(shard_id_t::NO_SHARD),
+      max(false) {}
+
+  explicit ghobject_t(const hobject_t &obj)
+    : hobj(obj),
+      generation(NO_GEN),
+      shard_id(shard_id_t::NO_SHARD),
+      max(false) {}
+
+  ghobject_t(const hobject_t &obj, gen_t gen, shard_id_t shard)
+    : hobj(obj),
+      generation(gen),
+      shard_id(shard),
+      max(false) {}
+
+  static ghobject_t make_pgmeta(int64_t pool, uint32_t hash, shard_id_t shard) {
+    hobject_t h(object_t(), string(), CEPH_NOSNAP, hash, pool, string());
+    return ghobject_t(h, NO_GEN, shard);
+  }
+  bool is_pgmeta() const {
+    // make sure we are distinct from hobject_t(), which has pool INT64_MIN
+    return hobj.pool >= 0 && hobj.oid.name.empty();
+  }
+
+  bool match(uint32_t bits, uint32_t match) const {
+    return hobj.match_hash(hobj.hash, bits, match);
+  }
+  /// @return min ghobject_t ret s.t. ret.hash == this->hash
+  ghobject_t get_boundary() const {
+    if (hobj.is_max())
+      return *this;
+    ghobject_t ret;
+    ret.hobj.set_hash(hobj.hash);
+    ret.shard_id = shard_id;
+    ret.hobj.pool = hobj.pool;
+    return ret;
+  }
+  uint32_t get_nibblewise_key_u32() const {
+    return hobj.get_nibblewise_key_u32();
+  }
+  uint64_t get_nibblewise_key() const {  // 64-bit like hobject_t's: a max object yields 0x100000000
+    return hobj.get_nibblewise_key();  // a uint32_t return truncated that value to 0
+  }
+
+  bool is_degenerate() const {
+    return generation == NO_GEN && shard_id == shard_id_t::NO_SHARD;
+  }
+
+  bool is_no_gen() const {
+    return generation == NO_GEN;
+  }
+
+  bool is_no_shard() const {
+    return shard_id == shard_id_t::NO_SHARD;
+  }
+
+  void set_shard(shard_id_t s) {
+    shard_id = s;
+  }
+
+  bool parse(const string& s);
+
+  // maximum sorted value.
+  static ghobject_t get_max() {
+    ghobject_t h;
+    h.max = true;
+    h.hobj = hobject_t::get_max();  // so that is_max() => hobj.is_max()
+    return h;
+  }
+  bool is_max() const {
+    return max;
+  }
+  bool is_min() const {
+    return *this == ghobject_t();
+  }
+
+  void swap(ghobject_t &o) {
+    ghobject_t temp(o);
+    o = (*this);
+    (*this) = temp;
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void decode(json_spirit::Value& v);
+  size_t encoded_size() const;
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<ghobject_t*>& o);
+  friend int cmp(const ghobject_t& l, const ghobject_t& r);
+  friend bool operator>(const ghobject_t& l, const ghobject_t& r) {
+    return cmp(l, r) > 0;
+  }
+  friend bool operator>=(const ghobject_t& l, const ghobject_t& r) {
+    return cmp(l, r) >= 0;
+  }
+  friend bool operator<(const ghobject_t& l, const ghobject_t& r) {
+    return cmp(l, r) < 0;
+  }
+  friend bool operator<=(const ghobject_t& l, const ghobject_t& r) {
+    return cmp(l, r) <= 0;
+  }
+  friend bool operator==(const ghobject_t&, const ghobject_t&);
+  friend bool operator!=(const ghobject_t&, const ghobject_t&);
+
+};
+WRITE_CLASS_ENCODER(ghobject_t)
+
+namespace std {
+  template<> struct hash<ghobject_t> {
+    size_t operator()(const ghobject_t &r) const {
+      static rjhash<uint64_t> RJ;
+      static hash<hobject_t> HO;
+      size_t hash = HO(r.hobj);
+      hash = RJ(hash ^ r.generation);
+      hash = hash ^ r.shard_id.id;
+      return hash;
+    }
+  };
+} // namespace std
+
+ostream& operator<<(ostream& out, const ghobject_t& o);
+
+WRITE_EQ_OPERATORS_4(ghobject_t, max, shard_id, hobj, generation)
+
+extern int cmp(const ghobject_t& l, const ghobject_t& r);
+
+
+#endif
diff --git a/src/common/hostname.cc b/src/common/hostname.cc
new file mode 100644
index 00000000..879fc939
--- /dev/null
+++ b/src/common/hostname.cc
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/hostname.h"
+
+#include <unistd.h>
+
+// Host name: $NODE_NAME when set, else gethostname(); empty if that call fails.
+std::string ceph_get_hostname()
+{
+  // are we in a container?  if so we would prefer the *real* hostname.
+  const char *node_name = getenv("NODE_NAME");
+  if (node_name)
+    return node_name;
+
+  // zero-filled and one byte kept back, so a truncated name is still terminated
+  char buf[1024] = {0};
+  return gethostname(buf, sizeof(buf) - 1) == 0 ? std::string(buf) : std::string();
+}
+
+// Host name truncated at its first '.'; returned whole when it contains none.
+std::string ceph_get_short_hostname()
+{
+  std::string full = ceph_get_hostname();
+  const size_t dot = full.find('.');
+  if (dot != std::string::npos)
+  {
+    // keep only the label in front of the first dot
+    return full.substr(0, dot);
+  }
+
+  return full;
+}
diff --git a/src/common/hostname.h b/src/common/hostname.h
new file mode 100644
index 00000000..9d270bf6
--- /dev/null
+++ b/src/common/hostname.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_HOSTNAME_H
+#define CEPH_COMMON_HOSTNAME_H
+
+#include <string>
+
+extern std::string ceph_get_hostname();
+extern std::string ceph_get_short_hostname();
+#endif
diff --git a/src/common/inline_variant.h b/src/common/inline_variant.h
new file mode 100644
index 00000000..28426ba7
--- /dev/null
+++ b/src/common/inline_variant.h
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:t -*-
+// vim: ts=8 sw=4 smarttab
+/*
+ * Copied from:
+ * https://github.com/exclipy/inline_variant_visitor/blob/master/inline_variant.hpp
+ */
+
+#ifndef INLINE_VARIANT_H
+#define INLINE_VARIANT_H
+
+#include <boost/function_types/function_arity.hpp>
+#include <boost/fusion/algorithm/transformation/transform.hpp>
+#include <boost/mpl/contains.hpp>
+#include <boost/mpl/map.hpp>
+#include <boost/mpl/vector.hpp>
+#include <boost/mpl/range_c.hpp>
+#include <boost/noncopyable.hpp>
+
+#include "function_signature.h"
+
+namespace detail {
+
+// Metafunction class mapping a unary function (or functor) type to the type
+// of its single argument.
+struct function_arg_extractor
+{
+    // Function may be a function type such as void(int const&), or a functor,
+    // e.g. a class with void operator()(int).  The nested `type` is the
+    // argument type with reference and const removed (int in both examples).
+    template <typename Function>
+    struct apply
+    {
+    private:
+        using bare_type =
+            typename boost::remove_const<
+                typename boost::remove_reference<Function>::type >::type;
+        using normalized_function_type = typename signature_of<bare_type>::type;
+        using arity =
+            typename boost::function_types::function_arity<normalized_function_type>::type;
+        using parameter_types =
+            typename boost::function_types::parameter_types<normalized_function_type>::type;
+        using result_type =
+            typename boost::function_types::result_type<normalized_function_type>::type;
+
+        // handlers must take exactly one argument
+        BOOST_STATIC_ASSERT_MSG((arity::value == 1), "make_visitor called with a non-unary function");
+
+        using parameter_type = typename boost::mpl::front<parameter_types>::type;
+    public:
+        using type =
+            typename boost::remove_const<
+                typename boost::remove_reference<parameter_type>::type >::type;
+    };
+};
+
+// Metafunction class that wraps its two arguments in a boost::mpl::pair.
+struct make_pair
+{
+    template <typename AType, typename Ind>
+    struct apply
+    {
+        using type = boost::mpl::pair<AType, Ind>;
+    };
+};
+
+// Metafunction class for use with boost::mpl::fold: statically asserts that
+// the first element of the pair Type2 is contained in Allowed, and yields
+// void.  Type1 (the fold state) is ignored.
+template<typename Allowed>
+struct check_in
+{
+    template <typename Type1, typename Type2>
+    struct apply
+    {
+    private:
+        using candidate = typename boost::mpl::first<Type2>::type;
+        BOOST_STATIC_ASSERT_MSG((boost::mpl::contains<Allowed, candidate>::value),
+                "make_visitor called with spurious handler functions");
+    public:
+        using type = void;
+    };
+};
+
+// Builds a boost::mpl::map by inserting every element of Seq, in order, into
+// an initially empty map.
+template <typename Seq>
+struct as_map
+{
+private:
+    // fold step: insert element P into the map M accumulated so far
+    struct insert_helper
+    {
+        template <typename M, typename P>
+        struct apply
+        {
+            using type = typename boost::mpl::insert<M, P>::type;
+        };
+    };
+public:
+    using type = typename boost::mpl::fold<Seq, boost::mpl::map0<>, insert_helper>::type;
+};
+
+// A functor template suitable for passing into apply_visitor.  The constructor accepts the list of handler functions,
+// which are then exposed through a set of operator()s
+//
+// Result    - the visitor's result type (handed to boost::static_visitor)
+// Variant   - the variant type; its nested `types` list is used for checking
+// Functions - the handler types, one per handled alternative
+template <typename Result, typename Variant, typename... Functions>
+struct generic_visitor : boost::static_visitor<Result>, boost::noncopyable
+{
+private:
+    typedef generic_visitor<Result, Variant, Functions...> type;
+
+    // Compute the function_map type
+    // fmap maps each handler's (bare) argument type to the handler's index
+    // within Functions... / fvec.
+    typedef boost::mpl::vector<Functions...> function_types;
+    typedef typename boost::mpl::transform<function_types, function_arg_extractor>::type arg_types;
+    typedef typename boost::mpl::transform<
+        arg_types,
+	boost::mpl::range_c<int, 0, boost::mpl::size<arg_types>::value>,
+	make_pair
+	>::type pair_list;
+    typedef typename as_map<pair_list>::type fmap;
+
+    // Check that the argument types are unique
+    // (duplicate keys would collapse in the map and shrink its size)
+    BOOST_STATIC_ASSERT_MSG((boost::mpl::size<fmap>::value == boost::mpl::size<arg_types>::value),
+            "make_visitor called with non-unique argument types for handler functions");
+
+    // Check that there aren't any argument types not in the variant types
+    typedef typename boost::mpl::fold<fmap, void, check_in<typename Variant::types> >::type dummy;
+
+    // the handlers themselves, in the order they were passed in
+    boost::fusion::vector<Functions...> fvec;
+
+
+    // T has a handler: look up its index in fmap and invoke that handler.
+    template <typename T>
+    Result apply_helper(const T& object, boost::mpl::true_) const {
+	typedef typename boost::mpl::at<fmap, T>::type Ind;
+        return boost::fusion::at<Ind>(fvec)(object);
+    }
+
+    // T has no handler: yields a default-constructed Result.  operator()
+    // static-asserts that a handler exists, so this overload is not expected
+    // to be selected in code that compiles.
+    template <typename T>
+    Result apply_helper(const T& object, boost::mpl::false_) const {
+        return Result();
+    }
+
+    BOOST_MOVABLE_BUT_NOT_COPYABLE(generic_visitor)
+
+public:
+    // Move constructor: takes over the handlers of `other`.
+    generic_visitor(BOOST_RV_REF(type) other)
+    :
+        fvec(boost::move(other.fvec))
+    {
+    }
+    // Stores the given handlers, forwarding each one into fvec.
+    generic_visitor(Functions&&... functions)
+    :
+        fvec(std::forward<Functions>(functions)...)
+    {
+    }
+
+    // Dispatches `object` to the handler whose argument type is T; fails to
+    // compile if no such handler was supplied.
+    template <typename T>
+    Result operator()(const T& object) const {
+        typedef typename boost::mpl::has_key<fmap, T>::type correct_key;
+        BOOST_STATIC_ASSERT_MSG(correct_key::value,
+            "make_visitor called without specifying handlers for all required types");
+        return apply_helper(object, correct_key());
+    }
+};
+
+// A metafunction class for getting the return type of a function
+// Function is first normalized with signature_of; the nested `type` is then
+// inherited from boost::function_types::result_type.
+struct function_return_extractor
+{
+    template <typename Function>
+    struct apply : boost::function_types::result_type<typename signature_of<Function>::type>
+    {
+    };
+};
+
+// Metafunction class for use with boost::mpl::fold: statically asserts that
+// Type1 and Type2 are the same type and yields Type1.
+struct check_same
+{
+    template <typename Type1, typename Type2>
+    struct apply
+    {
+    private:
+        BOOST_STATIC_ASSERT_MSG((boost::is_same<Type1, Type2>::value),
+                "make_visitor called with functions of differing return types");
+    public:
+        using type = Type1;
+    };
+};
+
+// Metafunction computing the generic_visitor specialization (`type`) and its
+// result type (`result_type`) for a Variant and a set of handler Functions.
+template <typename Variant, typename... Functions>
+struct get_generic_visitor
+{
+private:
+    using function_types = boost::mpl::vector<Functions...>;
+    // handler types with reference and const stripped
+    using bare_function_types = typename boost::mpl::transform<
+        function_types,
+        boost::remove_const< boost::remove_reference<boost::mpl::_1> >
+    >::type;
+    using return_types =
+        typename boost::mpl::transform<bare_function_types, function_return_extractor>::type;
+
+public:
+    // result_type is the return type of the first handler
+    using result_type = typename boost::mpl::front<return_types>::type;
+    using type = generic_visitor<result_type, Variant, Functions...>;
+
+private:
+    // Folding check_same over the return types asserts that each of them is
+    // the same as the first one.
+    using dummy = typename boost::mpl::fold<return_types, result_type, check_same>::type;
+};
+
+// Accepts a set of functions and returns an object suitable for apply_visitor
+// Variant must be given explicitly; Functions are deduced from the arguments.
+// The returned object is the generic_visitor type computed by
+// get_generic_visitor<Variant, Functions...>, constructed from the forwarded
+// handlers.
+template <typename Variant, typename... Functions>
+auto make_visitor(BOOST_RV_REF(Functions)... functions)
+    -> typename detail::get_generic_visitor<Variant, Functions...>::type
+{
+    return typename detail::get_generic_visitor<Variant, Functions...>::type(boost::forward<Functions>(functions)...);
+}
+
+}
+
+// Applies the handler matching the alternative currently held by `variant`.
+// A visitor is built from `functions` with detail::make_visitor and passed to
+// boost::apply_visitor; the result is that visitor's result_type (the return
+// type of the first handler).
+template <typename Variant, typename... Functions>
+auto match(Variant const& variant, BOOST_RV_REF(Functions)... functions)
+    -> typename detail::get_generic_visitor<Variant, Functions...>::result_type
+{
+    return boost::apply_visitor(detail::make_visitor<Variant>(
+        boost::forward<Functions>(functions)...), variant);
+}
+
+#endif
diff --git a/src/common/interval_map.h b/src/common/interval_map.h
new file mode 100644
index 00000000..320c8843
--- /dev/null
+++ b/src/common/interval_map.h
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef INTERVAL_MAP_H
+#define INTERVAL_MAP_H
+
+#include "include/interval_set.h"
+
+template <typename K, typename V, typename S>
+/**
+ * interval_map
+ *
+ * Maps intervals to values.  Erasing or inserting over an existing
+ * range will use S::split() to cut down any overlapping existing
+ * values.
+ *
+ * The policy object S is used through these calls:
+ *   - s.length(v): compared against the interval length on insert
+ *   - s.split(off, len, v): the part of v covering [off, off + len),
+ *     with off relative to the start of v's interval
+ *   - s.can_merge(a, b) / s.merge(a, b): decide whether, and how, the
+ *     values of two adjacent intervals are combined into one
+ *
+ * Surprisingly, boost/icl/interval_map doesn't seem to be appropriate
+ * for this use case.  The aggregation concept seems to assume
+ * commutativity, which doesn't work if we want more recent insertions
+ * to overwrite previous ones.
+ */
+class interval_map {
+  S s;
+  // key: interval offset; mapped value: (interval length, value)
+  using map = std::map<K, std::pair<K, V> >;
+  using mapiter = typename std::map<K, std::pair<K, V> >::iterator;
+  using cmapiter = typename std::map<K, std::pair<K, V> >::const_iterator;
+  map m;
+  // Returns the iterator range [fst, lst) of the stored intervals that
+  // may overlap [off, off + len).
+  std::pair<mapiter, mapiter> get_range(K off, K len) {
+    // fst is first iterator with end after off (may be end)
+    auto fst = m.upper_bound(off);
+    if (fst != m.begin())
+      --fst;
+    if (fst != m.end() && off >= (fst->first + fst->second.first))
+      ++fst;
+
+    // lst is first iterator with start after off + len (may be end)
+    auto lst = m.lower_bound(off + len);
+    return std::make_pair(fst, lst);
+  }
+  // const counterpart of get_range() above; same logic on const iterators
+  std::pair<cmapiter, cmapiter> get_range(K off, K len) const {
+    // fst is first iterator with end after off (may be end)
+    auto fst = m.upper_bound(off);
+    if (fst != m.begin())
+      --fst;
+    if (fst != m.end() && off >= (fst->first + fst->second.first))
+      ++fst;
+
+    // lst is first iterator with start after off + len (may be end)
+    auto lst = m.lower_bound(off + len);
+    return std::make_pair(fst, lst);
+  }
+  // Merges the entry at niter with its predecessor and then with its
+  // successor, whenever the two intervals are directly adjacent and
+  // s.can_merge() accepts their values.
+  void try_merge(mapiter niter) {
+    if (niter != m.begin()) {
+      auto prev = niter;
+      prev--;
+      if (prev->first + prev->second.first == niter->first &&
+	  s.can_merge(prev->second.second, niter->second.second)) {
+	V n = s.merge(
+	  std::move(prev->second.second),
+	  std::move(niter->second.second));
+	K off = prev->first;
+	K len = niter->first + niter->second.first - off;
+	niter++;
+	m.erase(prev, niter);
+	auto p = m.insert(
+	  std::make_pair(
+	    off,
+	    std::make_pair(len, std::move(n))));
+	ceph_assert(p.second);
+	// continue with the merged entry when looking at the successor
+	niter = p.first;
+      }
+    }
+    auto next = niter;
+    next++;
+    if (next != m.end() &&
+	niter->first + niter->second.first == next->first &&
+	s.can_merge(niter->second.second, next->second.second)) {
+      V n = s.merge(
+	std::move(niter->second.second),
+	std::move(next->second.second));
+      K off = niter->first;
+      K len = next->first + next->second.first - off;
+      next++;
+      m.erase(niter, next);
+      auto p = m.insert(
+	std::make_pair(
+	  off,
+	  std::make_pair(len, std::move(n))));
+      ceph_assert(p.second);
+    }
+  }
+public:
+  // Returns a new interval_map holding the parts of this map that lie
+  // within [off, off + len); values of intervals that stick out on either
+  // side are trimmed with s.split().
+  interval_map intersect(K off, K len) const {
+    interval_map ret;
+    auto limits = get_range(off, len);
+    for (auto i = limits.first; i != limits.second; ++i) {
+      K o = i->first;
+      K l = i->second.first;
+      V v = i->second.second;
+      if (o < off) {
+	// drop the part in front of off
+	V p = v;
+	l -= (off - o);
+	v = s.split(off - o, l, p);
+	o = off;
+      }
+      if ((o + l) > (off + len)) {
+	// drop the part behind off + len
+	V p = v;
+	l -= (o + l) - (off + len);
+	v = s.split(0, l, p);
+      }
+      ret.insert(o, l, v);
+    }
+    return ret;
+  }
+  // Removes all intervals.
+  void clear() {
+    m.clear();
+  }
+  // Removes [off, off + len) from the map.  Intervals that only partly
+  // overlap the range keep their non-overlapping head and/or tail, with
+  // values produced by s.split().  A zero len is a no-op.
+  void erase(K off, K len) {
+    if (len == 0)
+      return;
+    auto range = get_range(off, len);
+    // remainders to re-insert once the overlapping entries are gone
+    std::vector<
+      std::pair<
+	K,
+	std::pair<K, V>
+	>> to_insert;
+    for (auto i = range.first; i != range.second; ++i) {
+      if (i->first < off) {
+	// head of the interval that precedes the erased range
+	to_insert.emplace_back(
+	  std::make_pair(
+	    i->first,
+	    std::make_pair(
+	      off - i->first,
+	      s.split(0, off - i->first, i->second.second))));
+      }
+      if ((off + len) < (i->first + i->second.first)) {
+	// tail of the interval that follows the erased range
+	K nlen = (i->first + i->second.first) - (off + len);
+	to_insert.emplace_back(
+	  std::make_pair(
+	    off + len,
+	    std::make_pair(
+	      nlen,
+	      s.split(i->second.first - nlen, nlen, i->second.second))));
+      }
+    }
+    m.erase(range.first, range.second);
+    m.insert(to_insert.begin(), to_insert.end());
+  }
+  // Stores v for [off, off + len), replacing whatever was mapped there, and
+  // then merges with adjacent intervals where possible.  len must be
+  // non-zero and equal to s.length(v).
+  void insert(K off, K len, V &&v) {
+    ceph_assert(len > 0);
+    ceph_assert(len == s.length(v));
+    erase(off, len);
+    auto p = m.insert(make_pair(off, std::make_pair(len, std::forward<V>(v))));
+    ceph_assert(p.second);
+    try_merge(p.first);
+  }
+  // Moves every interval of `other` into this map; entries are erased from
+  // `other` as they are consumed.
+  void insert(interval_map &&other) {
+    for (auto i = other.m.begin();
+	 i != other.m.end();
+	 other.m.erase(i++)) {
+      insert(i->first, i->second.first, std::move(i->second.second));
+    }
+  }
+  // Copying variant of insert(K, K, V&&).
+  void insert(K off, K len, const V &v) {
+    ceph_assert(len > 0);
+    ceph_assert(len == s.length(v));
+    erase(off, len);
+    auto p = m.insert(make_pair(off, std::make_pair(len, v)));
+    ceph_assert(p.second);
+    try_merge(p.first);
+  }
+  // Copies every interval of `other` into this map.
+  void insert(const interval_map &other) {
+    for (auto &&i: other) {
+      insert(i.get_off(), i.get_len(), i.get_val());
+    }
+  }
+  // True when no interval is stored.
+  bool empty() const {
+    return m.empty();
+  }
+  // Returns the stored intervals (offsets and lengths only) as an
+  // interval_set.
+  interval_set<K> get_interval_set() const {
+    interval_set<K> ret;
+    for (auto &&i: *this) {
+      ret.insert(i.get_off(), i.get_len());
+    }
+    return ret;
+  }
+  // Read-only iterator over the stored intervals.  The current interval is
+  // accessed through get_off(), get_len() and get_val().
+  class const_iterator {
+    cmapiter it;
+    const_iterator(cmapiter &&it) : it(std::move(it)) {}
+    const_iterator(const cmapiter &it) : it(it) {}
+
+    friend class interval_map;
+  public:
+    const_iterator(const const_iterator &) = default;
+    const_iterator &operator=(const const_iterator &) = default;
+
+    const_iterator &operator++() {
+      ++it;
+      return *this;
+    }
+    const_iterator operator++(int) {
+      return const_iterator(it++);
+    }
+    const_iterator &operator--() {
+      --it;
+      return *this;
+    }
+    const_iterator operator--(int) {
+      return const_iterator(it--);
+    }
+    bool operator==(const const_iterator &rhs) const {
+      return it == rhs.it;
+    }
+    bool operator!=(const const_iterator &rhs) const {
+      return it != rhs.it;
+    }
+    // offset at which the current interval starts
+    K get_off() const {
+      return it->first;
+    }
+    // length of the current interval
+    K get_len() const {
+      return it->second.first;
+    }
+    // value mapped to the current interval
+    const V &get_val() const {
+      return it->second.second;
+    }
+    // Dereferencing yields the iterator itself, so a range-based for loop
+    // variable exposes get_off()/get_len()/get_val().
+    const_iterator &operator*() {
+      return *this;
+    }
+  };
+  const_iterator begin() const {
+    return const_iterator(m.begin());
+  }
+  const_iterator end() const {
+    return const_iterator(m.end());
+  }
+  // Returns the result of get_range(off, len) as a pair of const_iterators.
+  std::pair<const_iterator, const_iterator> get_containing_range(
+    K off,
+    K len) const {
+    auto rng = get_range(off, len);
+    return std::make_pair(const_iterator(rng.first), const_iterator(rng.second));
+  }
+  // Number of stored intervals.
+  unsigned ext_count() const {
+    return m.size();
+  }
+  // Two maps are equal when their underlying std::maps compare equal.
+  bool operator==(const interval_map &rhs) const {
+    return m == rhs.m;
+  }
+
+  // Writes the map as "{off~len(vlen),...}" where vlen is s.length() of the
+  // interval's value.
+  std::ostream &print(std::ostream &out) const {
+    bool first = true;
+    out << "{";
+    for (auto &&i: *this) {
+      if (first) {
+	first = false;
+      } else {
+	out << ",";
+      }
+      out << i.get_off() << "~" << i.get_len() << "("
+	  << s.length(i.get_val()) << ")";
+    }
+    return out << "}";
+  }
+};
+
+// Stream insertion for interval_map; delegates to interval_map::print().
+template <typename K, typename V, typename S>
+std::ostream &operator<<(std::ostream &out, const interval_map<K, V, S> &m) {
+  return m.print(out);
+}
+
+#endif
diff --git a/src/common/ipaddr.cc b/src/common/ipaddr.cc
new file mode 100644
index 00000000..0abf7f20
--- /dev/null
+++ b/src/common/ipaddr.cc
@@ -0,0 +1,242 @@
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <stdlib.h>
+#include <string.h>
+#include <boost/algorithm/string/predicate.hpp>
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#endif
+
+#include "include/ipaddr.h"
+#include "msg/msg_types.h"
+#include "common/pick_address.h"
+
+// Store in *out the address *addr with everything but its leading
+// prefix_len bits cleared.  A prefix_len of 32 or more keeps the whole
+// address.
+void netmask_ipv4(const struct in_addr *addr,
+                  unsigned int prefix_len,
+                  struct in_addr *out) {
+  // 32 is handled by the first arm as well, because >>32 is not defined by
+  // the C standards
+  const uint32_t mask = (prefix_len >= 32)
+    ? ~uint32_t(0)
+    : htonl(~(~uint32_t(0) >> prefix_len));
+  out->s_addr = addr->s_addr & mask;
+}
+
+
+// Tell whether the interface if_name sits on NUMA node numa_node, as
+// reported by get_iface_numa_node().  A failing lookup counts as a
+// mismatch.  With WITH_SEASTAR defined every interface matches.
+static bool match_numa_node(const string& if_name, int numa_node)
+{
+#ifdef WITH_SEASTAR
+  return true;
+#else
+  int iface_node = -1;
+  if (get_iface_numa_node(if_name, &iface_node) < 0) {
+    return false;
+  }
+  return iface_node == numa_node;
+#endif
+}
+
+// Walk the interface list `addrs` and return the first entry whose IPv4
+// address lies in the network net/prefix_len, or NULL when there is none.
+// Skipped are: entries without an address, entries named "lo" or starting
+// with "lo:", entries rejected by match_numa_node() (only consulted when
+// numa_node >= 0) and entries whose address family differs from net's.
+const struct ifaddrs *find_ipv4_in_subnet(const struct ifaddrs *addrs,
+                                          const struct sockaddr_in *net,
+                                          unsigned int prefix_len,
+                                          int numa_node) {
+  struct in_addr wanted_net;
+  netmask_ipv4(&net->sin_addr, prefix_len, &wanted_net);
+
+  for (const struct ifaddrs *ifa = addrs; ifa != NULL; ifa = ifa->ifa_next) {
+    // the checks short-circuit in this order
+    const bool skip =
+      ifa->ifa_addr == NULL ||
+      strcmp(ifa->ifa_name, "lo") == 0 ||
+      boost::starts_with(ifa->ifa_name, "lo:") ||
+      (numa_node >= 0 && !match_numa_node(ifa->ifa_name, numa_node)) ||
+      ifa->ifa_addr->sa_family != net->sin_family;
+    if (skip)
+      continue;
+
+    struct in_addr candidate_net;
+    struct in_addr *candidate = &((struct sockaddr_in*)ifa->ifa_addr)->sin_addr;
+    netmask_ipv4(candidate, prefix_len, &candidate_net);
+    if (candidate_net.s_addr == wanted_net.s_addr)
+      return ifa;
+  }
+
+  return NULL;
+}
+
+
+// Store in *out the address *addr with everything but its leading
+// prefix_len bits cleared.  A prefix_len above 128 is treated as 128.
+void netmask_ipv6(const struct in6_addr *addr,
+                  unsigned int prefix_len,
+                  struct in6_addr *out) {
+  const unsigned int bits = (prefix_len > 128) ? 128 : prefix_len;
+  const unsigned int whole_bytes = bits / 8;
+
+  // bytes that lie completely inside the prefix are copied as they are
+  memcpy(out->s6_addr, addr->s6_addr, whole_bytes);
+  // the byte containing the prefix boundary keeps only its leading bits
+  if (bits < 128)
+    out->s6_addr[whole_bytes] = addr->s6_addr[whole_bytes] & ~( 0xFF >> (bits % 8) );
+  // everything after that byte is zeroed
+  if (whole_bytes < 15)
+    memset(out->s6_addr + whole_bytes + 1, 0, 16 - whole_bytes - 1);
+}
+
+
+// Walk the interface list `addrs` and return the first entry whose IPv6
+// address lies in the network net/prefix_len, or NULL when there is none.
+// Skipped are: entries without an address, entries named "lo" or starting
+// with "lo:", entries rejected by match_numa_node() (only consulted when
+// numa_node >= 0), entries whose address family differs from net's, and
+// link-local addresses.
+const struct ifaddrs *find_ipv6_in_subnet(const struct ifaddrs *addrs,
+                                          const struct sockaddr_in6 *net,
+                                          unsigned int prefix_len,
+                                          int numa_node) {
+  struct in6_addr wanted_net;
+  netmask_ipv6(&net->sin6_addr, prefix_len, &wanted_net);
+
+  for (const struct ifaddrs *ifa = addrs; ifa != NULL; ifa = ifa->ifa_next) {
+    // the checks short-circuit in this order
+    const bool skip =
+      ifa->ifa_addr == NULL ||
+      strcmp(ifa->ifa_name, "lo") == 0 ||
+      boost::starts_with(ifa->ifa_name, "lo:") ||
+      (numa_node >= 0 && !match_numa_node(ifa->ifa_name, numa_node)) ||
+      ifa->ifa_addr->sa_family != net->sin6_family;
+    if (skip)
+      continue;
+
+    struct in6_addr *candidate = &((struct sockaddr_in6*)ifa->ifa_addr)->sin6_addr;
+    if (IN6_IS_ADDR_LINKLOCAL(candidate))
+      continue;
+
+    struct in6_addr candidate_net;
+    netmask_ipv6(candidate, prefix_len, &candidate_net);
+    if (IN6_ARE_ADDR_EQUAL(&candidate_net, &wanted_net))
+      return ifa;
+  }
+
+  return NULL;
+}
+
+
+// Dispatch on the address family of `net` to find_ipv4_in_subnet() or
+// find_ipv6_in_subnet().  Any other family yields NULL.
+const struct ifaddrs *find_ip_in_subnet(const struct ifaddrs *addrs,
+                                        const struct sockaddr *net,
+                                        unsigned int prefix_len,
+                                        int numa_node) {
+  if (net->sa_family == AF_INET) {
+    return find_ipv4_in_subnet(addrs, (struct sockaddr_in*)net, prefix_len,
+                               numa_node);
+  }
+  if (net->sa_family == AF_INET6) {
+    return find_ipv6_in_subnet(addrs, (struct sockaddr_in6*)net, prefix_len,
+                               numa_node);
+  }
+  return NULL;
+}
+
+
+/**
+ * Parse a network given as "<address>/<prefix length>".
+ *
+ * The address part is tried as IPv4 first and as IPv6 second.
+ *
+ * @param s          NUL-terminated input text
+ * @param network    receives the address; it is zeroed first, so that only
+ *                   the family and the address bytes are set
+ * @param prefix_len receives the prefix length; it is written as soon as
+ *                   the number itself has been accepted, even if the
+ *                   address part then fails to parse
+ * @return true on success; false if there is no '/', nothing or junk
+ *         follows it, the number is negative or does not fit into an
+ *         unsigned int, or the address is neither valid IPv4 nor IPv6
+ */
+bool parse_network(const char *s, struct sockaddr_storage *network, unsigned int *prefix_len) {
+  const char *slash = strchr(s, '/');
+  if (!slash) {
+    // no slash
+    return false;
+  }
+  if (*(slash+1) == '\0') {
+    // slash is the last character
+    return false;
+  }
+
+  char *end;
+  long int num = strtol(slash+1, &end, 10);
+  if (*end != '\0') {
+    // junk after the prefix_len
+    return false;
+  }
+  if (num < 0) {
+    return false;
+  }
+  if (static_cast<long int>(static_cast<unsigned int>(num)) != num) {
+    // would be silently truncated by the conversion to unsigned int
+    return false;
+  }
+  *prefix_len = static_cast<unsigned int>(num);
+
+  // copy the part before the slash to get a NUL-terminated address string
+  const std::string addr(s, slash - s);
+
+  // caller expects ports etc to be zero
+  memset(network, 0, sizeof(*network));
+
+  // inet_pton() returns 1 on success, 0 for an unparsable string and -1
+  // for an unsupported address family; only 1 means the address was stored
+
+  // try parsing as ipv4
+  if (inet_pton(AF_INET, addr.c_str(),
+                &((struct sockaddr_in*)network)->sin_addr) == 1) {
+    network->ss_family = AF_INET;
+    return true;
+  }
+
+  // try parsing as ipv6
+  if (inet_pton(AF_INET6, addr.c_str(),
+                &((struct sockaddr_in6*)network)->sin6_addr) == 1) {
+    network->ss_family = AF_INET6;
+    return true;
+  }
+
+  return false;
+}
+
+// entity_addr_t flavour of parse_network(): parses `s` with the
+// sockaddr_storage overload and, on success, stores the result in *network
+// as an address of type TYPE_LEGACY.  *network is left alone on failure.
+bool parse_network(const char *s,
+                   entity_addr_t *network,
+                   unsigned int *prefix_len)
+{
+  sockaddr_storage parsed;
+  if (!parse_network(s, &parsed, prefix_len)) {
+    return false;
+  }
+  network->set_type(entity_addr_t::TYPE_LEGACY);
+  network->set_sockaddr((sockaddr *)&parsed);
+  return true;
+}
+
+// Tell whether `addr` lies inside network/prefix_len: both addresses must
+// be of the same family (AF_INET or AF_INET6) and must be equal once
+// masked down to prefix_len bits.  Any other family yields false.
+bool network_contains(
+  const struct entity_addr_t& network,
+  unsigned int prefix_len,
+  const struct entity_addr_t& addr)
+{
+  if (addr.get_family() != network.get_family()) {
+    return false;
+  }
+
+  const auto family = network.get_family();
+  if (family == AF_INET) {
+    struct in_addr masked_net, masked_addr;
+    netmask_ipv4(
+      &((const sockaddr_in*)network.get_sockaddr())->sin_addr, prefix_len,
+      &masked_net);
+    netmask_ipv4(
+      &((const sockaddr_in*)addr.get_sockaddr())->sin_addr, prefix_len,
+      &masked_addr);
+    return memcmp(&masked_net, &masked_addr, sizeof(masked_net)) == 0;
+  }
+  if (family == AF_INET6) {
+    struct in6_addr masked_net, masked_addr;
+    netmask_ipv6(
+      &((const sockaddr_in6*)network.get_sockaddr())->sin6_addr, prefix_len,
+      &masked_net);
+    netmask_ipv6(
+      &((const sockaddr_in6*)addr.get_sockaddr())->sin6_addr, prefix_len,
+      &masked_addr);
+    return memcmp(&masked_net, &masked_addr, sizeof(masked_net)) == 0;
+  }
+  return false;
+}
diff --git a/src/common/iso_8601.cc b/src/common/iso_8601.cc
new file mode 100644
index 00000000..29cfd4b4
--- /dev/null
+++ b/src/common/iso_8601.cc
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <iomanip>
+#include <sstream>
+
+#include "iso_8601.h"
+#include "include/timegm.h"
+
+namespace ceph {
+using std::chrono::duration_cast;
+using std::chrono::nanoseconds;
+using std::chrono::seconds;
+using std::setw;
+using std::size_t;
+using std::stringstream;
+using std::string;
+using std::uint16_t;
+
+using boost::none;
+using boost::optional;
+using boost::string_ref;
+
+using ceph::real_clock;
+using ceph::real_time;
+
+using sriter = string_ref::const_iterator;
+
+namespace {
+// Convert the character c ('0'..'9') to its numeric value; throws
+// std::invalid_argument for any other character.
+// This assumes a contiguous block of numbers in the correct order.
+uint16_t digit(char c) {
+  if (c < '0' || c > '9') {
+    throw std::invalid_argument("Not a digit.");
+  }
+  return static_cast<uint16_t>(c - '0');
+}
+
+// Convert the broken-down UTC time t plus n nanoseconds (n must be below
+// one second) into a real_time.  Yields none when internal_timegm()
+// reports -1.
+optional<real_time> calculate(const tm& t, uint32_t n = 0) {
+  ceph_assert(n < 1000000000);
+  const time_t secs = internal_timegm(&t);
+  if (secs != static_cast<time_t>(-1)) {
+    return boost::make_optional<real_time>(real_clock::from_time_t(secs)
+                                           + nanoseconds(n));
+  }
+  return none;
+}
+}
+
+/**
+ * Parse a UTC timestamp of one of the forms
+ *
+ *   YYYY, YYYY-MM, YYYY-MM-DD,
+ *   YYYY-MM-DDThhZ, YYYY-MM-DDThh:mmZ, YYYY-MM-DDThh:mm:ssZ,
+ *   YYYY-MM-DDThh:mm:ss.fZ   (f being one to nine digits)
+ *
+ * Fields that are left out keep the defaults of 1970-01-01T00:00:00.
+ *
+ * @param s             the text to parse
+ * @param ws_terminates when true, a whitespace character ends the
+ *                      timestamp just like the end of the input does
+ * @return the parsed time, or boost::none if the text does not have one
+ *         of the forms above, the year is before 1970, or the conversion
+ *         done by calculate() fails
+ */
+optional<real_time> from_iso_8601(const string_ref s,
+                                  const bool ws_terminates) noexcept {
+  auto end = s.cend();
+  // the <cctype> functions are only defined for values representable as
+  // unsigned char (and EOF), so a plain char must be converted first
+  auto is_space = [](char ch) {
+    return std::isspace(static_cast<unsigned char>(ch)) != 0;
+  };
+  // consume exactly one digit
+  auto read_digit = [end](sriter& c) {
+    if (c == end) {
+      throw std::invalid_argument("End of input.");
+    }
+    auto f = digit(*c);
+    ++c;
+    return f;
+  };
+
+  // consume exactly n digits and return their decimal value
+  auto read_digits = [&read_digit](sriter& c, std::size_t n) {
+    auto v = 0ULL;
+    for (auto i = 0U; i < n; ++i) {
+      auto d = read_digit(c);
+      v = (10ULL * v) + d;
+    }
+    return v;
+  };
+  // a date without a time part ends at the end of input (or at whitespace)
+  auto partial_date = [end, ws_terminates, &is_space](sriter& c) {
+    return (c == end || (ws_terminates && is_space(*c)));
+  };
+  // a time ends with 'Z' followed by the end of input (or by whitespace)
+  auto time_end = [end, ws_terminates, &is_space](sriter& c) {
+    return (c != end && *c == 'Z' &&
+            ((c + 1) == end ||
+             (ws_terminates && is_space(*(c + 1)))));
+  };
+  auto consume_delimiter = [end](sriter& c, char q) {
+    if (c == end || *c != q) {
+      throw std::invalid_argument("Expected delimiter not found.");
+    } else {
+      ++c;
+    }
+  };
+
+  tm t = { 0, // tm_sec
+           0, // tm_min
+           0, // tm_hour
+           1, // tm_mday
+           0, // tm_mon
+           70, // tm_year
+           0, // tm_wday
+           0, // tm_yday
+           0, // tm_isdst
+  };
+  try {
+    auto c = s.cbegin();
+    {
+      auto y = read_digits(c, 4);
+      if (y < 1970) {
+        return none;
+      }
+      t.tm_year = y - 1900;
+    }
+    if (partial_date(c)) {
+      return calculate(t, 0);
+    }
+
+    consume_delimiter(c, '-');
+    t.tm_mon = (read_digits(c, 2) - 1);
+    if (partial_date(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, '-');
+    t.tm_mday = read_digits(c, 2);
+    if (partial_date(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, 'T');
+    t.tm_hour = read_digits(c, 2);
+    if (time_end(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, ':');
+    t.tm_min = read_digits(c, 2);
+    if (time_end(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, ':');
+    t.tm_sec = read_digits(c, 2);
+    if (time_end(c)) {
+      return calculate(t);
+    }
+    consume_delimiter(c, '.');
+
+    // fraction: the first digit counts 100ms, each following one a tenth
+    // of the previous; at most nine digits are accepted
+    auto n = 0UL;
+    auto multiplier = 100000000UL;
+    for (auto i = 0U; i < 9U; ++i) {
+      auto d = read_digit(c);
+      n += d * multiplier;
+      multiplier /= 10;
+      if (time_end(c)) {
+        return calculate(t, n);
+      }
+    }
+  } catch (const std::invalid_argument&) {
+    // malformed input; reported as none below
+  }
+  return none;
+}
+
+// Format t as a UTC timestamp.  f selects the last field that is written:
+// Y gives "YYYY", YM "YYYY-MM", YMD "YYYY-MM-DD", and every format that
+// includes a time part appends the fields up to and including its last one
+// followed by 'Z' (e.g. YMDhmsn: "YYYY-MM-DDThh:mm:ss.nnnnnnnnnZ").
+// Months, days and time fields are zero padded to two digits, the
+// nanoseconds to nine.
+string to_iso_8601(const real_time t,
+                   const iso_8601_format f) noexcept {
+  ceph_assert(f >= iso_8601_format::Y &&
+              f <= iso_8601_format::YMDhmsn);
+  stringstream out(std::ios_base::out);
+
+  auto sec = real_clock::to_time_t(t);
+  auto nsec = duration_cast<nanoseconds>(t.time_since_epoch() %
+                                         seconds(1)).count();
+
+  struct tm bt;
+  gmtime_r(&sec, &bt);
+  out.fill('0');
+
+  // the enumerators are ordered from coarsest to finest, so each field is
+  // emitted for its own format and for every finer one
+  out << 1900 + bt.tm_year;
+  if (f >= iso_8601_format::YM) {
+    out << '-' << setw(2) << bt.tm_mon + 1;
+  }
+  if (f >= iso_8601_format::YMD) {
+    out << '-' << setw(2) << bt.tm_mday;
+  }
+  if (f >= iso_8601_format::YMDh) {
+    out << 'T' << setw(2) << bt.tm_hour;
+  }
+  if (f >= iso_8601_format::YMDhm) {
+    out << ':' << setw(2) << bt.tm_min;
+  }
+  if (f >= iso_8601_format::YMDhms) {
+    out << ':' << setw(2) << bt.tm_sec;
+  }
+  if (f >= iso_8601_format::YMDhmsn) {
+    out << '.' << setw(9) << nsec;
+  }
+  // only outputs with a time part carry the UTC designator
+  if (f >= iso_8601_format::YMDh) {
+    out << 'Z';
+  }
+  return out.str();
+}
+}
diff --git a/src/common/iso_8601.h b/src/common/iso_8601.h
new file mode 100644
index 00000000..5aa63983
--- /dev/null
+++ b/src/common/iso_8601.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_ISO_8601_H
+#define CEPH_COMMON_ISO_8601_H
+
+#include <boost/optional.hpp>
+#include <boost/utility/string_ref.hpp>
+
+#include "common/ceph_time.h"
+
+namespace ceph {
+
+// Here, we support the W3C profile of ISO 8601 with the following
+// restrictions:
+// -   Subsecond resolution is supported to nanosecond
+//     granularity. Any number of digits between 1 and 9 may be
+//     specified after the decimal point.
+// -   All times must be UTC.
+// -   All times must be representable as a sixty-four bit count of
+//     nanoseconds since the epoch.
+// -   Partial times are handled thus:
+//     *    If there are no subseconds, they are assumed to be zero.
+//     *    If there are no seconds, they are assumed to be zero.
+//     *    If there are no minutes, they are assumed to be zero.
+//     *    If there is no time, it is assumed to be midnight.
+//     *    If there is no day, it is assumed to be the first.
+//     *    If there is no month, it is assumed to be January.
+//
+// If a date is invalid, boost::none is returned.
+
+// Parses s as a timestamp.  ws_terminates, on by default, selects whether
+// a whitespace character ends the timestamp the same way the end of the
+// input does.
+boost::optional<ceph::real_time> from_iso_8601(
+  boost::string_ref s, const bool ws_terminates = true) noexcept;
+
+// Output precision for to_iso_8601(): each enumerator names the fields
+// that are written (Y = year, M = month, D = day, h = hour, m = minute,
+// s = second, n = nanoseconds).
+enum class iso_8601_format {
+  Y, YM, YMD, YMDh, YMDhm, YMDhms, YMDhmsn
+};
+
+// Formats t as a timestamp with the fields selected by f; all fields down
+// to nanoseconds are written by default.
+std::string to_iso_8601(const ceph::real_time t,
+			const iso_8601_format f = iso_8601_format::YMDhmsn)
+  noexcept;
+}
+
+#endif
diff --git a/src/common/item_history.h b/src/common/item_history.h
new file mode 100644
index 00000000..351d5ba7
--- /dev/null
+++ b/src/common/item_history.h
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <list>
+#include <mutex>
+
+/*
+
+Keep a history of item values so that readers can dereference the pointer to
+the latest value and continue using it as long as they want.  This container
+is only appropriate for values that are updated a handful of times over their
+total lifetime.
+
+There is a prune() method to throw out old values, but it should only be used
+if the caller has some way of knowing all readers are done.
+
+*/
+
+// Holder for a value of type T that keeps every value ever assigned in a
+// list, so that pointers and references to older values stay usable until
+// prune() is called.  Unlike safe_item_history it also hands out non-const
+// access to the current value.
+template<class T>
+class mutable_item_history {
+private:
+  std::mutex lock;          // serializes operator= and prune()
+  std::list<T> history;     // all values, oldest first
+  T *current = nullptr;     // points at the newest element of history
+
+public:
+  // starts out with a single value-initialized T
+  mutable_item_history() {
+    history.push_back(T());
+    current = &history.back();
+  }
+
+  // readers are lock-free
+  const T& operator*() const { return *current; }
+  const T *operator->() const { return current; }
+
+  // non-const variants (be careful!)
+  T& operator*() { return *current; }
+  T *operator->() { return current; }
+
+  // writes are serialized: appends a copy of `other` and makes it current
+  const T& operator=(const T& other) {
+    std::lock_guard<std::mutex> guard(lock);
+    current = &history.emplace_back(other);
+    return *current;
+  }
+
+  // drops every value except the newest one
+  void prune() {
+    // note: this is not necessarily thread-safe wrt readers
+    std::lock_guard<std::mutex> guard(lock);
+    for (auto remaining = history.size(); remaining > 1; --remaining) {
+      history.pop_front();
+    }
+  }
+};
+
+// Holder for a value of type T that keeps every value ever assigned in a
+// list, so that pointers and references to older values stay usable until
+// prune() is called.  The current value is exposed read-only.
+template<class T>
+class safe_item_history {
+private:
+  std::mutex lock;          // serializes operator= and prune()
+  std::list<T> history;     // all values, oldest first
+  T *current = nullptr;     // points at the newest element of history
+
+public:
+  // starts out with a single value-initialized T
+  safe_item_history() {
+    history.push_back(T());
+    current = &history.back();
+  }
+
+  // readers are lock-free
+  const T& operator*() const { return *current; }
+  const T *operator->() const { return current; }
+
+  // writes are serialized: appends a copy of `other` and makes it current
+  const T& operator=(const T& other) {
+    std::lock_guard<std::mutex> guard(lock);
+    current = &history.emplace_back(other);
+    return *current;
+  }
+
+  // drops every value except the newest one
+  void prune() {
+    // note: this is not necessarily thread-safe wrt readers
+    std::lock_guard<std::mutex> guard(lock);
+    for (auto remaining = history.size(); remaining > 1; --remaining) {
+      history.pop_front();
+    }
+  }
+};
diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
new file mode 100644
index 00000000..1ff14bd6
--- /dev/null
+++ b/src/common/legacy_config_opts.h
@@ -0,0 +1,1588 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/* note: no header guard */
+OPTION(host, OPT_STR) // "" means that ceph will use short hostname
+OPTION(public_addr, OPT_ADDR)
+OPTION(public_bind_addr, OPT_ADDR)
+OPTION(cluster_addr, OPT_ADDR)
+OPTION(public_network, OPT_STR)
+OPTION(cluster_network, OPT_STR)
+OPTION(lockdep, OPT_BOOL)
+OPTION(lockdep_force_backtrace, OPT_BOOL) // always gather current backtrace at every lock
+OPTION(run_dir, OPT_STR)       // the "/var/run/ceph" dir, created on daemon startup
+OPTION(admin_socket, OPT_STR) // default changed by common_preinit()
+OPTION(admin_socket_mode, OPT_STR) // permission bits to set for admin socket file, e.g., "0775", "0755"
+
+OPTION(daemonize, OPT_BOOL) // default changed by common_preinit()
+OPTION(setuser, OPT_STR)        // uid or user name
+OPTION(setgroup, OPT_STR)        // gid or group name
+OPTION(setuser_match_path, OPT_STR)  // make setuser/group conditional on this path matching ownership
+OPTION(pid_file, OPT_STR) // default changed by common_preinit()
+OPTION(chdir, OPT_STR)
+OPTION(restapi_log_level, OPT_STR) 	// default set by Python code
+OPTION(restapi_base_url, OPT_STR)	// "
+OPTION(fatal_signal_handlers, OPT_BOOL)
+OPTION(crash_dir, OPT_STR)
+SAFE_OPTION(erasure_code_dir, OPT_STR) // default location for erasure-code plugins
+
+OPTION(log_file, OPT_STR) // default changed by common_preinit()
+OPTION(log_max_new, OPT_INT) // default changed by common_preinit()
+OPTION(log_max_recent, OPT_INT) // default changed by common_preinit()
+OPTION(log_to_file, OPT_BOOL)
+OPTION(log_to_stderr, OPT_BOOL) // default changed by common_preinit()
+OPTION(err_to_stderr, OPT_BOOL) // default changed by common_preinit()
+OPTION(log_to_syslog, OPT_BOOL)
+OPTION(err_to_syslog, OPT_BOOL)
+OPTION(log_flush_on_exit, OPT_BOOL) // default changed by common_preinit()
+OPTION(log_stop_at_utilization, OPT_FLOAT)  // stop logging at (near) full
+OPTION(log_to_graylog, OPT_BOOL)
+OPTION(err_to_graylog, OPT_BOOL)
+OPTION(log_graylog_host, OPT_STR)
+OPTION(log_graylog_port, OPT_INT)
+
+// options will take k/v pairs, or single-item that will be assumed as general
+// default for all, regardless of channel.
+// e.g., "info" would be taken as the same as "default=info"
+// also, "default=daemon audit=local0" would mean
+//    "default all to 'daemon', override 'audit' with 'local0'"
+OPTION(clog_to_monitors, OPT_STR)
+OPTION(clog_to_syslog, OPT_STR)
+OPTION(clog_to_syslog_level, OPT_STR) // this level and above
+OPTION(clog_to_syslog_facility, OPT_STR)
+OPTION(clog_to_graylog, OPT_STR)
+OPTION(clog_to_graylog_host, OPT_STR)
+OPTION(clog_to_graylog_port, OPT_STR)
+
+OPTION(mon_cluster_log_to_syslog, OPT_STR)
+OPTION(mon_cluster_log_to_syslog_level, OPT_STR)   // this level and above
+OPTION(mon_cluster_log_to_syslog_facility, OPT_STR)
+OPTION(mon_cluster_log_to_file, OPT_BOOL)
+OPTION(mon_cluster_log_file, OPT_STR)
+OPTION(mon_cluster_log_file_level, OPT_STR)
+OPTION(mon_cluster_log_to_graylog, OPT_STR)
+OPTION(mon_cluster_log_to_graylog_host, OPT_STR)
+OPTION(mon_cluster_log_to_graylog_port, OPT_STR)
+
+OPTION(enable_experimental_unrecoverable_data_corrupting_features, OPT_STR)
+
+SAFE_OPTION(plugin_dir, OPT_STR)
+
+OPTION(xio_trace_mempool, OPT_BOOL) // mempool allocation counters
+OPTION(xio_trace_msgcnt, OPT_BOOL) // incoming/outgoing msg counters
+OPTION(xio_trace_xcon, OPT_BOOL) // Xio message encode/decode trace
+OPTION(xio_queue_depth, OPT_INT) // depth of Accelio msg queue
+OPTION(xio_mp_min, OPT_INT) // default min mempool size
+OPTION(xio_mp_max_64, OPT_INT) // max 64-byte chunks (buffer is 40)
+OPTION(xio_mp_max_256, OPT_INT) // max 256-byte chunks
+OPTION(xio_mp_max_1k, OPT_INT) // max 1K chunks
+OPTION(xio_mp_max_page, OPT_INT) // max page-size chunks
+OPTION(xio_mp_max_hint, OPT_INT) // max size-hint chunks
+OPTION(xio_portal_threads, OPT_INT) // xio portal threads per messenger
+OPTION(xio_max_conns_per_portal, OPT_INT) // max xio_connections per portal/ctx
+OPTION(xio_transport_type, OPT_STR) // xio transport type: {rdma or tcp}
+OPTION(xio_max_send_inline, OPT_INT) // xio maximum threshold to send inline
+
+OPTION(compressor_zlib_isal, OPT_BOOL)
+OPTION(compressor_zlib_level, OPT_INT) //regular zlib compression level, not applicable to isa-l optimized version
+OPTION(compressor_zstd_level, OPT_INT) //regular zstd compression level
+
+OPTION(qat_compressor_enabled, OPT_BOOL)
+
+OPTION(plugin_crypto_accelerator, OPT_STR)
+
+OPTION(mempool_debug, OPT_BOOL)
+
+
+
+OPTION(key, OPT_STR)
+OPTION(keyfile, OPT_STR)
+OPTION(keyring, OPT_STR)
+OPTION(heartbeat_interval, OPT_INT)
+OPTION(heartbeat_file, OPT_STR)
+OPTION(heartbeat_inject_failure, OPT_INT)    // force an unhealthy heartbeat for N seconds
+OPTION(perf, OPT_BOOL)       // enable internal perf counters
+
+SAFE_OPTION(ms_type, OPT_STR)   // messenger backend. It will be modified in runtime, so use SAFE_OPTION
+OPTION(ms_public_type, OPT_STR)   // messenger backend
+OPTION(ms_cluster_type, OPT_STR)   // messenger backend
+OPTION(ms_learn_addr_from_peer, OPT_BOOL)
+OPTION(ms_tcp_nodelay, OPT_BOOL)
+OPTION(ms_tcp_rcvbuf, OPT_INT)
+OPTION(ms_tcp_prefetch_max_size, OPT_U32) // max prefetch size, we limit this to avoid extra memcpy
+OPTION(ms_initial_backoff, OPT_DOUBLE)
+OPTION(ms_max_backoff, OPT_DOUBLE)
+OPTION(ms_crc_data, OPT_BOOL)
+OPTION(ms_crc_header, OPT_BOOL)
+OPTION(ms_die_on_bad_msg, OPT_BOOL)
+OPTION(ms_die_on_unhandled_msg, OPT_BOOL)
+OPTION(ms_die_on_old_message, OPT_BOOL)     // assert if we get a dup incoming message and shouldn't have (may be triggered by pre-541cd3c64be0dfa04e8a2df39422e0eb9541a428 code)
+OPTION(ms_die_on_skipped_message, OPT_BOOL)  // assert if we skip a seq (kernel client does this intentionally)
+OPTION(ms_die_on_bug, OPT_BOOL)
+OPTION(ms_dispatch_throttle_bytes, OPT_U64)
+OPTION(ms_bind_ipv6, OPT_BOOL)
+OPTION(ms_bind_port_min, OPT_INT)
+OPTION(ms_bind_port_max, OPT_INT)
+OPTION(ms_bind_retry_count, OPT_INT) // If binding fails, how many times do we retry to bind
+OPTION(ms_bind_retry_delay, OPT_INT) // Delay between attempts to bind
+OPTION(ms_bind_before_connect, OPT_BOOL)
+OPTION(ms_tcp_listen_backlog, OPT_INT)
+OPTION(ms_rwthread_stack_bytes, OPT_U64)
+OPTION(ms_connection_ready_timeout, OPT_U64)
+OPTION(ms_connection_idle_timeout, OPT_U64)
+OPTION(ms_pq_max_tokens_per_priority, OPT_U64)
+OPTION(ms_pq_min_cost, OPT_U64)
+OPTION(ms_inject_socket_failures, OPT_U64)
+SAFE_OPTION(ms_inject_delay_type, OPT_STR)          // "osd mds mon client" allowed
+OPTION(ms_inject_delay_msg_type, OPT_STR)      // the type of message to delay. This is an additional restriction on the general type filter ms_inject_delay_type.
+OPTION(ms_inject_delay_max, OPT_DOUBLE)         // seconds
+OPTION(ms_inject_delay_probability, OPT_DOUBLE) // range [0, 1]
+OPTION(ms_inject_internal_delays, OPT_DOUBLE)   // seconds
+OPTION(ms_dump_on_send, OPT_BOOL)           // hexdump msg to log on send
+OPTION(ms_dump_corrupt_message_level, OPT_INT)  // debug level to hexdump undecodeable messages at
+OPTION(ms_async_op_threads, OPT_U64)            // number of worker processing threads for async messenger created on init
+OPTION(ms_async_max_op_threads, OPT_U64)        // max number of worker processing threads for async messenger
+OPTION(ms_async_rdma_device_name, OPT_STR)
+OPTION(ms_async_rdma_enable_hugepage, OPT_BOOL)
+OPTION(ms_async_rdma_buffer_size, OPT_INT)
+OPTION(ms_async_rdma_send_buffers, OPT_U32)
+//size of the receive buffer pool, 0 is unlimited
+OPTION(ms_async_rdma_receive_buffers, OPT_U32)
+// max number of wr in srq
+OPTION(ms_async_rdma_receive_queue_len, OPT_U32)
+// support srq
+OPTION(ms_async_rdma_support_srq, OPT_BOOL)
+OPTION(ms_async_rdma_port_num, OPT_U32)
+OPTION(ms_async_rdma_polling_us, OPT_U32)
+OPTION(ms_async_rdma_local_gid, OPT_STR)       // GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding
+OPTION(ms_async_rdma_roce_ver, OPT_INT)         // 0=RoCEv1, 1=RoCEv2, 2=RoCEv1.5
+OPTION(ms_async_rdma_sl, OPT_INT)               // in RoCE, this means PCP
+OPTION(ms_async_rdma_dscp, OPT_INT)            // in RoCE, this means DSCP
+
+// rdma connection management
+OPTION(ms_async_rdma_cm, OPT_BOOL)
+OPTION(ms_async_rdma_type, OPT_STR)
+
+// when there are enough accept failures, indicating there are unrecoverable failures,
+// just do ceph_abort() . Here we make it configurable.
+OPTION(ms_max_accept_failures, OPT_INT)
+
+OPTION(ms_dpdk_port_id, OPT_INT)
+SAFE_OPTION(ms_dpdk_coremask, OPT_STR)        // modified in unit tests, hence declared with SAFE_OPTION
+OPTION(ms_dpdk_memory_channel, OPT_STR)
+OPTION(ms_dpdk_hugepages, OPT_STR)
+OPTION(ms_dpdk_pmd, OPT_STR)
+SAFE_OPTION(ms_dpdk_host_ipv4_addr, OPT_STR)
+SAFE_OPTION(ms_dpdk_gateway_ipv4_addr, OPT_STR)
+SAFE_OPTION(ms_dpdk_netmask_ipv4_addr, OPT_STR)
+OPTION(ms_dpdk_lro, OPT_BOOL)
+OPTION(ms_dpdk_hw_flow_control, OPT_BOOL)
+// Weighting of a hardware network queue relative to a software queue (0=no work, 1=equal share)
+OPTION(ms_dpdk_hw_queue_weight, OPT_FLOAT)
+OPTION(ms_dpdk_debug_allow_loopback, OPT_BOOL)
+OPTION(ms_dpdk_rx_buffer_count_per_core, OPT_INT)
+
+OPTION(inject_early_sigterm, OPT_BOOL)
+
+OPTION(mon_data, OPT_STR)
+OPTION(mon_initial_members, OPT_STR)    // list of initial cluster mon ids; if specified, need majority to form initial quorum and create new cluster
+OPTION(mon_compact_on_start, OPT_BOOL)  // compact leveldb on ceph-mon start
+OPTION(mon_compact_on_bootstrap, OPT_BOOL)  // trigger leveldb compaction on bootstrap
+OPTION(mon_compact_on_trim, OPT_BOOL)       // compact (a prefix) when we trim old states
+OPTION(mon_osd_cache_size, OPT_INT)  // the size of osdmaps cache, not to rely on underlying store's cache
+
+OPTION(mon_osd_cache_size_min, OPT_U64) // minimum amount of memory to cache osdmaps
+OPTION(mon_memory_target, OPT_U64) // amount of mapped memory for osdmaps
+OPTION(mon_memory_autotune, OPT_BOOL) // autotune cache memory for osdmap
+OPTION(mon_cpu_threads, OPT_INT)
+OPTION(mon_osd_mapping_pgs_per_chunk, OPT_INT)
+OPTION(mon_clean_pg_upmaps_per_chunk, OPT_INT)
+OPTION(mon_osd_max_creating_pgs, OPT_INT)
+OPTION(mon_tick_interval, OPT_INT)
+OPTION(mon_session_timeout, OPT_INT)    // must send keepalive or subscribe
+OPTION(mon_subscribe_interval, OPT_DOUBLE)  // for legacy clients only
+OPTION(mon_delta_reset_interval, OPT_DOUBLE)   // seconds of inactivity before we reset the pg delta to 0
+OPTION(mon_osd_laggy_halflife, OPT_INT)        // (seconds) how quickly our laggy estimations decay
+OPTION(mon_osd_laggy_weight, OPT_DOUBLE)          // weight for new 'samples's in laggy estimations
+OPTION(mon_osd_laggy_max_interval, OPT_INT)      // maximum value of laggy_interval in laggy estimations
+OPTION(mon_osd_adjust_heartbeat_grace, OPT_BOOL)    // true if we should scale based on laggy estimations
+OPTION(mon_osd_adjust_down_out_interval, OPT_BOOL)  // true if we should scale based on laggy estimations
+OPTION(mon_osd_auto_mark_in, OPT_BOOL)         // mark any booting osds 'in'
+OPTION(mon_osd_auto_mark_auto_out_in, OPT_BOOL) // mark booting auto-marked-out osds 'in'
+OPTION(mon_osd_auto_mark_new_in, OPT_BOOL)      // mark booting new osds 'in'
+OPTION(mon_osd_destroyed_out_interval, OPT_INT) // seconds
+OPTION(mon_osd_down_out_interval, OPT_INT) // seconds
+OPTION(mon_osd_min_up_ratio, OPT_DOUBLE)    // min osds required to be up to mark things down
+OPTION(mon_osd_min_in_ratio, OPT_DOUBLE)   // min osds required to be in to mark things out
+OPTION(mon_osd_warn_op_age, OPT_DOUBLE)     // max op age before we generate a warning (make it a power of 2)
+OPTION(mon_osd_err_op_age_ratio, OPT_DOUBLE)  // when to generate an error, as multiple of mon_osd_warn_op_age
+OPTION(mon_osd_prime_pg_temp, OPT_BOOL)  // prime osdmap with pg mapping changes
+OPTION(mon_osd_prime_pg_temp_max_time, OPT_FLOAT)  // max time to spend priming
+OPTION(mon_osd_prime_pg_temp_max_estimate, OPT_FLOAT) // max estimate of pg total before we do all pgs in parallel
+OPTION(mon_election_timeout, OPT_FLOAT)  // on election proposer, max waiting time for all ACKs
+OPTION(mon_lease, OPT_FLOAT)       // lease interval
+OPTION(mon_lease_renew_interval_factor, OPT_FLOAT) // on leader, to renew the lease
+OPTION(mon_lease_ack_timeout_factor, OPT_FLOAT) // on leader, if lease isn't acked by all peons
+OPTION(mon_accept_timeout_factor, OPT_FLOAT)    // on leader, if paxos update isn't accepted
+
+OPTION(mon_clock_drift_allowed, OPT_FLOAT) // allowed clock drift between monitors
+OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT) // exponential backoff for clock drift warnings
+OPTION(mon_timecheck_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval (seconds)
+OPTION(mon_timecheck_skew_interval, OPT_FLOAT) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
+OPTION(mon_pg_check_down_all_threshold, OPT_FLOAT) // threshold of down osds after which we check all pgs
+OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT) // position between pool cache_target_full and max where we start warning
+OPTION(mon_osd_full_ratio, OPT_FLOAT) // what % full makes an OSD "full"
+OPTION(mon_osd_backfillfull_ratio, OPT_FLOAT) // what % full makes an OSD backfill full (backfill halted)
+OPTION(mon_osd_nearfull_ratio, OPT_FLOAT) // what % full makes an OSD near full
+OPTION(mon_osd_initial_require_min_compat_client, OPT_STR)
+OPTION(mon_allow_pool_delete, OPT_BOOL) // allow pool deletion
+OPTION(mon_fake_pool_delete, OPT_BOOL)  // fake pool deletion (add _DELETED suffix)
+OPTION(mon_globalid_prealloc, OPT_U32)   // how many globalids to prealloc
+OPTION(mon_osd_report_timeout, OPT_INT)    // grace period before declaring unresponsive OSDs dead
+OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL) // warn if crush tunables are too old (older than mon_crush_min_required_version)
+OPTION(mon_crush_min_required_version, OPT_STR)
+OPTION(mon_warn_on_crush_straw_calc_version_zero, OPT_BOOL) // warn if crush straw_calc_version==0
+OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL) // warn if 'mon_osd_down_out_interval == 0'
+OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL)
+OPTION(mon_warn_on_misplaced, OPT_BOOL)
+OPTION(mon_min_osdmap_epochs, OPT_INT)
+OPTION(mon_max_log_epochs, OPT_INT)
+OPTION(mon_max_mdsmap_epochs, OPT_INT)
+OPTION(mon_max_osd, OPT_INT)
+OPTION(mon_probe_timeout, OPT_DOUBLE)
+OPTION(mon_client_bytes, OPT_U64)  // client msg data allowed in memory (in bytes)
+OPTION(mon_log_max_summary, OPT_U64)
+OPTION(mon_daemon_bytes, OPT_U64)  // mds, osd message memory cap (in bytes)
+OPTION(mon_max_log_entries_per_event, OPT_INT)
+OPTION(mon_reweight_min_pgs_per_osd, OPT_U64)   // min pgs per osd for reweight-by-pg command
+OPTION(mon_reweight_min_bytes_per_osd, OPT_U64)   // min bytes per osd for reweight-by-utilization command
+OPTION(mon_reweight_max_osds, OPT_INT)   // max osds to change per reweight-by-* command
+OPTION(mon_reweight_max_change, OPT_DOUBLE)
+OPTION(mon_health_to_clog, OPT_BOOL)
+OPTION(mon_health_to_clog_interval, OPT_INT)
+OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE)
+OPTION(mon_health_detail_to_clog, OPT_BOOL)
+OPTION(mon_data_avail_crit, OPT_INT)
+OPTION(mon_data_avail_warn, OPT_INT)
+OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes)
+OPTION(mon_warn_pg_not_scrubbed_ratio, OPT_FLOAT)
+OPTION(mon_warn_pg_not_deep_scrubbed_ratio, OPT_FLOAT)
+OPTION(mon_scrub_interval, OPT_INT) // once a day
+OPTION(mon_scrub_timeout, OPT_INT) // let's give it 5 minutes; why not.
+OPTION(mon_scrub_max_keys, OPT_INT) // max number of keys to scrub each time
+OPTION(mon_scrub_inject_crc_mismatch, OPT_DOUBLE) // probability of injected crc mismatch [0.0, 1.0]
+OPTION(mon_scrub_inject_missing_keys, OPT_DOUBLE) // probability of injected missing keys [0.0, 1.0]
+OPTION(mon_config_key_max_entry_size, OPT_INT) // max num bytes per config-key entry
+OPTION(mon_sync_timeout, OPT_DOUBLE)
+OPTION(mon_sync_max_payload_size, OPT_SIZE)
+OPTION(mon_sync_max_payload_keys, OPT_INT)
+OPTION(mon_sync_debug, OPT_BOOL) // enable sync-specific debug
+OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE)  // inject N second delay on each get_chunk request
+OPTION(mon_osd_force_trim_to, OPT_INT)   // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous)
+OPTION(mon_mds_force_trim_to, OPT_INT)   // force mon to trim mdsmaps to this point (dangerous)
+OPTION(mon_mds_skip_sanity, OPT_BOOL)  // skip safety assertions on FSMap (in case of bugs where we want to continue anyway)
+OPTION(mon_osd_snap_trim_queue_warn_on, OPT_INT)
+
+// monitor debug options
+OPTION(mon_debug_deprecated_as_obsolete, OPT_BOOL) // consider deprecated commands as obsolete
+
+// dump transactions
+OPTION(mon_debug_dump_transactions, OPT_BOOL)
+OPTION(mon_debug_dump_json, OPT_BOOL)
+OPTION(mon_debug_dump_location, OPT_STR)
+OPTION(mon_debug_no_require_mimic, OPT_BOOL)
+OPTION(mon_debug_no_require_bluestore_for_ec_overwrites, OPT_BOOL)
+OPTION(mon_debug_no_initial_persistent_features, OPT_BOOL)
+OPTION(mon_inject_transaction_delay_max, OPT_DOUBLE)      // seconds
+OPTION(mon_inject_transaction_delay_probability, OPT_DOUBLE) // range [0, 1]
+
+OPTION(mon_sync_provider_kill_at, OPT_INT)  // kill the sync provider at a specific point in the work flow
+OPTION(mon_sync_requester_kill_at, OPT_INT) // kill the sync requester at a specific point in the work flow
+OPTION(mon_force_quorum_join, OPT_BOOL) // force monitor to join quorum even if it has been previously removed from the map
+OPTION(mon_keyvaluedb, OPT_STR)   // type of keyvaluedb backend
+
+// UNSAFE -- TESTING ONLY! Allows addition of a cache tier with preexisting snaps
+OPTION(mon_debug_unsafe_allow_tier_with_nonempty_snaps, OPT_BOOL)
+OPTION(mon_osd_blacklist_default_expire, OPT_DOUBLE) // default one hour
+OPTION(mon_osd_crush_smoke_test, OPT_BOOL)
+
+OPTION(paxos_stash_full_interval, OPT_INT)   // how often (in commits) to stash a full copy of the PaxosService state
+OPTION(paxos_max_join_drift, OPT_INT) // max paxos iterations before we must first sync the monitor stores
+OPTION(paxos_propose_interval, OPT_DOUBLE)  // gather updates for this long before proposing a map update
+OPTION(paxos_min_wait, OPT_DOUBLE)  // min time to gather updates for after period of inactivity
+OPTION(paxos_min, OPT_INT)       // minimum number of paxos states to keep around
+OPTION(paxos_trim_min, OPT_INT)  // number of extra proposals tolerated before trimming
+OPTION(paxos_trim_max, OPT_INT) // max number of extra proposals to trim at a time
+OPTION(paxos_service_trim_min, OPT_INT) // minimum amount of versions to trigger a trim (0 disables it)
+OPTION(paxos_service_trim_max, OPT_INT) // maximum amount of versions to trim during a single proposal (0 disables it)
+OPTION(paxos_kill_at, OPT_INT)
+OPTION(auth_cluster_required, OPT_STR)   // required of mon, mds, osd daemons
+OPTION(auth_service_required, OPT_STR)   // required by daemons of clients
+OPTION(auth_client_required, OPT_STR)     // what clients require of daemons
+OPTION(auth_supported, OPT_STR)               // deprecated; default value for above if they are not defined.
+OPTION(max_rotating_auth_attempts, OPT_INT)
+OPTION(cephx_require_signatures, OPT_BOOL)
+OPTION(cephx_cluster_require_signatures, OPT_BOOL)
+OPTION(cephx_service_require_signatures, OPT_BOOL)
+OPTION(cephx_require_version, OPT_INT)
+OPTION(cephx_cluster_require_version, OPT_INT)
+OPTION(cephx_service_require_version, OPT_INT)
+OPTION(cephx_sign_messages, OPT_BOOL)  // Default to signing session messages if supported
+OPTION(auth_mon_ticket_ttl, OPT_DOUBLE)
+OPTION(auth_service_ticket_ttl, OPT_DOUBLE)
+OPTION(auth_allow_insecure_global_id_reclaim, OPT_BOOL)
+OPTION(auth_expose_insecure_global_id_reclaim, OPT_BOOL)
+OPTION(auth_debug, OPT_BOOL)          // if true, assert when weird things happen
+OPTION(mon_client_hunt_parallel, OPT_U32)   // how many mons to try to connect to in parallel during hunt
+OPTION(mon_client_hunt_interval, OPT_DOUBLE)   // try new mon every N seconds until we connect
+OPTION(mon_client_ping_interval, OPT_DOUBLE)  // ping every N seconds
+OPTION(mon_client_ping_timeout, OPT_DOUBLE)   // fail if we don't hear back
+OPTION(mon_client_hunt_interval_backoff, OPT_DOUBLE) // each time we reconnect to a monitor, double our timeout
+OPTION(mon_client_hunt_interval_max_multiple, OPT_DOUBLE) // up to a max of 10*default (30 seconds)
+OPTION(mon_client_max_log_entries_per_message, OPT_INT)
+OPTION(mon_client_directed_command_retry, OPT_INT)
+OPTION(client_cache_size, OPT_INT)
+OPTION(client_cache_mid, OPT_FLOAT)
+OPTION(client_use_random_mds, OPT_BOOL)
+OPTION(client_mount_timeout, OPT_DOUBLE)
+OPTION(client_tick_interval, OPT_DOUBLE)
+OPTION(client_trace, OPT_STR)
+OPTION(client_readahead_min, OPT_LONGLONG)  // readahead at _least_ this much.
+OPTION(client_readahead_max_bytes, OPT_LONGLONG)  // default unlimited
+OPTION(client_readahead_max_periods, OPT_LONGLONG)  // as multiple of file layout period (object size * num stripes)
+OPTION(client_reconnect_stale, OPT_BOOL)  // automatically reconnect stale session
+OPTION(client_snapdir, OPT_STR)
+OPTION(client_mount_uid, OPT_INT)
+OPTION(client_mount_gid, OPT_INT)
+OPTION(client_notify_timeout, OPT_INT) // in seconds
+OPTION(osd_client_watch_timeout, OPT_INT) // in seconds
+OPTION(client_caps_release_delay, OPT_INT) // in seconds
+OPTION(client_quota_df, OPT_BOOL) // use quota for df on subdir mounts
+OPTION(client_oc, OPT_BOOL)
+OPTION(client_oc_size, OPT_INT)    // MB * n
+OPTION(client_oc_max_dirty, OPT_INT)    // MB * n  (dirty OR tx.. bigish)
+OPTION(client_oc_target_dirty, OPT_INT) // target dirty (keep this smallish)
+OPTION(client_oc_max_dirty_age, OPT_DOUBLE)      // max age in cache before writeback
+OPTION(client_oc_max_objects, OPT_INT)      // max objects in cache
+OPTION(client_debug_getattr_caps, OPT_BOOL) // check if MDS reply contains wanted caps
+OPTION(client_debug_force_sync_read, OPT_BOOL)     // always read synchronously (go to osds)
+OPTION(client_debug_inject_tick_delay, OPT_INT) // delay the client tick for a number of seconds
+OPTION(client_max_inline_size, OPT_U64)
+OPTION(client_inject_release_failure, OPT_BOOL)  // synthetic client bug for testing
+OPTION(client_inject_fixed_oldest_tid, OPT_BOOL)  // synthetic client bug for testing
+OPTION(client_metadata, OPT_STR)
+OPTION(client_acl_type, OPT_STR)
+OPTION(client_permissions, OPT_BOOL)
+OPTION(client_dirsize_rbytes, OPT_BOOL)
+
+OPTION(client_try_dentry_invalidate, OPT_BOOL) // the client should try to use dentry invalidation instead of remounting, on kernels where it believes that will work
+OPTION(client_check_pool_perm, OPT_BOOL)
+OPTION(client_use_faked_inos, OPT_BOOL)
+
+OPTION(crush_location, OPT_STR)       // whitespace-separated list of key=value pairs describing crush location
+OPTION(crush_location_hook, OPT_STR)
+OPTION(crush_location_hook_timeout, OPT_INT)
+
+OPTION(objecter_tick_interval, OPT_DOUBLE)
+OPTION(objecter_timeout, OPT_DOUBLE)    // before we ask for a map
+OPTION(objecter_inflight_op_bytes, OPT_U64) // max in-flight data (both directions)
+OPTION(objecter_inflight_ops, OPT_U64)               // max in-flight ios
+OPTION(objecter_completion_locks_per_session, OPT_U64) // num of completion locks per each session, for serializing same object responses
+OPTION(objecter_inject_no_watch_ping, OPT_BOOL)   // suppress watch pings
+OPTION(objecter_retry_writes_after_first_reply, OPT_BOOL)   // ignore the first reply for each write, and resend the osd op instead
+OPTION(objecter_debug_inject_relock_delay, OPT_BOOL)
+
+// Max number of deletes at once in a single Filer::purge call
+OPTION(filer_max_purge_ops, OPT_U32)
+// Max number of truncate at once in a single Filer::truncate call
+OPTION(filer_max_truncate_ops, OPT_U32)
+
+OPTION(mds_data, OPT_STR)
+// max xattr kv pairs size for each dir/file
+OPTION(mds_max_xattr_pairs_size, OPT_U32)
+OPTION(mds_max_file_recover, OPT_U32)
+OPTION(mds_dir_max_commit_size, OPT_INT) // MB
+OPTION(mds_dir_keys_per_op, OPT_INT)
+OPTION(mds_decay_halflife, OPT_FLOAT)
+OPTION(mds_beacon_interval, OPT_FLOAT)
+OPTION(mds_beacon_grace, OPT_FLOAT)
+OPTION(mds_enforce_unique_name, OPT_BOOL)
+
+OPTION(mds_session_blacklist_on_timeout, OPT_BOOL)    // whether to blacklist clients whose sessions are dropped due to timeout
+OPTION(mds_session_blacklist_on_evict, OPT_BOOL)  // whether to blacklist clients whose sessions are dropped via admin commands
+
+OPTION(mds_sessionmap_keys_per_op, OPT_U32)    // how many sessions should I try to load/store in a single OMAP operation?
+OPTION(mds_freeze_tree_timeout, OPT_FLOAT)    // detecting freeze tree deadlock
+OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
+OPTION(mds_reconnect_timeout, OPT_FLOAT)  // seconds to wait for clients during mds restart
+	      //  make it (mdsmap.session_timeout - mds_beacon_grace)
+OPTION(mds_tick_interval, OPT_FLOAT)
+OPTION(mds_dirstat_min_interval, OPT_FLOAT)    // try to avoid propagating more often than this
+OPTION(mds_scatter_nudge_interval, OPT_FLOAT)  // how quickly dirstat changes propagate up the hierarchy
+OPTION(mds_client_prealloc_inos, OPT_INT)
+OPTION(mds_early_reply, OPT_BOOL)
+OPTION(mds_default_dir_hash, OPT_INT)
+OPTION(mds_log_pause, OPT_BOOL)
+OPTION(mds_log_skip_corrupt_events, OPT_BOOL)
+OPTION(mds_log_max_events, OPT_INT)
+OPTION(mds_log_events_per_segment, OPT_INT)
+OPTION(mds_log_segment_size, OPT_INT)  // segment size for mds log, default to default file_layout_t
+OPTION(mds_log_max_segments, OPT_U32)
+OPTION(mds_bal_export_pin, OPT_BOOL)  // allow clients to pin directory trees to ranks
+OPTION(mds_bal_sample_interval, OPT_DOUBLE)  // every 3 seconds
+OPTION(mds_bal_replicate_threshold, OPT_FLOAT)
+OPTION(mds_bal_unreplicate_threshold, OPT_FLOAT)
+OPTION(mds_bal_split_size, OPT_INT)
+OPTION(mds_bal_split_rd, OPT_FLOAT)
+OPTION(mds_bal_split_wr, OPT_FLOAT)
+OPTION(mds_bal_split_bits, OPT_INT)
+OPTION(mds_bal_merge_size, OPT_INT)
+OPTION(mds_bal_fragment_size_max, OPT_INT) // order of magnitude higher than split size
+OPTION(mds_bal_fragment_fast_factor, OPT_FLOAT) // multiple of size_max that triggers immediate split
+OPTION(mds_bal_idle_threshold, OPT_FLOAT)
+OPTION(mds_bal_max, OPT_INT)
+OPTION(mds_bal_max_until, OPT_INT)
+OPTION(mds_bal_mode, OPT_INT)
+OPTION(mds_bal_min_rebalance, OPT_FLOAT)  // must be this much above average before we export anything
+OPTION(mds_bal_min_start, OPT_FLOAT)      // if we need less than this, we don't do anything
+OPTION(mds_bal_need_min, OPT_FLOAT)       // take within this range of what we need
+OPTION(mds_bal_need_max, OPT_FLOAT)
+OPTION(mds_bal_midchunk, OPT_FLOAT)       // any sub bigger than this taken in full
+OPTION(mds_bal_minchunk, OPT_FLOAT)     // never take anything smaller than this
+OPTION(mds_bal_target_decay, OPT_DOUBLE) // target decay half-life in MDSMap (2x larger is approx. 2x slower)
+OPTION(mds_replay_interval, OPT_FLOAT) // time to wait before starting replay again
+OPTION(mds_shutdown_check, OPT_INT)
+OPTION(mds_thrash_exports, OPT_INT)
+OPTION(mds_thrash_fragments, OPT_INT)
+OPTION(mds_dump_cache_on_map, OPT_BOOL)
+OPTION(mds_dump_cache_after_rejoin, OPT_BOOL)
+OPTION(mds_verify_scatter, OPT_BOOL)
+OPTION(mds_debug_scatterstat, OPT_BOOL)
+OPTION(mds_debug_frag, OPT_BOOL)
+OPTION(mds_debug_auth_pins, OPT_BOOL)
+OPTION(mds_debug_subtrees, OPT_BOOL)
+OPTION(mds_kill_mdstable_at, OPT_INT)
+OPTION(mds_kill_export_at, OPT_INT)
+OPTION(mds_kill_import_at, OPT_INT)
+OPTION(mds_kill_link_at, OPT_INT)
+OPTION(mds_kill_rename_at, OPT_INT)
+OPTION(mds_kill_openc_at, OPT_INT)
+OPTION(mds_kill_journal_expire_at, OPT_INT)
+OPTION(mds_kill_journal_replay_at, OPT_INT)
+OPTION(mds_journal_format, OPT_U32)  // Default to most recent JOURNAL_FORMAT_*
+OPTION(mds_kill_create_at, OPT_INT)
+OPTION(mds_inject_traceless_reply_probability, OPT_DOUBLE) /* percentage
+				of MDS modify replies to skip sending the
+				client a trace on [0-1]*/
+OPTION(mds_wipe_sessions, OPT_BOOL)
+OPTION(mds_wipe_ino_prealloc, OPT_BOOL)
+OPTION(mds_skip_ino, OPT_INT)
+OPTION(mds_enable_op_tracker, OPT_BOOL) // enable/disable MDS op tracking
+OPTION(mds_op_history_size, OPT_U32)    // Max number of completed ops to track
+OPTION(mds_op_history_duration, OPT_U32) // Oldest completed op to track
+OPTION(mds_op_complaint_time, OPT_FLOAT) // how many seconds old makes an op complaint-worthy
+OPTION(mds_op_log_threshold, OPT_INT) // how many op log messages to show in one go
+OPTION(mds_snap_min_uid, OPT_U32) // The minimum UID required to create a snapshot
+OPTION(mds_snap_max_uid, OPT_U32) // The maximum UID allowed to create a snapshot
+OPTION(mds_snap_rstat, OPT_BOOL) // enable/disable nested stat for snapshot
+OPTION(mds_verify_backtrace, OPT_U32)
+// detect clients which aren't trimming completed requests
+OPTION(mds_max_completed_flushes, OPT_U32)
+OPTION(mds_max_completed_requests, OPT_U32)
+
+OPTION(mds_action_on_write_error, OPT_U32) // 0: ignore; 1: force readonly; 2: crash
+OPTION(mds_mon_shutdown_timeout, OPT_DOUBLE)
+
+// Maximum number of concurrent stray files to purge
+OPTION(mds_max_purge_files, OPT_U32)
+// Maximum number of concurrent RADOS ops to issue in purging
+OPTION(mds_max_purge_ops, OPT_U32)
+// Maximum number of concurrent RADOS ops to issue in purging, scaled by PG count
+OPTION(mds_max_purge_ops_per_pg, OPT_FLOAT)
+
+OPTION(mds_purge_queue_busy_flush_period, OPT_FLOAT)
+
+OPTION(mds_root_ino_uid, OPT_INT) // The UID of / on new filesystems
+OPTION(mds_root_ino_gid, OPT_INT) // The GID of / on new filesystems
+
+OPTION(mds_max_scrub_ops_in_progress, OPT_INT) // the number of simultaneous scrubs allowed
+
+// Maximum number of damaged frags/dentries before whole MDS rank goes damaged
+OPTION(mds_damage_table_max_entries, OPT_INT)
+
+// Maximum increment for client writable range, counted by number of objects
+OPTION(mds_client_writeable_range_max_inc_objs, OPT_U32)
+
+// verify backend can support configured max object name length
+OPTION(osd_check_max_object_name_len_on_startup, OPT_BOOL)
+
+// Maximum number of backfills to or from a single osd
+OPTION(osd_max_backfills, OPT_U64)
+
+// Minimum recovery priority (255 = max, smaller = lower)
+OPTION(osd_min_recovery_priority, OPT_INT)
+
+// Seconds to wait before retrying refused backfills
+OPTION(osd_backfill_retry_interval, OPT_DOUBLE)
+
+// Seconds to wait before retrying refused recovery
+OPTION(osd_recovery_retry_interval, OPT_DOUBLE)
+
+// max agent flush ops
+OPTION(osd_agent_max_ops, OPT_INT)
+OPTION(osd_agent_max_low_ops, OPT_INT)
+OPTION(osd_agent_min_evict_effort, OPT_FLOAT)
+OPTION(osd_agent_quantize_effort, OPT_FLOAT)
+OPTION(osd_agent_delay_time, OPT_FLOAT)
+
+// osd ignore history.last_epoch_started in find_best_info
+OPTION(osd_find_best_info_ignore_history_les, OPT_BOOL)
+
+// decay atime and hist histograms after how many objects go by
+OPTION(osd_agent_hist_halflife, OPT_INT)
+
+// must be this amount over the threshold to enable,
+// this amount below the threshold to disable.
+OPTION(osd_agent_slop, OPT_FLOAT)
+
+OPTION(osd_uuid, OPT_UUID)
+OPTION(osd_data, OPT_STR)
+OPTION(osd_journal, OPT_STR)
+OPTION(osd_journal_size, OPT_INT)         // in mb
+OPTION(osd_journal_flush_on_shutdown, OPT_BOOL) // Flush journal to data store on shutdown
+// flags for specific control purpose during osd mount() process. 
+// e.g., can be 1 to skip over replaying journal
+// or 2 to skip over mounting omap or 3 to skip over both.
+// This might be helpful in case the journal is totally corrupted
+// and we still want to bring the osd daemon back normally, etc.
+OPTION(osd_os_flags, OPT_U32)
+OPTION(osd_max_write_size, OPT_INT)
+OPTION(osd_max_pgls, OPT_U64) // max number of pgls entries to return
+OPTION(osd_client_message_size_cap, OPT_U64) // client data allowed in-memory (in bytes)
+OPTION(osd_client_message_cap, OPT_U64)              // num client messages allowed in-memory
+OPTION(osd_crush_update_weight_set, OPT_BOOL) // update weight set while updating weights
+OPTION(osd_crush_chooseleaf_type, OPT_INT) // 1 = host
+OPTION(osd_pool_use_gmt_hitset, OPT_BOOL) // try to use gmt for hitset archive names if all osds in cluster support it.
+OPTION(osd_crush_update_on_start, OPT_BOOL)
+OPTION(osd_class_update_on_start, OPT_BOOL) // automatically set device class on start
+OPTION(osd_crush_initial_weight, OPT_DOUBLE) // if >=0, the initial weight is for newly added osds.
+OPTION(osd_erasure_code_plugins, OPT_STR) // list of erasure code plugins
+
+// Allows the "peered" state for recovery and backfill below min_size
+OPTION(osd_allow_recovery_below_min_size, OPT_BOOL)
+
+OPTION(osd_pool_default_ec_fast_read, OPT_BOOL) // whether to turn on fast read on the pool or not
+OPTION(osd_pool_default_flags, OPT_INT)   // default flags for new pools
+OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL)   // use new pg hashing to prevent pool/pg overlap
+OPTION(osd_pool_default_flag_nodelete, OPT_BOOL) // pool can't be deleted
+OPTION(osd_pool_default_flag_nopgchange, OPT_BOOL) // pool's pg and pgp num can't be changed
+OPTION(osd_pool_default_flag_nosizechange, OPT_BOOL) // pool's size and min size can't be changed
+OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT)
+OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT)
+OPTION(osd_pool_default_cache_target_dirty_high_ratio, OPT_FLOAT)
+OPTION(osd_pool_default_cache_target_full_ratio, OPT_FLOAT)
+OPTION(osd_pool_default_cache_min_flush_age, OPT_INT)  // seconds
+OPTION(osd_pool_default_cache_min_evict_age, OPT_INT)  // seconds
+OPTION(osd_pool_default_cache_max_evict_check_size, OPT_INT)  // max size to check for eviction
+OPTION(osd_hit_set_min_size, OPT_INT)  // min target size for a HitSet
+OPTION(osd_hit_set_max_size, OPT_INT)  // max target size for a HitSet
+OPTION(osd_hit_set_namespace, OPT_STR) // rados namespace for hit_set tracking
+
+// conservative default throttling values
+OPTION(osd_tier_promote_max_objects_sec, OPT_U64)
+OPTION(osd_tier_promote_max_bytes_sec, OPT_U64)
+
+OPTION(osd_objecter_finishers, OPT_INT)
+
+OPTION(osd_map_dedup, OPT_BOOL)
+OPTION(osd_map_cache_size, OPT_INT)
+OPTION(osd_map_message_max, OPT_INT)  // max maps per MOSDMap message
+OPTION(osd_map_message_max_bytes, OPT_SIZE)  // max bytes of maps per MOSDMap message
+OPTION(osd_map_share_max_epochs, OPT_INT)  // cap on # of inc maps we send to peers, clients
+OPTION(osd_inject_bad_map_crc_probability, OPT_FLOAT)
+OPTION(osd_inject_failure_on_pg_removal, OPT_BOOL)
+// shut down the OSD if its status flips more than osd_max_markdown_count times within the most recent osd_max_markdown_period seconds
+OPTION(osd_max_markdown_period , OPT_INT)
+OPTION(osd_max_markdown_count, OPT_INT)
+
+OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64)
+OPTION(osd_op_pq_min_cost, OPT_U64)
+OPTION(osd_recover_clone_overlap, OPT_BOOL)   // preserve clone_overlap during recovery/migration
+OPTION(osd_op_num_threads_per_shard, OPT_INT)
+OPTION(osd_op_num_threads_per_shard_hdd, OPT_INT)
+OPTION(osd_op_num_threads_per_shard_ssd, OPT_INT)
+OPTION(osd_op_num_shards, OPT_INT)
+OPTION(osd_op_num_shards_hdd, OPT_INT)
+OPTION(osd_op_num_shards_ssd, OPT_INT)
+
+// PrioritizedQueue (prio), Weighted Priority Queue (wpq; default),
+// mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
+// and "mclock_client" are based on the mClock/dmClock algorithm
+// (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
+// class the operation belongs to. "mclock_client" does the same but
+// also works to enforce fairness between clients. "debug_random"
+// chooses among all four with equal probability.
+OPTION(osd_op_queue, OPT_STR)
+
+OPTION(osd_op_queue_cut_off, OPT_STR) // Min priority to go to strict queue. (low, high)
+
+// mClock priority queue parameters (res/wgt/lim) for seven types of ops
+OPTION(osd_op_queue_mclock_client_op_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_client_op_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_client_op_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_osd_rep_op_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_osd_rep_op_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_osd_rep_op_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_snap_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_snap_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_snap_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_recov_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_recov_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_recov_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_scrub_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_scrub_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_scrub_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_pg_delete_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_pg_delete_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_pg_delete_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_peering_event_res, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_peering_event_wgt, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_peering_event_lim, OPT_DOUBLE)
+OPTION(osd_op_queue_mclock_anticipation_timeout, OPT_DOUBLE)
+
+OPTION(osd_ignore_stale_divergent_priors, OPT_BOOL) // do not assert on divergent_prior entries which aren't in the log and whose on-disk objects are newer
+
+// Set to true for testing.  Users should NOT set this.
+// If set to true even after reading enough shards to
+// decode the object, any error will be reported.
+OPTION(osd_read_ec_check_for_errors, OPT_BOOL) // return error if any ec shard has an error
+
+// Only use clone_overlap for recovery if there are fewer than
+// osd_recover_clone_overlap_limit entries in the overlap set
+OPTION(osd_recover_clone_overlap_limit, OPT_INT)
+OPTION(osd_debug_feed_pullee, OPT_INT)
+
+OPTION(osd_backfill_scan_min, OPT_INT)
+OPTION(osd_backfill_scan_max, OPT_INT)
+OPTION(osd_op_thread_timeout, OPT_INT)
+OPTION(osd_op_thread_suicide_timeout, OPT_INT)
+OPTION(osd_recovery_sleep, OPT_FLOAT)         // seconds to sleep between recovery ops
+OPTION(osd_recovery_sleep_hdd, OPT_FLOAT)
+OPTION(osd_recovery_sleep_ssd, OPT_FLOAT)
+OPTION(osd_snap_trim_sleep, OPT_DOUBLE)
+OPTION(osd_scrub_invalid_stats, OPT_BOOL)
+OPTION(osd_command_thread_timeout, OPT_INT)
+OPTION(osd_command_thread_suicide_timeout, OPT_INT)
+OPTION(osd_heartbeat_interval, OPT_INT)       // (seconds) how often we ping peers
+
+// (seconds) how long before we decide a peer has failed
+// This setting is read by the MONs and OSDs and has to be set to the same value for both MONs and OSDs in the configuration
+OPTION(osd_heartbeat_grace, OPT_INT)
+OPTION(osd_heartbeat_min_peers, OPT_INT)     // minimum number of peers
+OPTION(osd_heartbeat_use_min_delay_socket, OPT_BOOL) // prio the heartbeat tcp socket and set dscp as CS6 on it if true
+OPTION(osd_heartbeat_min_size, OPT_INT) // the minimum size of OSD heartbeat messages to send
+
+// max number of parallel snap trims/pg
+OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64)
+// max number of trimming pgs
+OPTION(osd_max_trimming_pgs, OPT_U64)
+
+// minimum ratio of peers that must be reachable to mark ourselves
+// back up after being wrongly marked down.
+OPTION(osd_heartbeat_min_healthy_ratio, OPT_FLOAT)
+
+OPTION(osd_mon_heartbeat_interval, OPT_INT)  // (seconds) how often to ping monitor if no peers
+OPTION(osd_mon_report_interval, OPT_INT)  // failures, up_thru, boot.
+OPTION(osd_mon_report_max_in_flight, OPT_INT)  // max updates in flight
+OPTION(osd_beacon_report_interval, OPT_INT)       // (seconds) how often to send beacon message to monitor
+OPTION(osd_pg_stat_report_interval_max, OPT_INT)  // report pg stats for any given pg at least this often
+OPTION(osd_mon_ack_timeout, OPT_DOUBLE) // time out a mon if it doesn't ack stats
+OPTION(osd_stats_ack_timeout_factor, OPT_DOUBLE) // multiples of mon_ack_timeout
+OPTION(osd_stats_ack_timeout_decay, OPT_DOUBLE)
+OPTION(osd_default_data_pool_replay_window, OPT_INT)
+OPTION(osd_auto_mark_unfound_lost, OPT_BOOL)
+OPTION(osd_recovery_delay_start, OPT_FLOAT)
+OPTION(osd_recovery_max_active, OPT_U64)
+OPTION(osd_recovery_max_single_start, OPT_U64)
+OPTION(osd_recovery_max_chunk, OPT_U64)  // max size of push chunk
+OPTION(osd_recovery_max_omap_entries_per_chunk, OPT_U64) // max number of omap entries per chunk; 0 to disable limit
+OPTION(osd_copyfrom_max_chunk, OPT_U64)   // max size of a COPYFROM chunk
+OPTION(osd_push_per_object_cost, OPT_U64)  // push cost per object
+OPTION(osd_max_push_cost, OPT_U64)  // max size of push message
+OPTION(osd_max_push_objects, OPT_U64)  // max objects in single push op
+OPTION(osd_max_scrubs, OPT_INT)
+OPTION(osd_scrub_during_recovery, OPT_BOOL) // Allow new scrubs to start while recovery is active on the OSD
+OPTION(osd_repair_during_recovery, OPT_BOOL) // Allow new requested repairs to start while recovery is active on the OSD
+OPTION(osd_scrub_begin_hour, OPT_INT)
+OPTION(osd_scrub_end_hour, OPT_INT)
+OPTION(osd_scrub_begin_week_day, OPT_INT)
+OPTION(osd_scrub_end_week_day, OPT_INT)
+OPTION(osd_scrub_load_threshold, OPT_FLOAT)
+OPTION(osd_scrub_min_interval, OPT_FLOAT)    // if load is low
+OPTION(osd_scrub_max_interval, OPT_FLOAT)  // regardless of load
+OPTION(osd_scrub_interval_randomize_ratio, OPT_FLOAT) // randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
+OPTION(osd_scrub_backoff_ratio, OPT_DOUBLE)   // the probability to back off the scheduled scrub
+OPTION(osd_scrub_chunk_min, OPT_INT)
+OPTION(osd_scrub_chunk_max, OPT_INT)
+OPTION(osd_scrub_sleep, OPT_FLOAT)   // sleep between [deep]scrub ops
+OPTION(osd_scrub_auto_repair, OPT_BOOL)   // whether to auto-repair inconsistencies upon deep-scrubbing
+OPTION(osd_scrub_auto_repair_num_errors, OPT_U32)   // only auto-repair when number of errors is below this threshold
+OPTION(osd_deep_scrub_interval, OPT_FLOAT) // once a week
+OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)
+OPTION(osd_deep_scrub_stride, OPT_INT)
+OPTION(osd_deep_scrub_keys, OPT_INT)
+OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT)   // objects must be this old (seconds) before we update the whole-object digest on scrub
+OPTION(osd_skip_data_digest, OPT_BOOL)
+OPTION(osd_deep_scrub_large_omap_object_key_threshold, OPT_U64)
+OPTION(osd_deep_scrub_large_omap_object_value_sum_threshold, OPT_U64)
+OPTION(osd_class_dir, OPT_STR) // where rados plugins are stored
+OPTION(osd_open_classes_on_start, OPT_BOOL)
+OPTION(osd_class_load_list, OPT_STR) // list of object classes allowed to be loaded (allow all: *)
+OPTION(osd_class_default_list, OPT_STR) // list of object classes with default execute perm (allow all: *)
+OPTION(osd_check_for_log_corruption, OPT_BOOL)
+OPTION(osd_use_stale_snap, OPT_BOOL)
+OPTION(osd_rollback_to_cluster_snap, OPT_STR)
+OPTION(osd_default_notify_timeout, OPT_U32) // default notify timeout in seconds
+OPTION(osd_kill_backfill_at, OPT_INT)
+
+// Bounds how infrequently a new map epoch will be persisted for a pg
+OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32) // make this < map_cache_size!
+
+OPTION(osd_min_pg_log_entries, OPT_U32)  // number of entries to keep in the pg log when trimming it
+OPTION(osd_max_pg_log_entries, OPT_U32) // max entries, say when degraded, before we trim
+OPTION(osd_pg_log_dups_tracked, OPT_U32) // how many versions back to track combined in both pglog's regular + dup logs
+OPTION(osd_force_recovery_pg_log_entries_factor, OPT_FLOAT) // max entries factor before force recovery
+OPTION(osd_pg_log_trim_min, OPT_U32)
+OPTION(osd_pg_log_trim_max, OPT_U32)
+OPTION(osd_op_complaint_time, OPT_FLOAT) // how many seconds old makes an op complaint-worthy
+OPTION(osd_command_max_records, OPT_INT)
+OPTION(osd_max_pg_blocked_by, OPT_U32)    // max peer osds to report that are blocking our progress
+OPTION(osd_op_log_threshold, OPT_INT) // how many op log messages to show in one go
+OPTION(osd_verify_sparse_read_holes, OPT_BOOL)  // read fiemap-reported holes and verify they are zeros
+OPTION(osd_backoff_on_unfound, OPT_BOOL)   // object unfound
+OPTION(osd_backoff_on_degraded, OPT_BOOL) // [mainly for debug?] object unreadable/writeable
+OPTION(osd_backoff_on_peering, OPT_BOOL)  // [debug] pg peering
+OPTION(osd_debug_crash_on_ignored_backoff, OPT_BOOL) // crash osd if client ignores a backoff; useful for debugging
+OPTION(osd_debug_inject_dispatch_delay_probability, OPT_DOUBLE)
+OPTION(osd_debug_inject_dispatch_delay_duration, OPT_DOUBLE)
+OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE)
+OPTION(osd_debug_drop_ping_duration, OPT_INT)
+OPTION(osd_debug_op_order, OPT_BOOL)
+OPTION(osd_debug_verify_missing_on_start, OPT_BOOL)
+OPTION(osd_debug_verify_snaps, OPT_BOOL)
+OPTION(osd_debug_verify_stray_on_activate, OPT_BOOL)
+OPTION(osd_debug_skip_full_check_in_backfill_reservation, OPT_BOOL)
+OPTION(osd_debug_reject_backfill_probability, OPT_DOUBLE)
+OPTION(osd_debug_inject_copyfrom_error, OPT_BOOL)  // inject failure during copyfrom completion
+OPTION(osd_debug_misdirected_ops, OPT_BOOL)
+OPTION(osd_debug_skip_full_check_in_recovery, OPT_BOOL)
+OPTION(osd_debug_random_push_read_error, OPT_DOUBLE)
+OPTION(osd_debug_verify_cached_snaps, OPT_BOOL)
+OPTION(osd_debug_deep_scrub_sleep, OPT_FLOAT)
+OPTION(osd_debug_no_acting_change, OPT_BOOL)
+OPTION(osd_debug_pretend_recovery_active, OPT_BOOL)
+OPTION(osd_enable_op_tracker, OPT_BOOL) // enable/disable OSD op tracking
+OPTION(osd_num_op_tracker_shard, OPT_U32) // The number of shards for holding the ops
+OPTION(osd_op_history_size, OPT_U32)    // Max number of completed ops to track
+OPTION(osd_op_history_duration, OPT_U32) // Oldest completed op to track
+OPTION(osd_op_history_slow_op_size, OPT_U32)           // Max number of slow ops to track
+OPTION(osd_op_history_slow_op_threshold, OPT_DOUBLE) // track the op if over this threshold
+OPTION(osd_target_transaction_size, OPT_INT)     // to adjust various transactions that batch smaller items
+OPTION(osd_failsafe_full_ratio, OPT_FLOAT) // what % full makes an OSD "full" (failsafe)
+OPTION(osd_fast_shutdown, OPT_BOOL)
+OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL) // immediately mark OSDs as down once they refuse to accept connections
+
+OPTION(osd_pg_object_context_cache_count, OPT_INT)
+OPTION(osd_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+OPTION(osd_function_tracing, OPT_BOOL) // true if function instrumentation should use LTTng
+
+OPTION(osd_fast_info, OPT_BOOL) // use fast info attr, if we can
+
+// determines whether PGLog::check() compares written out log to stored log
+OPTION(osd_debug_pg_log_writeout, OPT_BOOL)
+OPTION(osd_loop_before_reset_tphandle, OPT_U32) // Max number of loop before we reset thread-pool's handle
+OPTION(osd_max_snap_prune_intervals_per_epoch, OPT_U64) // Max number of snap intervals to report to mgr in pg_stat_t
+
+// default timeout while calling WaitInterval on an empty queue
+OPTION(threadpool_default_timeout, OPT_INT)
+// default wait time for an empty queue before pinging the hb timeout
+OPTION(threadpool_empty_queue_max_wait, OPT_INT)
+
+OPTION(leveldb_log_to_ceph_log, OPT_BOOL)
+OPTION(leveldb_write_buffer_size, OPT_U64) // leveldb write buffer size
+OPTION(leveldb_cache_size, OPT_U64) // leveldb cache size
+OPTION(leveldb_block_size, OPT_U64) // leveldb block size
+OPTION(leveldb_bloom_size, OPT_INT) // leveldb bloom bits per entry
+OPTION(leveldb_max_open_files, OPT_INT) // leveldb max open files
+OPTION(leveldb_compression, OPT_BOOL) // leveldb uses compression
+OPTION(leveldb_paranoid, OPT_BOOL) // leveldb paranoid flag
+OPTION(leveldb_log, OPT_STR)  // enable leveldb log file
+OPTION(leveldb_compact_on_mount, OPT_BOOL)
+
+OPTION(kinetic_host, OPT_STR) // hostname or ip address of a kinetic drive to use
+OPTION(kinetic_port, OPT_INT) // port number of the kinetic drive
+OPTION(kinetic_user_id, OPT_INT) // kinetic user to authenticate as
+OPTION(kinetic_hmac_key, OPT_STR) // kinetic key to authenticate with
+OPTION(kinetic_use_ssl, OPT_BOOL) // whether to secure kinetic traffic with TLS
+
+
+OPTION(rocksdb_log_to_ceph_log, OPT_BOOL)  // log to ceph log
+OPTION(rocksdb_cache_size, OPT_U64)  // rocksdb cache size (unless set by bluestore/etc)
+OPTION(rocksdb_cache_row_ratio, OPT_FLOAT)   // ratio of cache for row (vs block)
+OPTION(rocksdb_cache_shard_bits, OPT_INT)  // rocksdb block cache shard bits, 4 bit -> 16 shards
+OPTION(rocksdb_cache_type, OPT_STR) // 'lru' or 'clock'
+OPTION(rocksdb_block_size, OPT_INT)  // default rocksdb block size
+OPTION(rocksdb_perf, OPT_BOOL) // Enabling this will have 5-10% impact on performance for the stats collection
+OPTION(rocksdb_collect_compaction_stats, OPT_BOOL) // For rocksdb, this behavior will be an overhead of 5%~10%; collected only if rocksdb_perf is enabled.
+OPTION(rocksdb_collect_extended_stats, OPT_BOOL) // For rocksdb, this behavior will be an overhead of 5%~10%; collected only if rocksdb_perf is enabled.
+OPTION(rocksdb_collect_memory_stats, OPT_BOOL) // For rocksdb, this behavior will be an overhead of 5%~10%; collected only if rocksdb_perf is enabled.
+OPTION(rocksdb_enable_rmrange, OPT_BOOL) // see https://github.com/facebook/rocksdb/blob/master/include/rocksdb/db.h#L253
+
+// rocksdb options that will be used for omap (if omap_backend is rocksdb)
+OPTION(filestore_rocksdb_options, OPT_STR)
+// rocksdb options that will be used in monstore
+OPTION(mon_rocksdb_options, OPT_STR)
+
+/**
+ * osd_*_priority adjust the relative priority of client io, recovery io,
+ * snaptrim io, etc
+ *
+ * osd_*_priority determines the ratio of available io between client and
+ * recovery.  Each option may be set between
+ * 1..63.
+ */
+OPTION(osd_client_op_priority, OPT_U32)
+OPTION(osd_recovery_op_priority, OPT_U32)
+OPTION(osd_peering_op_priority, OPT_U32)
+
+OPTION(osd_snap_trim_priority, OPT_U32)
+OPTION(osd_snap_trim_cost, OPT_U32) // set default cost equal to 1MB io
+
+OPTION(osd_scrub_priority, OPT_U32)
+// set default cost equal to 50MB io
+OPTION(osd_scrub_cost, OPT_U32) 
+// set requested scrub priority higher than scrub priority to make the
+// requested scrubs jump the queue of scheduled scrubs
+OPTION(osd_requested_scrub_priority, OPT_U32)
+
+OPTION(osd_pg_delete_priority, OPT_U32)
+OPTION(osd_pg_delete_cost, OPT_U32) // set default cost equal to 1MB io
+
+OPTION(osd_recovery_priority, OPT_U32)
+// set default cost equal to 20MB io
+OPTION(osd_recovery_cost, OPT_U32)
+
+/**
+ * osd_recovery_op_warn_multiple scales the normal warning threshold,
+ * osd_op_complaint_time, so that slow recovery ops won't cause noise
+ */
+OPTION(osd_recovery_op_warn_multiple, OPT_U32)
+
+// Max time to wait between notifying mon of shutdown and shutting down
+OPTION(osd_mon_shutdown_timeout, OPT_DOUBLE)
+OPTION(osd_shutdown_pgref_assert, OPT_BOOL) // crash if the OSD has stray PG refs on shutdown
+
+OPTION(osd_max_object_size, OPT_U64) // OSD's maximum object size
+OPTION(osd_max_object_name_len, OPT_U32) // max rados object name len
+OPTION(osd_max_object_namespace_len, OPT_U32) // max rados object namespace len
+OPTION(osd_max_attr_name_len, OPT_U32)    // max rados attr name len; cannot go higher than 100 chars for file system backends
+OPTION(osd_max_attr_size, OPT_U64)
+
+OPTION(osd_max_omap_entries_per_request, OPT_U64)
+OPTION(osd_max_omap_bytes_per_request, OPT_U64)
+
+OPTION(osd_objectstore, OPT_STR)  // ObjectStore backend type
+OPTION(osd_objectstore_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+OPTION(osd_objectstore_fuse, OPT_BOOL)
+
+OPTION(osd_bench_small_size_max_iops, OPT_U32) // 100 IOPS
+OPTION(osd_bench_large_size_max_throughput, OPT_U64) // 100 MB/s
+OPTION(osd_bench_max_block_size, OPT_U64) // cap the block size at 64MB
+OPTION(osd_bench_duration, OPT_U32) // duration of 'osd bench', capped at 30s to avoid triggering timeouts
+
+OPTION(osd_blkin_trace_all, OPT_BOOL) // create a blkin trace for all osd requests
+OPTION(osdc_blkin_trace_all, OPT_BOOL) // create a blkin trace for all objecter requests
+
+OPTION(osd_discard_disconnected_ops, OPT_BOOL)
+
+OPTION(memstore_device_bytes, OPT_U64)
+OPTION(memstore_page_set, OPT_BOOL)
+OPTION(memstore_page_size, OPT_U64)
+
+OPTION(bdev_debug_inflight_ios, OPT_BOOL)
+OPTION(bdev_inject_crash, OPT_INT)  // if N>0, then ~ 1/N IOs will complete before we crash on flush.
+OPTION(bdev_inject_crash_flush_delay, OPT_INT) // wait N more seconds on flush
+OPTION(bdev_aio, OPT_BOOL)
+OPTION(bdev_aio_poll_ms, OPT_INT)  // milliseconds
+OPTION(bdev_aio_max_queue_depth, OPT_INT)
+OPTION(bdev_aio_reap_max, OPT_INT)
+OPTION(bdev_block_size, OPT_INT)
+OPTION(bdev_debug_aio, OPT_BOOL)
+OPTION(bdev_debug_aio_suicide_timeout, OPT_FLOAT)
+OPTION(bdev_debug_aio_log_age, OPT_DOUBLE)
+
+// if yes, osd will unbind all NVMe devices from kernel driver and bind them
+// to the uio_pci_generic driver. The purpose is to prevent the case where
+// NVMe driver is loaded while osd is running.
+OPTION(bdev_nvme_unbind_from_kernel, OPT_BOOL)
+OPTION(bdev_nvme_retry_count, OPT_INT) // -1 means use the default, which is 4
+OPTION(bdev_enable_discard, OPT_BOOL)
+OPTION(bdev_async_discard, OPT_BOOL)
+
+OPTION(objectstore_blackhole, OPT_BOOL)
+
+OPTION(bluefs_alloc_size, OPT_U64)
+OPTION(bluefs_shared_alloc_size, OPT_U64)
+OPTION(bluefs_max_prefetch, OPT_U64)
+OPTION(bluefs_min_log_runway, OPT_U64)  // alloc when we get this low
+OPTION(bluefs_max_log_runway, OPT_U64)  // alloc this much at a time
+OPTION(bluefs_log_compact_min_ratio, OPT_FLOAT)      // before we consider
+OPTION(bluefs_log_compact_min_size, OPT_U64)  // before we consider
+OPTION(bluefs_min_flush_size, OPT_U64)  // ignore flush until it's this big
+OPTION(bluefs_compact_log_sync, OPT_BOOL)  // sync or async log compaction?
+OPTION(bluefs_buffered_io, OPT_BOOL)
+OPTION(bluefs_sync_write, OPT_BOOL)
+OPTION(bluefs_allocator, OPT_STR)     // stupid | bitmap
+OPTION(bluefs_preextend_wal_files, OPT_BOOL)  // this *requires* that rocksdb has recycling enabled
+OPTION(bluefs_replay_recovery, OPT_BOOL)
+OPTION(bluefs_replay_recovery_disable_compact, OPT_BOOL)
+OPTION(bluefs_check_for_zeros, OPT_BOOL)
+
+OPTION(bluestore_bluefs, OPT_BOOL)
+OPTION(bluestore_bluefs_env_mirror, OPT_BOOL) // mirror to normal Env for debug
+OPTION(bluestore_bluefs_min, OPT_U64) // 1gb
+OPTION(bluestore_bluefs_min_ratio, OPT_FLOAT)  // min fs free / total free
+OPTION(bluestore_bluefs_max_ratio, OPT_FLOAT)  // max fs free / total free
+OPTION(bluestore_bluefs_gift_ratio, OPT_FLOAT) // how much to add at a time
+OPTION(bluestore_bluefs_reclaim_ratio, OPT_FLOAT) // how much to reclaim at a time
+OPTION(bluestore_bluefs_balance_interval, OPT_FLOAT) // how often (sec) to balance free space between bluefs and bluestore
+// how often (sec) to dump allocator on allocation failure
+OPTION(bluestore_bluefs_alloc_failure_dump_interval, OPT_FLOAT)
+
+// Enforces db sync with legacy bluefs extents information on close.
+// Enables downgrades to pre-nautilus releases
+OPTION(bluestore_bluefs_db_compatibility, OPT_BOOL)
+
+// If you want to use spdk driver, you need to specify NVMe serial number here
+// with "spdk:" prefix.
+// Users can use 'lspci -vvv -d 8086:0953 | grep "Device Serial Number"' to
+// get the serial number of Intel(R) Fultondale NVMe controllers.
+// Example:
+// bluestore_block_path = spdk:55cd2e404bd73932
+OPTION(bluestore_block_path, OPT_STR)
+OPTION(bluestore_block_size, OPT_U64)  // 10gb for testing
+OPTION(bluestore_block_create, OPT_BOOL)
+OPTION(bluestore_block_db_path, OPT_STR)
+OPTION(bluestore_block_db_size, OPT_U64)   // rocksdb ssts (hot/warm)
+OPTION(bluestore_block_db_create, OPT_BOOL)
+OPTION(bluestore_block_wal_path, OPT_STR)
+OPTION(bluestore_block_wal_size, OPT_U64) // rocksdb wal
+OPTION(bluestore_block_wal_create, OPT_BOOL)
+OPTION(bluestore_block_preallocate_file, OPT_BOOL) // whether to preallocate space if block/db_path/wal_path is a file rather than a block device.
+OPTION(bluestore_ignore_data_csum, OPT_BOOL)
+OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
+OPTION(bluestore_retry_disk_reads, OPT_U64)
+OPTION(bluestore_min_alloc_size, OPT_U32)
+OPTION(bluestore_min_alloc_size_hdd, OPT_U32)
+OPTION(bluestore_min_alloc_size_ssd, OPT_U32)
+OPTION(bluestore_max_alloc_size, OPT_U32)
+OPTION(bluestore_prefer_deferred_size, OPT_U32)
+OPTION(bluestore_prefer_deferred_size_hdd, OPT_U32)
+OPTION(bluestore_prefer_deferred_size_ssd, OPT_U32)
+OPTION(bluestore_compression_mode, OPT_STR)  // force|aggressive|passive|none
+OPTION(bluestore_compression_algorithm, OPT_STR)
+OPTION(bluestore_compression_min_blob_size, OPT_U32)
+OPTION(bluestore_compression_min_blob_size_hdd, OPT_U32)
+OPTION(bluestore_compression_min_blob_size_ssd, OPT_U32)
+OPTION(bluestore_compression_max_blob_size, OPT_U32)
+OPTION(bluestore_compression_max_blob_size_hdd, OPT_U32)
+OPTION(bluestore_compression_max_blob_size_ssd, OPT_U32)
+/*
+ * Specifies minimum expected amount of saved allocation units
+ * per single blob to enable compressed blobs garbage collection
+ * 
+ */
+OPTION(bluestore_gc_enable_blob_threshold, OPT_INT)  
+/*
+ * Specifies minimum expected amount of saved allocation units
+ * across all blobs to enable compressed blobs garbage collection
+ * 
+ */
+OPTION(bluestore_gc_enable_total_threshold, OPT_INT)  
+
+OPTION(bluestore_max_blob_size, OPT_U32)
+OPTION(bluestore_max_blob_size_hdd, OPT_U32)
+OPTION(bluestore_max_blob_size_ssd, OPT_U32)
+/*
+ * Require the net gain of compression at least to be at this ratio,
+ * otherwise we don't compress.
+ * And ask for compressing at least 12.5%(1/8) off, by default.
+ */
+OPTION(bluestore_compression_required_ratio, OPT_DOUBLE)
+OPTION(bluestore_extent_map_shard_max_size, OPT_U32)
+OPTION(bluestore_extent_map_shard_target_size, OPT_U32)
+OPTION(bluestore_extent_map_shard_min_size, OPT_U32)
+OPTION(bluestore_extent_map_shard_target_size_slop, OPT_DOUBLE)
+OPTION(bluestore_extent_map_inline_shard_prealloc_size, OPT_U32)
+OPTION(bluestore_cache_trim_interval, OPT_DOUBLE)
+OPTION(bluestore_cache_trim_max_skip_pinned, OPT_U32) // skip this many onodes pinned in cache before we give up
+OPTION(bluestore_cache_type, OPT_STR)   // lru, 2q
+OPTION(bluestore_2q_cache_kin_ratio, OPT_DOUBLE)    // kin page slot size / max page slot size
+OPTION(bluestore_2q_cache_kout_ratio, OPT_DOUBLE)   // number of kout page slot / total number of page slot
+OPTION(bluestore_cache_size, OPT_U64)
+OPTION(bluestore_cache_size_hdd, OPT_U64)
+OPTION(bluestore_cache_size_ssd, OPT_U64)
+OPTION(bluestore_cache_meta_ratio, OPT_DOUBLE)
+OPTION(bluestore_cache_kv_ratio, OPT_DOUBLE)
+OPTION(bluestore_kvbackend, OPT_STR)
+OPTION(bluestore_allocator, OPT_STR)     // stupid | bitmap
+OPTION(bluestore_freelist_blocks_per_key, OPT_INT)
+OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT) // must be power of 2 aligned, e.g., 512, 1024, 2048...
+OPTION(bluestore_bitmapallocator_span_size, OPT_INT) // must be power of 2 aligned, e.g., 512, 1024, 2048...
+OPTION(bluestore_max_deferred_txc, OPT_U64)
+OPTION(bluestore_rocksdb_options, OPT_STR)
+OPTION(bluestore_fsck_on_mount, OPT_BOOL)
+OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL)
+OPTION(bluestore_fsck_quick_fix_on_mount, OPT_BOOL)
+OPTION(bluestore_fsck_on_umount, OPT_BOOL)
+OPTION(bluestore_fsck_on_umount_deep, OPT_BOOL)
+OPTION(bluestore_fsck_on_mkfs, OPT_BOOL)
+OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL)
+OPTION(bluestore_sync_submit_transaction, OPT_BOOL) // submit kv txn in queueing thread (not kv_sync_thread)
+OPTION(bluestore_fsck_read_bytes_cap, OPT_U64)
+OPTION(bluestore_fsck_quick_fix_threads, OPT_INT)
+OPTION(bluestore_throttle_bytes, OPT_U64)
+OPTION(bluestore_throttle_deferred_bytes, OPT_U64)
+OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64)
+OPTION(bluestore_throttle_cost_per_io_ssd, OPT_U64)
+OPTION(bluestore_throttle_cost_per_io, OPT_U64)
+OPTION(bluestore_deferred_batch_ops, OPT_U64)
+OPTION(bluestore_deferred_batch_ops_hdd, OPT_U64)
+OPTION(bluestore_deferred_batch_ops_ssd, OPT_U64)
+OPTION(bluestore_nid_prealloc, OPT_INT)
+OPTION(bluestore_blobid_prealloc, OPT_U64)
+OPTION(bluestore_clone_cow, OPT_BOOL)  // do copy-on-write for clones
+OPTION(bluestore_default_buffered_read, OPT_BOOL)
+OPTION(bluestore_default_buffered_write, OPT_BOOL)
+OPTION(bluestore_debug_misc, OPT_BOOL)
+OPTION(bluestore_debug_no_reuse_blocks, OPT_BOOL)
+OPTION(bluestore_debug_small_allocations, OPT_INT)
+OPTION(bluestore_debug_max_cached_onodes, OPT_INT)
+OPTION(bluestore_debug_too_many_blobs_threshold, OPT_INT)
+OPTION(bluestore_debug_freelist, OPT_BOOL)
+OPTION(bluestore_debug_prefill, OPT_FLOAT)
+OPTION(bluestore_debug_prefragment_max, OPT_INT)
+OPTION(bluestore_debug_inject_read_err, OPT_BOOL)
+OPTION(bluestore_debug_randomize_serial_transaction, OPT_INT)
+OPTION(bluestore_debug_omit_block_device_write, OPT_BOOL)
+OPTION(bluestore_debug_fsck_abort, OPT_BOOL)
+OPTION(bluestore_debug_omit_kv_commit, OPT_BOOL)
+OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
+OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
+OPTION(bluestore_debug_inject_bug21040, OPT_BOOL)
+OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)
+OPTION(bluestore_fsck_error_on_no_per_pool_stats, OPT_BOOL)
+OPTION(bluestore_warn_on_bluefs_spillover, OPT_BOOL)
+OPTION(bluestore_warn_on_legacy_statfs, OPT_BOOL)
+OPTION(bluestore_log_op_age, OPT_DOUBLE)
+OPTION(bluestore_log_omap_iterator_age, OPT_DOUBLE)
+OPTION(bluestore_log_collection_list_age, OPT_DOUBLE)
+OPTION(bluestore_volume_selection_policy, OPT_STR)
+OPTION(bluestore_volume_selection_reserved_factor, OPT_DOUBLE)
+OPTION(bluestore_volume_selection_reserved, OPT_INT)
+OPTION(bluestore_kv_sync_util_logging_s, OPT_DOUBLE)
+
+OPTION(kstore_max_ops, OPT_U64)
+OPTION(kstore_max_bytes, OPT_U64)
+OPTION(kstore_backend, OPT_STR)
+OPTION(kstore_rocksdb_options, OPT_STR)
+OPTION(kstore_fsck_on_mount, OPT_BOOL)
+OPTION(kstore_fsck_on_mount_deep, OPT_BOOL)
+OPTION(kstore_nid_prealloc, OPT_U64)
+OPTION(kstore_sync_transaction, OPT_BOOL)
+OPTION(kstore_sync_submit_transaction, OPT_BOOL)
+OPTION(kstore_onode_map_size, OPT_U64)
+OPTION(kstore_default_stripe_size, OPT_INT)
+
+OPTION(filestore_omap_backend, OPT_STR)
+OPTION(filestore_omap_backend_path, OPT_STR)
+
+/// filestore wb throttle limits
+OPTION(filestore_wbthrottle_enable, OPT_BOOL)
+OPTION(filestore_wbthrottle_btrfs_bytes_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_btrfs_bytes_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_btrfs_ios_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_btrfs_ios_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_btrfs_inodes_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_bytes_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_bytes_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_ios_start_flusher, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_ios_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_inodes_start_flusher, OPT_U64)
+
+/// These must be less than the fd limit
+OPTION(filestore_wbthrottle_btrfs_inodes_hard_limit, OPT_U64)
+OPTION(filestore_wbthrottle_xfs_inodes_hard_limit, OPT_U64)
+
+//Introduce a O_DSYNC write in the filestore
+OPTION(filestore_odsync_write, OPT_BOOL)
+
+// Tests index failure paths
+OPTION(filestore_index_retry_probability, OPT_DOUBLE)
+
+// Allow object read error injection
+OPTION(filestore_debug_inject_read_err, OPT_BOOL)
+OPTION(filestore_debug_random_read_err, OPT_DOUBLE)
+
+OPTION(filestore_debug_omap_check, OPT_BOOL) // Expensive debugging check on sync
+OPTION(filestore_omap_header_cache_size, OPT_INT)
+
+// Use omap for xattrs for attrs over
+// filestore_max_inline_xattr_size or
+OPTION(filestore_max_inline_xattr_size, OPT_U32)	//Override
+OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32)
+OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32)
+OPTION(filestore_max_inline_xattr_size_other, OPT_U32)
+
+// for more than filestore_max_inline_xattrs attrs
+OPTION(filestore_max_inline_xattrs, OPT_U32)	//Override
+OPTION(filestore_max_inline_xattrs_xfs, OPT_U32)
+OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32)
+OPTION(filestore_max_inline_xattrs_other, OPT_U32)
+
+// max xattr value size
+OPTION(filestore_max_xattr_value_size, OPT_U32)	//Override
+OPTION(filestore_max_xattr_value_size_xfs, OPT_U32)
+OPTION(filestore_max_xattr_value_size_btrfs, OPT_U32)
+// ext4 allows 4k xattrs total including some smallish extra fields and the
+// keys.  We're allowing 2 512 inline attrs in addition to some filestore
+// replay attrs.  After accounting for those, we still need to fit up to
+// two attrs of this value.  That means we need this value to be around 1k
+// to be safe.  This is hacky, but it's not worth complicating the code
+// to work around ext4's total xattr limit.
+OPTION(filestore_max_xattr_value_size_other, OPT_U32)
+
+OPTION(filestore_sloppy_crc, OPT_BOOL)         // track sloppy crcs
+OPTION(filestore_sloppy_crc_block_size, OPT_INT)
+
+OPTION(filestore_max_alloc_hint_size, OPT_U64) // bytes
+
+OPTION(filestore_max_sync_interval, OPT_DOUBLE)    // seconds
+OPTION(filestore_min_sync_interval, OPT_DOUBLE)  // seconds
+OPTION(filestore_btrfs_snap, OPT_BOOL)
+OPTION(filestore_btrfs_clone_range, OPT_BOOL)
+OPTION(filestore_zfs_snap, OPT_BOOL) // zfsonlinux is still unstable
+OPTION(filestore_fsync_flushes_journal_data, OPT_BOOL)
+OPTION(filestore_fiemap, OPT_BOOL)     // (try to) use fiemap
+OPTION(filestore_punch_hole, OPT_BOOL)
+OPTION(filestore_seek_data_hole, OPT_BOOL)     // (try to) use seek_data/hole
+OPTION(filestore_splice, OPT_BOOL)
+OPTION(filestore_fadvise, OPT_BOOL)
+//collect device partition information for management application to use
+OPTION(filestore_collect_device_partition_information, OPT_BOOL)
+
+// (try to) use extsize for alloc hint NOTE: extsize seems to trigger
+// data corruption in xfs prior to kernel 3.5.  filestore will
+// implicitly disable this if it cannot confirm the kernel is newer
+// than that.
+// NOTE: This option involves a tradeoff: When disabled, fragmentation is
+// worse, but large sequential writes are faster. When enabled, large
+// sequential writes are slower, but fragmentation is reduced.
+OPTION(filestore_xfs_extsize, OPT_BOOL)
+
+OPTION(filestore_journal_parallel, OPT_BOOL)
+OPTION(filestore_journal_writeahead, OPT_BOOL)
+OPTION(filestore_journal_trailing, OPT_BOOL)
+OPTION(filestore_queue_max_ops, OPT_U64)
+OPTION(filestore_queue_max_bytes, OPT_U64)
+
+OPTION(filestore_caller_concurrency, OPT_INT)
+
+/// Expected filestore throughput in B/s
+OPTION(filestore_expected_throughput_bytes, OPT_DOUBLE)
+/// Expected filestore throughput in ops/s
+OPTION(filestore_expected_throughput_ops, OPT_DOUBLE)
+
+/// Filestore max delay multiple.  Defaults to 0 (disabled)
+OPTION(filestore_queue_max_delay_multiple, OPT_DOUBLE)
+/// Filestore high delay multiple.  Defaults to 0 (disabled)
+OPTION(filestore_queue_high_delay_multiple, OPT_DOUBLE)
+
+/// Filestore max delay multiple bytes.  Defaults to 0 (disabled)
+OPTION(filestore_queue_max_delay_multiple_bytes, OPT_DOUBLE)
+/// Filestore high delay multiple bytes.  Defaults to 0 (disabled)
+OPTION(filestore_queue_high_delay_multiple_bytes, OPT_DOUBLE)
+
+/// Filestore max delay multiple ops.  Defaults to 0 (disabled)
+OPTION(filestore_queue_max_delay_multiple_ops, OPT_DOUBLE)
+/// Filestore high delay multiple ops.  Defaults to 0 (disabled)
+OPTION(filestore_queue_high_delay_multiple_ops, OPT_DOUBLE)
+
+/// Use above to inject delays intended to keep the op queue between low and high
+OPTION(filestore_queue_low_threshhold, OPT_DOUBLE)
+OPTION(filestore_queue_high_threshhold, OPT_DOUBLE)
+
+OPTION(filestore_op_threads, OPT_INT)
+OPTION(filestore_op_thread_timeout, OPT_INT)
+OPTION(filestore_op_thread_suicide_timeout, OPT_INT)
+OPTION(filestore_commit_timeout, OPT_FLOAT)
+OPTION(filestore_fiemap_threshold, OPT_INT)
+OPTION(filestore_merge_threshold, OPT_INT)
+OPTION(filestore_split_multiple, OPT_INT)
+OPTION(filestore_split_rand_factor, OPT_U32) // randomize the split threshold by adding 16 * [0, split_rand_factor)
+OPTION(filestore_update_to, OPT_INT)
+OPTION(filestore_blackhole, OPT_BOOL)     // drop any new transactions on the floor
+OPTION(filestore_fd_cache_size, OPT_INT)    // FD lru size
+OPTION(filestore_fd_cache_shards, OPT_INT)   // FD number of shards
+OPTION(filestore_ondisk_finisher_threads, OPT_INT)
+OPTION(filestore_apply_finisher_threads, OPT_INT)
+OPTION(filestore_dump_file, OPT_STR)         // file onto which store transaction dumps
+OPTION(filestore_kill_at, OPT_INT)            // inject a failure at the n'th opportunity
+OPTION(filestore_inject_stall, OPT_INT)       // artificially stall for N seconds in op queue thread
+OPTION(filestore_fail_eio, OPT_BOOL)       // fail/crash on EIO
+OPTION(filestore_debug_verify_split, OPT_BOOL)
+OPTION(journal_dio, OPT_BOOL)
+OPTION(journal_aio, OPT_BOOL)
+OPTION(journal_force_aio, OPT_BOOL)
+OPTION(journal_block_size, OPT_INT)
+
+OPTION(journal_block_align, OPT_BOOL)
+OPTION(journal_write_header_frequency, OPT_U64)
+OPTION(journal_max_write_bytes, OPT_INT)
+OPTION(journal_max_write_entries, OPT_INT)
+
+/// Target range for journal fullness
+OPTION(journal_throttle_low_threshhold, OPT_DOUBLE)
+OPTION(journal_throttle_high_threshhold, OPT_DOUBLE)
+
+/// Multiple over expected at high_threshhold. Defaults to 0 (disabled).
+OPTION(journal_throttle_high_multiple, OPT_DOUBLE)
+/// Multiple over expected at max.  Defaults to 0 (disabled).
+OPTION(journal_throttle_max_multiple, OPT_DOUBLE)
+
+OPTION(journal_align_min_size, OPT_INT)  // align data payloads >= this.
+OPTION(journal_replay_from, OPT_INT)
+OPTION(journal_zero_on_create, OPT_BOOL)
+OPTION(journal_ignore_corruption, OPT_BOOL) // assume journal is not corrupt
+OPTION(journal_discard, OPT_BOOL) // when an SSD is used as the journal, whether to discard journal data that is no longer in use
+
+OPTION(fio_dir, OPT_STR) // fio data directory for fio-objectstore
+
+OPTION(rados_mon_op_timeout, OPT_DOUBLE) // how many seconds to wait for a response from the monitor before returning an error from a rados operation. 0 means no limit.
+OPTION(rados_osd_op_timeout, OPT_DOUBLE) // how many seconds to wait for a response from osds before returning an error from a rados operation. 0 means no limit.
+OPTION(rados_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+
+OPTION(nss_db_path, OPT_STR) // path to nss db
+
+
+OPTION(rgw_max_attr_name_len, OPT_SIZE)
+OPTION(rgw_max_attr_size, OPT_SIZE)
+OPTION(rgw_max_attrs_num_in_req, OPT_U64)
+
+OPTION(rgw_max_chunk_size, OPT_INT)
+OPTION(rgw_put_obj_min_window_size, OPT_INT)
+OPTION(rgw_put_obj_max_window_size, OPT_INT)
+OPTION(rgw_max_put_size, OPT_U64)
+OPTION(rgw_max_put_param_size, OPT_U64) // max input size for PUT requests accepting json/xml params
+
+/**
+ * override max bucket index shards in zone configuration (if not zero)
+ *
+ * Represents the number of shards for the bucket index object, a value of zero
+ * indicates there is no sharding. By default (no sharding), the name of the object
+ * is '.dir.{marker}'; with sharding, the name is '.dir.{marker}.{sharding_id}', where
+ * sharding_id is a zero-based value. It is not recommended to set too large a value
+ * (e.g. a thousand) as it increases the cost of bucket listing.
+ */
+OPTION(rgw_override_bucket_index_max_shards, OPT_U32)
+
+/**
+ * Represents the maximum AIO pending requests for the bucket index object shards.
+ */
+OPTION(rgw_bucket_index_max_aio, OPT_U32)
+
+/**
+ * whether or not the quota/gc threads should be started
+ */
+OPTION(rgw_enable_quota_threads, OPT_BOOL)
+OPTION(rgw_enable_gc_threads, OPT_BOOL)
+OPTION(rgw_enable_lc_threads, OPT_BOOL)
+
+
+OPTION(rgw_data, OPT_STR)
+OPTION(rgw_enable_apis, OPT_STR)
+OPTION(rgw_cache_enabled, OPT_BOOL)   // rgw cache enabled
+OPTION(rgw_cache_lru_size, OPT_INT)   // num of entries in rgw cache
+OPTION(rgw_socket_path, OPT_STR)   // path to unix domain socket, if not specified, rgw will not run as external fcgi
+OPTION(rgw_host, OPT_STR)  // host for radosgw, can be an IP, default is 0.0.0.0
+OPTION(rgw_port, OPT_STR)  // port to listen, format as "8080" "5000", if not specified, rgw will not run external fcgi
+OPTION(rgw_dns_name, OPT_STR) // hostname suffix on buckets
+OPTION(rgw_dns_s3website_name, OPT_STR) // hostname suffix on buckets for s3-website endpoint
+OPTION(rgw_service_provider_name, OPT_STR) //service provider name which is contained in http response headers
+OPTION(rgw_content_length_compat, OPT_BOOL) // Check both HTTP_CONTENT_LENGTH and CONTENT_LENGTH in fcgi env
+OPTION(rgw_lifecycle_work_time, OPT_STR) //job process lc  at 00:00-06:00s
+OPTION(rgw_lc_lock_max_time, OPT_INT)  // total run time for a single lc processor work
+OPTION(rgw_lc_max_objs, OPT_INT)
+OPTION(rgw_lc_max_rules, OPT_U32)  // Max rules set on one bucket
+OPTION(rgw_lc_debug_interval, OPT_INT)  // Debug run interval, in seconds
+OPTION(rgw_script_uri, OPT_STR) // alternative value for SCRIPT_URI if not set in request
+OPTION(rgw_request_uri, OPT_STR) // alternative value for REQUEST_URI if not set in request
+OPTION(rgw_ignore_get_invalid_range, OPT_BOOL) // treat invalid (e.g., negative) range requests as full
+OPTION(rgw_swift_url, OPT_STR)             // the swift url, being published by the internal swift auth
+OPTION(rgw_swift_url_prefix, OPT_STR) // entry point for which a url is considered a swift url
+OPTION(rgw_swift_auth_url, OPT_STR)        // default URL to go and verify tokens for v1 auth (if not using internal swift auth)
+OPTION(rgw_swift_auth_entry, OPT_STR)  // entry point for which a url is considered a swift auth url
+OPTION(rgw_swift_tenant_name, OPT_STR)  // tenant name to use for swift access
+OPTION(rgw_swift_account_in_url, OPT_BOOL)  // assume that URL always contain the account (aka tenant) part
+OPTION(rgw_swift_enforce_content_length, OPT_BOOL)  // enforce generation of Content-Length even in cost of performance or scalability
+OPTION(rgw_keystone_url, OPT_STR)  // url for keystone server
+OPTION(rgw_keystone_admin_token, OPT_STR)  // keystone admin token (shared secret)
+OPTION(rgw_keystone_admin_token_path, OPT_STR)  // path to keystone admin token (shared secret)
+OPTION(rgw_keystone_admin_user, OPT_STR)  // keystone admin user name
+OPTION(rgw_keystone_admin_password, OPT_STR)  // keystone admin user password
+OPTION(rgw_keystone_admin_password_path, OPT_STR)  // path to keystone admin user password
+OPTION(rgw_keystone_admin_tenant, OPT_STR)  // keystone admin user tenant (for keystone v2.0)
+OPTION(rgw_keystone_admin_project, OPT_STR)  // keystone admin user project (for keystone v3)
+OPTION(rgw_keystone_admin_domain, OPT_STR)  // keystone admin user domain
+OPTION(rgw_keystone_barbican_user, OPT_STR)  // keystone user to access barbican secrets
+OPTION(rgw_keystone_barbican_password, OPT_STR)  // keystone password for barbican user
+OPTION(rgw_keystone_barbican_tenant, OPT_STR)  // keystone barbican user tenant (for keystone v2.0)
+OPTION(rgw_keystone_barbican_project, OPT_STR)  // keystone barbican user project (for keystone v3)
+OPTION(rgw_keystone_barbican_domain, OPT_STR)  // keystone barbican user domain
+OPTION(rgw_keystone_api_version, OPT_INT) // Version of Keystone API to use (2 or 3)
+OPTION(rgw_keystone_accepted_roles, OPT_STR)  // roles required to serve requests
+OPTION(rgw_keystone_accepted_admin_roles, OPT_STR) // list of roles allowing an user to gain admin privileges
+OPTION(rgw_keystone_token_cache_size, OPT_INT)  // max number of entries in keystone token cache
+OPTION(rgw_keystone_revocation_interval, OPT_INT)  // seconds between tokens revocation check
+OPTION(rgw_keystone_verify_ssl, OPT_BOOL) // should we try to verify keystone's ssl
+OPTION(rgw_cross_domain_policy, OPT_STR)
+OPTION(rgw_healthcheck_disabling_path, OPT_STR) // path that existence causes the healthcheck to respond 503
+OPTION(rgw_s3_auth_use_rados, OPT_BOOL)  // should we try to use the internal credentials for s3?
+OPTION(rgw_s3_auth_use_keystone, OPT_BOOL)  // should we try to use keystone for s3?
+OPTION(rgw_s3_auth_order, OPT_STR) // s3 authentication order to try
+OPTION(rgw_barbican_url, OPT_STR)  // url for barbican server
+OPTION(rgw_opa_url, OPT_STR)  // url for OPA server
+OPTION(rgw_opa_token, OPT_STR)  // Bearer token OPA uses to authenticate client requests
+OPTION(rgw_opa_verify_ssl, OPT_BOOL) // should we try to verify OPA's ssl
+OPTION(rgw_use_opa_authz, OPT_BOOL) // should we use OPA to authorize client requests?
+
+/* OpenLDAP-style LDAP parameter strings */
+/* rgw_ldap_uri  space-separated list of LDAP servers in URI format */
+OPTION(rgw_ldap_uri, OPT_STR)
+/* rgw_ldap_binddn  LDAP entry RGW will bind with (user match) */
+OPTION(rgw_ldap_binddn, OPT_STR)
+/* rgw_ldap_searchdn  LDAP search base (basedn) */
+OPTION(rgw_ldap_searchdn, OPT_STR)
+/* rgw_ldap_dnattr  LDAP attribute containing RGW user names (to form binddns)*/
+OPTION(rgw_ldap_dnattr, OPT_STR)
+/* rgw_ldap_secret  file containing credentials for rgw_ldap_binddn */
+OPTION(rgw_ldap_secret, OPT_STR)
+/* rgw_s3_auth_use_ldap  use LDAP for RGW auth? */
+OPTION(rgw_s3_auth_use_ldap, OPT_BOOL)
+/* rgw_ldap_searchfilter  LDAP search filter */
+OPTION(rgw_ldap_searchfilter, OPT_STR)
+
+OPTION(rgw_admin_entry, OPT_STR)  // entry point for which a url is considered an admin request
+OPTION(rgw_enforce_swift_acls, OPT_BOOL)
+OPTION(rgw_swift_token_expiration, OPT_INT) // time in seconds for swift token expiration
+OPTION(rgw_print_continue, OPT_BOOL)  // enable if 100-Continue works
+OPTION(rgw_print_prohibited_content_length, OPT_BOOL) // violate RFC 7230 and send Content-Length in 204 and 304
+OPTION(rgw_remote_addr_param, OPT_STR)  // e.g. X-Forwarded-For, if you have a reverse proxy
+OPTION(rgw_op_thread_timeout, OPT_INT)
+OPTION(rgw_op_thread_suicide_timeout, OPT_INT)
+OPTION(rgw_thread_pool_size, OPT_INT)
+OPTION(rgw_num_control_oids, OPT_INT)
+OPTION(rgw_verify_ssl, OPT_BOOL) // should http_client try to verify ssl when sent https request
+
+/* The following are tunables for caches of RGW NFS (and other file
+ * client) objects.
+ *
+ * The file handle cache is a partitioned hash table
+ * (fhcache_partitions), each with a closed hash part and backing
+ * b-tree mapping.  The number of partitions is expected to be a small
+ * prime, the cache size something larger but less than 5K, the total
+ * size of the cache is n_part * cache_size.
+ */
+OPTION(rgw_nfs_lru_lanes, OPT_INT)
+OPTION(rgw_nfs_lru_lane_hiwat, OPT_INT)
+OPTION(rgw_nfs_fhcache_partitions, OPT_INT)
+OPTION(rgw_nfs_fhcache_size, OPT_INT) /* 3*2017=6051 */
+OPTION(rgw_nfs_namespace_expire_secs, OPT_INT) /* namespace invalidate
+						     * timer */
+OPTION(rgw_nfs_max_gc, OPT_INT) /* max gc events per cycle */
+OPTION(rgw_nfs_write_completion_interval_s, OPT_INT) /* stateless (V3)
+							  * commit
+							  * delay */
+OPTION(rgw_nfs_s3_fast_attrs, OPT_BOOL) /* use fast S3 attrs from
+					 * bucket index--currently
+					 * assumes NFS mounts are
+					 * immutable */
+
+OPTION(rgw_zone, OPT_STR) // zone name
+OPTION(rgw_zone_root_pool, OPT_STR)    // pool where zone specific info is stored
+OPTION(rgw_default_zone_info_oid, OPT_STR)  // oid where default zone info is stored
+OPTION(rgw_region, OPT_STR) // region name
+OPTION(rgw_region_root_pool, OPT_STR)  // pool where all region info is stored
+OPTION(rgw_default_region_info_oid, OPT_STR)  // oid where default region info is stored
+OPTION(rgw_zonegroup, OPT_STR) // zone group name
+OPTION(rgw_zonegroup_root_pool, OPT_STR)  // pool where all zone group info is stored
+OPTION(rgw_default_zonegroup_info_oid, OPT_STR)  // oid where default zone group info is stored
+OPTION(rgw_realm, OPT_STR) // realm name
+OPTION(rgw_realm_root_pool, OPT_STR)  // pool where all realm info is stored
+OPTION(rgw_default_realm_info_oid, OPT_STR)  // oid where default realm info is stored
+OPTION(rgw_period_root_pool, OPT_STR)  // pool where all period info is stored
+OPTION(rgw_period_latest_epoch_info_oid, OPT_STR) // oid where current period info is stored
+OPTION(rgw_log_nonexistent_bucket, OPT_BOOL)
+OPTION(rgw_log_object_name, OPT_STR)      // man date to see codes (a subset are supported)
+OPTION(rgw_log_object_name_utc, OPT_BOOL)
+OPTION(rgw_usage_max_shards, OPT_INT)
+OPTION(rgw_usage_max_user_shards, OPT_INT)
+OPTION(rgw_enable_ops_log, OPT_BOOL) // enable logging every rgw operation
+OPTION(rgw_enable_usage_log, OPT_BOOL) // enable logging bandwidth usage
+OPTION(rgw_ops_log_rados, OPT_BOOL) // whether ops log should go to rados
+OPTION(rgw_ops_log_socket_path, OPT_STR) // path to unix domain socket where ops log can go
+OPTION(rgw_ops_log_data_backlog, OPT_INT) // max data backlog for ops log
+OPTION(rgw_fcgi_socket_backlog, OPT_INT) // socket  backlog for fcgi
+OPTION(rgw_usage_log_flush_threshold, OPT_INT) // threshold to flush pending log data
+OPTION(rgw_usage_log_tick_interval, OPT_INT) // flush pending log data every X seconds
+OPTION(rgw_init_timeout, OPT_INT) // time in seconds
+OPTION(rgw_mime_types_file, OPT_STR)
+OPTION(rgw_gc_max_objs, OPT_INT)
+OPTION(rgw_gc_obj_min_wait, OPT_INT)    // wait time before object may be handled by gc
+OPTION(rgw_gc_processor_max_time, OPT_INT)  // total run time for a single gc processor work
+OPTION(rgw_gc_processor_period, OPT_INT)  // gc processor cycle time
+OPTION(rgw_gc_max_concurrent_io, OPT_INT)  // max concurrent IO operations issued by the gc processor
+OPTION(rgw_gc_max_trim_chunk, OPT_INT)  // gc trim chunk size
+OPTION(rgw_s3_success_create_obj_status, OPT_INT) // alternative success status response for create-obj (0 - default)
+OPTION(rgw_resolve_cname, OPT_BOOL)  // should rgw try to resolve hostname as a dns cname record
+OPTION(rgw_obj_stripe_size, OPT_INT)
+OPTION(rgw_extended_http_attrs, OPT_STR) // list of extended attrs that can be set on objects (beyond the default)
+OPTION(rgw_exit_timeout_secs, OPT_INT) // how many seconds to wait for process to go down before exiting unconditionally
+OPTION(rgw_get_obj_window_size, OPT_INT) // window size in bytes for single get obj request
+OPTION(rgw_get_obj_max_req_size, OPT_INT) // max length of a single get obj rados op
+OPTION(rgw_relaxed_s3_bucket_names, OPT_BOOL) // enable relaxed bucket name rules for US region buckets
+OPTION(rgw_defer_to_bucket_acls, OPT_STR) // if the user has bucket perms, use those before key perms (recurse and full_control)
+OPTION(rgw_list_buckets_max_chunk, OPT_INT) // max buckets to retrieve in a single op when listing user buckets
+OPTION(rgw_md_log_max_shards, OPT_INT) // max shards for metadata log
+OPTION(rgw_curl_wait_timeout_ms, OPT_INT) // timeout for certain curl calls
+OPTION(rgw_curl_low_speed_limit, OPT_INT) // low speed limit for certain curl calls
+OPTION(rgw_curl_low_speed_time, OPT_INT) // low speed time for certain curl calls
+OPTION(rgw_copy_obj_progress, OPT_BOOL) // should dump progress during long copy operations?
+OPTION(rgw_copy_obj_progress_every_bytes, OPT_INT) // min bytes between copy progress output
+OPTION(rgw_sync_obj_etag_verify, OPT_BOOL) // verify if the copied object from remote is identical to source
+OPTION(rgw_obj_tombstone_cache_size, OPT_INT) // how many objects in tombstone cache, which is used in multi-zone sync to keep
+                                                    // track of removed objects' mtime
+
+OPTION(rgw_data_log_window, OPT_INT) // data log entries window (in seconds)
+OPTION(rgw_data_log_changes_size, OPT_INT) // number of in-memory entries to hold for data changes log
+OPTION(rgw_data_log_num_shards, OPT_INT) // number of objects to keep data changes log on
+OPTION(rgw_data_log_obj_prefix, OPT_STR) //
+
+OPTION(rgw_bucket_quota_ttl, OPT_INT) // time for cached bucket stats to be cached within rgw instance
+OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE) // threshold from which we don't rely on cached info for quota decisions
+OPTION(rgw_bucket_quota_cache_size, OPT_INT) // number of entries in bucket quota cache
+OPTION(rgw_bucket_default_quota_max_objects, OPT_INT) // number of objects allowed
+OPTION(rgw_bucket_default_quota_max_size, OPT_LONGLONG) // Max size of object in bytes
+
+OPTION(rgw_expose_bucket, OPT_BOOL) // Return the bucket name in the 'Bucket' response header
+
+OPTION(rgw_frontends, OPT_STR) // rgw front ends
+
+OPTION(rgw_user_quota_bucket_sync_interval, OPT_INT) // time period for accumulating modified buckets before syncing stats
+OPTION(rgw_user_quota_sync_interval, OPT_INT) // time period for accumulating modified buckets before syncing entire user stats
+OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL) // whether stats for idle users be fully synced
+OPTION(rgw_user_quota_sync_wait_time, OPT_INT) // min time between two full stats sync for non-idle users
+OPTION(rgw_user_default_quota_max_objects, OPT_INT) // number of objects allowed
+OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG) // Max size of object in bytes
+
+OPTION(rgw_multipart_min_part_size, OPT_INT) // min size for each part (except for last one) in multipart upload
+OPTION(rgw_multipart_part_upload_limit, OPT_INT) // parts limit in multipart upload
+
+OPTION(rgw_max_slo_entries, OPT_INT) // default number of max entries in slo
+
+OPTION(rgw_olh_pending_timeout_sec, OPT_INT) // time until we retire a pending olh change
+OPTION(rgw_user_max_buckets, OPT_INT) // global option to set max buckets count for all user
+
+OPTION(rgw_objexp_gc_interval, OPT_U32) // maximum time between round of expired objects garbage collecting
+OPTION(rgw_objexp_hints_num_shards, OPT_U32) // maximum number of parts in which the hint index is stored in
+OPTION(rgw_objexp_chunk_size, OPT_U32) // maximum number of entries in a single operation when processing objexp data
+
+OPTION(rgw_enable_static_website, OPT_BOOL) // enable static website feature
+OPTION(rgw_log_http_headers, OPT_STR) // list of HTTP headers to log when seen, ignores case (e.g., http_x_forwarded_for)
+
+OPTION(rgw_num_async_rados_threads, OPT_INT) // num of threads to use for async rados operations
+OPTION(rgw_md_notify_interval_msec, OPT_INT) // metadata changes notification interval to followers
+OPTION(rgw_run_sync_thread, OPT_BOOL) // whether radosgw (not radosgw-admin) spawns the sync thread
+OPTION(rgw_sync_lease_period, OPT_INT) // time in second for lease that rgw takes on a specific log (or log shard)
+OPTION(rgw_sync_log_trim_interval, OPT_INT) // time in seconds between attempts to trim sync logs
+
+OPTION(rgw_sync_data_inject_err_probability, OPT_DOUBLE) // range [0, 1]
+OPTION(rgw_sync_meta_inject_err_probability, OPT_DOUBLE) // range [0, 1]
+OPTION(rgw_sync_trace_history_size, OPT_INT) // max number of complete sync trace entries to keep
+OPTION(rgw_sync_trace_per_node_log_size, OPT_INT) // how many log entries to keep per node
+OPTION(rgw_sync_trace_servicemap_update_interval, OPT_INT) // interval in seconds between sync trace servicemap update
+
+
+OPTION(rgw_period_push_interval, OPT_DOUBLE) // seconds to wait before retrying "period push"
+OPTION(rgw_period_push_interval_max, OPT_DOUBLE) // maximum interval after exponential backoff
+
+OPTION(rgw_safe_max_objects_per_shard, OPT_INT) // safe max loading
+OPTION(rgw_shard_warning_threshold, OPT_DOUBLE) // pct of safe max
+						    // at which to warn
+
+OPTION(rgw_swift_versioning_enabled, OPT_BOOL) // whether swift object versioning feature is enabled
+
+OPTION(rgw_trust_forwarded_https, OPT_BOOL) // trust Forwarded and X-Forwarded-Proto headers for ssl termination
+OPTION(rgw_crypt_require_ssl, OPT_BOOL) // requests including encryption key headers must be sent over ssl
+OPTION(rgw_crypt_default_encryption_key, OPT_STR) // base64 encoded key for encryption of rgw objects
+OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR) // extra keys that may be used for aws:kms
+                                                      // defined as map "key1=YmluCmJvb3N0CmJvb3N0LQ== key2=b3V0CnNyYwpUZXN0aW5nCg=="
+OPTION(rgw_crypt_suppress_logs, OPT_BOOL)   // suppress logs that might print customer key
+OPTION(rgw_list_bucket_min_readahead, OPT_INT) // minimum number of entries to read from rados for bucket listing
+
+OPTION(rgw_rest_getusage_op_compat, OPT_BOOL) // dump description of total stats for s3 GetUsage API
+
+OPTION(throttler_perf_counter, OPT_BOOL) // enable/disable throttler perf counter
+
+/* The following are tunables for torrent data */
+OPTION(rgw_torrent_flag, OPT_BOOL)    // produce torrent function flag
+OPTION(rgw_torrent_tracker, OPT_STR)    // torrent field announce and announce list
+OPTION(rgw_torrent_createby, OPT_STR)    // torrent field created by
+OPTION(rgw_torrent_comment, OPT_STR)    // torrent field comment
+OPTION(rgw_torrent_encoding, OPT_STR)    // torrent field encoding
+OPTION(rgw_torrent_origin, OPT_STR)    // torrent origin
+OPTION(rgw_torrent_sha_unit, OPT_INT)    // torrent field piece length 512K
+
+OPTION(event_tracing, OPT_BOOL) // true if LTTng-UST tracepoints should be enabled
+
+OPTION(debug_deliberately_leak_memory, OPT_BOOL)
+OPTION(debug_asok_assert_abort, OPT_BOOL)
+
+OPTION(rgw_swift_custom_header, OPT_STR) // option to enable swift custom headers
+
+OPTION(rgw_swift_need_stats, OPT_BOOL) // option to enable stats on bucket listing for swift
+
+OPTION(rgw_acl_grants_max_num, OPT_INT) // According to AWS S3 (http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html), an ACL can have up to 100 grants.
+OPTION(rgw_cors_rules_max_num, OPT_INT) // According to AWS S3 (http://docs.aws.amazon.com/AmazonS3/latest/dev/cors.html), a CORS configuration can have up to 100 rules.
+OPTION(rgw_delete_multi_obj_max_num, OPT_INT) // According to AWS S3 (https://docs.aws.amazon.com/AmazonS3/latest/dev/DeletingObjects.html), the Multi-Object Delete API can delete up to 1000 objects in a single HTTP request.
+OPTION(rgw_website_routing_rules_max_num, OPT_INT) // According to AWS S3, a website routing configuration can have up to 50 rules.
+OPTION(rgw_sts_entry, OPT_STR)
+OPTION(rgw_sts_key, OPT_STR)
+OPTION(rgw_s3_auth_use_sts, OPT_BOOL)  // should we try to use sts for s3?
+OPTION(rgw_sts_max_session_duration, OPT_U64) // Max duration in seconds for which the session token is valid.
+OPTION(fake_statfs_for_testing, OPT_INT) // Set a value for kb and compute kb_used from total of num_bytes
+OPTION(rgw_sts_token_introspection_url, OPT_STR)  // url for introspecting web tokens
+OPTION(rgw_sts_client_id, OPT_STR) // Client Id
+OPTION(rgw_sts_client_secret, OPT_STR) // Client Secret
+OPTION(debug_allow_any_pool_priority, OPT_BOOL)
diff --git a/src/common/likely.h b/src/common/likely.h
new file mode 100644
index 00000000..abaf2d2e
--- /dev/null
+++ b/src/common/likely.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIKELY_DOT_H
+#define CEPH_LIKELY_DOT_H
+
+/* Branch hints.  likely/unlikely pass !!(x) because __builtin_expect takes a long: a raw
+ * pointer would not compile and a fractional double would truncate to 0.  They yield 0 or 1;
+ * expect() forwards x unchanged. */
+#ifndef likely
+#define likely(x)       __builtin_expect(!!(x),1)
+#endif
+#ifndef unlikely
+#define unlikely(x)     __builtin_expect(!!(x),0)
+#endif
+#ifndef expect
+#define expect(x, hint) __builtin_expect((x),(hint))
+#endif
+
+#endif
diff --git a/src/common/linux_version.c b/src/common/linux_version.c
new file mode 100644
index 00000000..b83dc71e
--- /dev/null
+++ b/src/common/linux_version.c
@@ -0,0 +1,25 @@
+#include "common/linux_version.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/utsname.h>
+
+/* Running kernel version as KERNEL_VERSION(a, b, c); 0 if not Linux or release is unparsable. */
+int get_linux_version(void)
+{
+	struct utsname ubuf;
+	int a = 0, b = 0, c = 0;	/* c stays 0 when the release is only "a.b" */
+
+	if (uname(&ubuf) || strcmp(ubuf.sysname, "Linux"))
+		return 0;
+
+	/* at least major.minor must parse, otherwise the version is unknown */
+	if (sscanf(ubuf.release, "%d.%d.%d", &a, &b, &c) < 2)
+		return 0;
+
+	/* The sublevel has one byte in the encoding: clamp it as newer kernel
+	 * headers do, so e.g. 4.14.300 does not carry into the minor field. */
+	if (c > 255)
+		c = 255;
+	return KERNEL_VERSION(a, b, c);
+}
diff --git a/src/common/linux_version.h b/src/common/linux_version.h
new file mode 100644
index 00000000..5588c55b
--- /dev/null
+++ b/src/common/linux_version.h
@@ -0,0 +1,22 @@
+#ifndef CEPH_LINUX_VERSION_H
+#define CEPH_LINUX_VERSION_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef HAVE_LINUX_VERSION_H
+# include <linux/version.h>
+#endif
+/* Fallback when <linux/version.h> did not provide it; c is clamped to one byte so it cannot carry into b. */
+#ifndef KERNEL_VERSION
+# define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c)))
+#endif
+/* Running kernel version as a KERNEL_VERSION() code; 0 if not Linux or the release string cannot be parsed. */
+int get_linux_version(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LINUX_VERSION_H */
diff --git a/src/common/lockdep.cc b/src/common/lockdep.cc
new file mode 100644
index 00000000..cd87adce
--- /dev/null
+++ b/src/common/lockdep.cc
@@ -0,0 +1,403 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "lockdep.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/valgrind.h"
+
+/******* Constants **********/
+#define lockdep_dout(v) lsubdout(g_lockdep_ceph_ctx, lockdep, v)
+#define MAX_LOCKS  4096   // increase me as needed
+#define BACKTRACE_SKIP 2
+
+/******* Globals **********/
+bool g_lockdep;  // lockdep on/off: set by lockdep_register_ceph_context(), cleared on unregister
+struct lockdep_stopper_t {
+  // disable lockdep when this module destructs.
+  ~lockdep_stopper_t() {
+    g_lockdep = 0;
+  }
+};
+static pthread_mutex_t lockdep_mutex = PTHREAD_MUTEX_INITIALIZER;  // taken by the entry points below while they touch this state
+static CephContext *g_lockdep_ceph_ctx = NULL;  // context used for logging/config; NULL while unregistered
+static lockdep_stopper_t lockdep_stopper;
+static ceph::unordered_map<std::string, int> lock_ids;  // lock name -> id
+static map<int, std::string> lock_names;  // id -> lock name
+static map<int, int> lock_refs;  // id -> registrations still outstanding
+static char free_ids[MAX_LOCKS/8]; // bit set = free
+static ceph::unordered_map<pthread_t, map<int,BackTrace*> > held;  // thread -> (held lock id -> backtrace at acquisition, or NULL)
+static char follows[MAX_LOCKS][MAX_LOCKS/8]; // follows[a][b] means b taken after a
+static BackTrace *follows_bt[MAX_LOCKS][MAX_LOCKS];  // backtrace saved when follows[a][b] was set, if one was taken
+unsigned current_maxid;  // one past the highest id handed out so far
+int last_freed_id = -1;  // id released most recently (tried first on allocation); -1 if none
+static bool free_ids_inited;  // free_ids has been filled with 1-bits
+
+static bool lockdep_force_backtrace()
+{ // config can force backtraces, but only while a CephContext is registered
+  if (g_lockdep_ceph_ctx == NULL) return false;
+  return g_lockdep_ceph_ctx->_conf->lockdep_force_backtrace;
+}
+
+/******* Functions **********/
+void lockdep_register_ceph_context(CephContext *cct)
+{ // Turn lockdep on with cct as its logging/config context; no-op if a context is already registered.
+  static_assert((MAX_LOCKS > 0) && (MAX_LOCKS % 8 == 0), // free_ids/follows pack 8 ids per char
+    "lockdep's MAX_LOCKS needs to be divisible by 8 to operate correctly.");
+  pthread_mutex_lock(&lockdep_mutex);
+  if (g_lockdep_ceph_ctx == NULL) { // first registrant wins; later callers leave it in place
+    ANNOTATE_BENIGN_RACE_SIZED(&g_lockdep_ceph_ctx, sizeof(g_lockdep_ceph_ctx),
+                               "lockdep cct");
+    ANNOTATE_BENIGN_RACE_SIZED(&g_lockdep, sizeof(g_lockdep),
+                               "lockdep enabled");
+    g_lockdep = true;
+    g_lockdep_ceph_ctx = cct;
+    lockdep_dout(1) << "lockdep start" << dendl;
+    if (!free_ids_inited) { // only on the very first start; the flag is never reset in this file
+      free_ids_inited = true;
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset((void*) &free_ids[0], 255, sizeof(free_ids)); // all bits set: every id starts out free
+    }
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+}
+
+void lockdep_unregister_ceph_context(CephContext *cct)
+{ // Disable lockdep and drop its graph, but only if cct is the registered context.
+  pthread_mutex_lock(&lockdep_mutex);
+  if (cct == g_lockdep_ceph_ctx) {
+    lockdep_dout(1) << "lockdep stop" << dendl;
+    // this cct is going away; shut it down!
+    g_lockdep = false;
+    g_lockdep_ceph_ctx = NULL;
+
+    // blow away all of our state, too, in case it starts up again.
+    for (unsigned i = 0; i < current_maxid; ++i)
+      for (unsigned j = 0; j < current_maxid; ++j)
+        delete follows_bt[i][j];
+    for (auto& per_thread : held)           // held owns its backtraces too;
+      for (auto& lock : per_thread.second)  // clear() alone would leak them
+        delete lock.second;
+    held.clear();
+    lock_names.clear();
+    lock_ids.clear();
+    // FIPS zeroization audit 20191115: these memsets are not security related.
+    memset((void*)&follows[0][0], 0, current_maxid * MAX_LOCKS/8);
+    memset((void*)&follows_bt[0][0], 0, sizeof(BackTrace*) * current_maxid * MAX_LOCKS);
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+}
+
+int lockdep_dump_locks()
+{
+  // Log (at level 0) every lock each thread currently holds, together with
+  // the backtrace captured at acquisition when there is one.  Always
+  // returns 0; nothing is printed while lockdep is disabled.
+  pthread_mutex_lock(&lockdep_mutex);
+  if (g_lockdep) {
+    for (auto& thread_entry : held) {
+      lockdep_dout(0) << "--- thread " << thread_entry.first << " ---" << dendl;
+      for (auto& lock_entry : thread_entry.second) {
+        const int lock_id = lock_entry.first;
+        BackTrace *where = lock_entry.second;
+        lockdep_dout(0) << "  * " << lock_names[lock_id] << "\n";
+        if (where != NULL) {
+          *_dout << *where;
+        }
+        *_dout << dendl;
+      }
+    }
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+  return 0;
+}
+
+int lockdep_get_free_id(void)
+{
+  // Hand out an unused lock id, marking it taken in free_ids; returns -1
+  // when no bit in free_ids is set any more.
+  // fast path: the id released most recently, provided it is still free
+  const int recent = last_freed_id;
+  if (recent >= 0) {
+    const int mask = 1 << (recent % 8);
+    if (free_ids[recent / 8] & mask) {
+      last_freed_id = -1;
+      free_ids[recent / 8] &= 255 - mask;
+      lockdep_dout(1) << "lockdep reusing last freed id " << recent << dendl;
+      return recent;
+    }
+  }
+
+  // slow path: lowest-numbered id whose bit is still set
+  for (int id = 0; id < MAX_LOCKS; ++id) {
+    const int mask = 1 << (id % 8);
+    if (!(free_ids[id / 8] & mask))
+      continue;
+    free_ids[id / 8] &= 255 - mask;
+    lockdep_dout(1) << "lockdep using id " << id << dendl;
+    return id;
+  }
+
+  // not found
+  lockdep_dout(0) << "failing miserably..." << dendl;
+  return -1;
+}
+
+static int _lockdep_register(const char *name)
+{
+  // Map `name` to a lock id (allocating one on first sight) and bump its
+  // reference count.  Every caller in this file holds lockdep_mutex.
+  // Returns -1 while lockdep is disabled; aborts when no id is left.
+  if (!g_lockdep)
+    return -1;
+
+  int id;
+  auto known = lock_ids.find(name);
+  if (known != lock_ids.end()) {
+    id = known->second;
+    lockdep_dout(20) << "had '" << name << "' as " << id << dendl;
+  } else {
+    id = lockdep_get_free_id();
+    if (id < 0) {
+      lockdep_dout(0) << "ERROR OUT OF IDS .. have 0"
+                      << " max " << MAX_LOCKS << dendl;
+      for (auto& entry : lock_names) {
+        lockdep_dout(0) << "  lock " << entry.first << " " << entry.second << dendl;
+      }
+      ceph_abort();
+    }
+    if ((unsigned)id >= current_maxid)
+      current_maxid = (unsigned)id + 1;
+    lock_ids[name] = id;
+    lock_names[id] = name;
+    lockdep_dout(10) << "registered '" << name << "' as " << id << dendl;
+  }
+  ++lock_refs[id];
+  return id;
+}
+
+int lockdep_register(const char *name)
+{
+  // Locked wrapper around _lockdep_register(): returns the id assigned to
+  // `name`, or -1 while lockdep is disabled.
+  pthread_mutex_lock(&lockdep_mutex);
+  const int assigned = _lockdep_register(name);
+  pthread_mutex_unlock(&lockdep_mutex);
+  return assigned;
+}
+// Drop one reference on lock id; the last reference clears its graph edges and frees the id.
+void lockdep_unregister(int id)
+{
+  if (id < 0) { // negative: no id was ever assigned (e.g. -1 from a disabled lockdep)
+    return;
+  }
+
+  pthread_mutex_lock(&lockdep_mutex);
+  // keep the name for logging; it is absent if lock_names was cleared meanwhile
+  std::string name;
+  map<int, std::string>::iterator p = lock_names.find(id);
+  if (p == lock_names.end())
+    name = "unknown" ;
+  else
+    name = p->second;
+
+  int &refs = lock_refs[id];
+  if (--refs == 0) {
+    if (p != lock_names.end()) { // name still known: the graph may hold edges for this id
+      // reset dependency ordering
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset((void*)&follows[id][0], 0, MAX_LOCKS/8);
+      for (unsigned i=0; i<current_maxid; ++i) {
+        delete follows_bt[id][i]; // edges id -> i
+        follows_bt[id][i] = NULL;
+
+        delete follows_bt[i][id]; // edges i -> id
+        follows_bt[i][id] = NULL;
+        follows[i][id / 8] &= 255 - (1 << (id % 8));
+      }
+
+      lockdep_dout(10) << "unregistered '" << name << "' from " << id << dendl;
+      lock_ids.erase(p->second);
+      lock_names.erase(id);
+    }
+    lock_refs.erase(id);
+    free_ids[id/8] |= (1 << (id % 8)); // bit set = free again
+    last_freed_id = id; // lockdep_get_free_id() tries this id first
+  } else if (g_lockdep) { // only log while enabled: lockdep_dout goes through g_lockdep_ceph_ctx
+    lockdep_dout(20) << "have " << refs << " of '" << name << "' " <<
+			"from " << id << dendl;
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+}
+
+
+// does b follow a, directly or through other locks?  logs the path it finds at level 0.
+static bool does_follow(int a, int b)
+{
+  if (follows[a][b/8] & (1 << (b % 8))) { // direct edge a -> b
+    lockdep_dout(0) << "\n";
+    *_dout << "------------------------------------" << "\n";
+    *_dout << "existing dependency " << lock_names[a] << " (" << a << ") -> "
+           << lock_names[b] << " (" << b << ") at:\n";
+    if (follows_bt[a][b]) {
+      follows_bt[a][b]->print(*_dout);
+    }
+    *_dout << dendl;
+    return true;
+  }
+  // otherwise recurse through every lock i recorded as taken after a
+  for (unsigned i=0; i<current_maxid; i++) {
+    if ((follows[a][i/8] & (1 << (i % 8))) &&
+	does_follow(i, b)) {
+      lockdep_dout(0) << "existing intermediate dependency " << lock_names[a]
+          << " (" << a << ") -> " << lock_names[i] << " (" << i << ") at:\n";
+      if (follows_bt[a][i]) {
+        follows_bt[a][i]->print(*_dout);
+      }
+      *_dout << dendl;
+      return true;
+    }
+  }
+  // (the recursion relies on the graph being acyclic; lockdep_will_lock aborts instead of adding a cycle)
+  return false;
+}
+// Check that taking lock id now is consistent with what this thread already holds; aborts if not.
+int lockdep_will_lock(const char *name, int id, bool force_backtrace,
+		      bool recursive)
+{
+  pthread_t p = pthread_self();
+  // returns the id to use from now on (registered here when id < 0); id is returned untouched while lockdep is off
+  pthread_mutex_lock(&lockdep_mutex);
+  if (!g_lockdep) {
+    pthread_mutex_unlock(&lockdep_mutex);
+    return id;
+  }
+
+  if (id < 0)
+    id = _lockdep_register(name);
+
+  lockdep_dout(20) << "_will_lock " << name << " (" << id << ")" << dendl;
+
+  // check dependency graph: compare id against every lock this thread holds
+  map<int, BackTrace *> &m = held[p];
+  for (map<int, BackTrace *>::iterator p = m.begin(); // NB: shadows the pthread_t p above
+       p != m.end();
+       ++p) {
+    if (p->first == id) { // this thread already holds the very same lock
+      if (!recursive) {
+	lockdep_dout(0) << "\n";
+	*_dout << "recursive lock of " << name << " (" << id << ")\n";
+	BackTrace *bt = new BackTrace(BACKTRACE_SKIP);
+	bt->print(*_dout);
+	if (p->second) {
+	  *_dout << "\npreviously locked at\n";
+	  p->second->print(*_dout);
+	}
+	delete bt;
+	*_dout << dendl;
+	ceph_abort();
+      }
+    }
+    else if (!(follows[p->first][id/8] & (1 << (id % 8)))) {
+      // new dependency
+      // (the ordering "p->first held, then id taken" has not been recorded yet)
+      // did we just create a cycle?
+      if (does_follow(id, p->first)) {
+        BackTrace *bt = new BackTrace(BACKTRACE_SKIP);
+	lockdep_dout(0) << "new dependency " << lock_names[p->first]
+		<< " (" << p->first << ") -> " << name << " (" << id << ")"
+		<< " creates a cycle at\n";
+	bt->print(*_dout);
+	*_dout << dendl;
+
+	lockdep_dout(0) << "btw, i am holding these locks:" << dendl;
+	for (map<int, BackTrace *>::iterator q = m.begin();
+	     q != m.end();
+	     ++q) {
+	  lockdep_dout(0) << "  " << lock_names[q->first] << " (" << q->first << ")" << dendl;
+	  if (q->second) {
+	    lockdep_dout(0) << " ";
+	    q->second->print(*_dout);
+	    *_dout << dendl;
+	  }
+	}
+
+	lockdep_dout(0) << "\n" << dendl;
+
+	// don't add this dependency, or we'll get a cycle in the graph, and
+	// does_follow() won't terminate.
+
+	ceph_abort();  // actually, we should just die here.
+      } else {
+        BackTrace *bt = NULL;
+        if (force_backtrace || lockdep_force_backtrace()) {
+          bt = new BackTrace(BACKTRACE_SKIP);
+        }
+        follows[p->first][id/8] |= 1 << (id % 8);
+        follows_bt[p->first][id] = bt;
+	lockdep_dout(10) << lock_names[p->first] << " -> " << name << " at" << dendl;
+	//bt->print(*_dout);
+      }
+    }
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+  return id;
+}
+
+int lockdep_locked(const char *name, int id, bool force_backtrace)
+{
+  // Record that the calling thread now holds lock `id` (registering `name`
+  // first when id < 0).  Returns the id in use; no-op while lockdep is off.
+  pthread_t p = pthread_self();
+  pthread_mutex_lock(&lockdep_mutex);
+  if (g_lockdep) {
+    if (id < 0)
+      id = _lockdep_register(name);
+    lockdep_dout(20) << "_locked " << name << dendl;
+    // a recursive re-lock reuses the slot: free the backtrace it replaces
+    BackTrace *&slot = held[p][id];
+    delete slot;
+    slot = (force_backtrace || lockdep_force_backtrace()) ?
+      new BackTrace(BACKTRACE_SKIP) : NULL;
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+  return id;
+}
+
+int lockdep_will_unlock(const char *name, int id)
+{
+  // Forget that the calling thread holds lock `id`, freeing the backtrace
+  // saved for it.  Returns id unchanged.
+  if (id < 0) {
+    // no id was ever assigned; -1 is the only negative value expected
+    ceph_assert(id == -1);
+    return id;
+  }
+
+  const pthread_t self = pthread_self();
+  pthread_mutex_lock(&lockdep_mutex);
+  if (g_lockdep) {
+    lockdep_dout(20) << "_will_unlock " << name << dendl;
+
+    // No assertion that the lock is recorded as held: lockdep may be
+    // enabled at any point in time, i.e. after this lock was taken.  An
+    // unknown thread/id simply yields a NULL entry to delete and erase.
+    map<int, BackTrace*> &mine = held[self];
+    delete mine[id];
+    mine.erase(id);
+  }
+  pthread_mutex_unlock(&lockdep_mutex);
+  return id;
+}
+
+
diff --git a/src/common/lockdep.h b/src/common/lockdep.h
new file mode 100644
index 00000000..1c854a9e
--- /dev/null
+++ b/src/common/lockdep.h
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2008-2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LOCKDEP_H
+#define CEPH_LOCKDEP_H
+
+class CephContext;
+
+extern bool g_lockdep;
+
+extern void lockdep_register_ceph_context(CephContext *cct);
+extern void lockdep_unregister_ceph_context(CephContext *cct);
+extern int lockdep_register(const char *n);
+extern void lockdep_unregister(int id);
+extern int lockdep_will_lock(const char *n, int id, bool force_backtrace=false,
+			     bool recursive=false);
+extern int lockdep_locked(const char *n, int id, bool force_backtrace=false);
+extern int lockdep_will_unlock(const char *n, int id);
+extern int lockdep_dump_locks();
+
+#endif
diff --git a/src/common/lru_map.h b/src/common/lru_map.h
new file mode 100644
index 00000000..4c1c2dad
--- /dev/null
+++ b/src/common/lru_map.h
@@ -0,0 +1,132 @@
+#ifndef CEPH_LRU_MAP_H
+#define CEPH_LRU_MAP_H
+
+#include "common/ceph_mutex.h"
+// Map holding at most `max` entries: add() evicts the least recently found/added keys beyond that.
+template <class K, class V>
+class lru_map {
+  struct entry {
+    V value;
+    typename std::list<K>::iterator lru_iter; // this key's node in entries_lru
+  };
+
+  std::map<K, entry> entries;
+  std::list<K> entries_lru; // keys, most recently used at the front
+
+  ceph::mutex lock = ceph::make_mutex("lru_map::lock"); // taken by find/find_and_update/add/erase
+
+  size_t max; // capacity enforced by _add()
+
+public:
+  class UpdateContext {
+    public:
+      virtual ~UpdateContext() {}
+
+      /* update should return true if object is updated */
+      virtual bool update(V *v) = 0;
+  };
+  // unlocked workers: they do not take `lock` themselves (the public wrappers do)
+  bool _find(const K& key, V *value, UpdateContext *ctx);
+  void _add(const K& key, V& value);
+
+public:
+  lru_map(int _max) : max(_max) {} // NOTE(review): int -> size_t, a negative _max becomes a huge capacity
+  virtual ~lru_map() {}
+
+  bool find(const K& key, V& value); // copies the value out; a hit marks the key most recently used
+
+  /*
+   * find_and_update()
+   *
+   * - will return true if object is found
+   * - if ctx is set will return true if object is found and updated
+   */
+  bool find_and_update(const K& key, V *value, UpdateContext *ctx);
+  void add(const K& key, V& value); // insert or overwrite; may evict older keys
+  void erase(const K& key); // no-op if key is absent
+};
+
+template <class K, class V>
+bool lru_map<K, V>::_find(const K& key, V *value, UpdateContext *ctx)
+{
+  // Look up `key` and mark it most recently used.  Does not take `lock`.
+  //   value - if non-NULL, receives a copy of the (possibly updated) value
+  //   ctx   - if non-NULL, ctx->update() is run on the stored value
+  // Returns false if the key is absent; otherwise true, or the result of
+  // ctx->update() when ctx is given.
+  typename std::map<K, entry>::iterator iter = entries.find(key);
+  if (iter == entries.end()) {
+    return false;
+  }
+  entry& e = iter->second;
+
+  // Relink the existing node at the front instead of erase + push_front:
+  // no key copy or allocation, and lru_iter stays valid throughout, so a
+  // throwing update()/assignment can no longer leave it dangling.
+  entries_lru.splice(entries_lru.begin(), entries_lru, e.lru_iter);
+
+  const bool r = ctx ? ctx->update(&e.value) : true;
+  if (value)
+    *value = e.value;
+  return r;
+}
+
+template <class K, class V>
+bool lru_map<K, V>::find(const K& key, V& value)
+{ // locked lookup: copies the value out, no update callback
+  std::lock_guard guard(lock);
+  return _find(key, &value, nullptr);
+}
+
+template <class K, class V>
+bool lru_map<K, V>::find_and_update(const K& key, V *value, UpdateContext *ctx)
+{ // locked _find(); value and ctx are forwarded unchanged
+  std::lock_guard guard(lock);
+  return _find(key, value, ctx);
+}
+
+template <class K, class V>
+void lru_map<K, V>::_add(const K& key, V& value)
+{
+  // Insert or overwrite `key`, make it the most recently used entry, then
+  // evict from the cold end until at most `max` entries remain.
+  auto existing = entries.find(key);
+  if (existing != entries.end())
+    entries_lru.erase(existing->second.lru_iter);
+
+  entries_lru.push_front(key);
+  entry& slot = entries[key];
+  slot.value = value;
+  slot.lru_iter = entries_lru.begin();
+
+  while (entries.size() > max) {
+    // the back of entries_lru is the least recently used key
+    auto victim = entries.find(entries_lru.back());
+    // ceph_assert(victim != entries.end());
+    entries.erase(victim);
+    entries_lru.pop_back();
+  }
+}
+
+
+template <class K, class V>
+void lru_map<K, V>::add(const K& key, V& value)
+{ // locked _add()
+  std::lock_guard guard(lock);
+  _add(key, value);
+}
+
+template <class K, class V>
+void lru_map<K, V>::erase(const K& key)
+{
+  // Remove `key` from both the map and the LRU list; absent keys are a
+  // no-op.  Takes `lock`.
+  std::lock_guard guard(lock);
+  auto found = entries.find(key);
+  if (found != entries.end()) {
+    entries_lru.erase(found->second.lru_iter);
+    entries.erase(found);
+  }
+}
+
+#endif
diff --git a/src/common/mClockPriorityQueue.h b/src/common/mClockPriorityQueue.h
new file mode 100644
index 00000000..ae425920
--- /dev/null
+++ b/src/common/mClockPriorityQueue.h
@@ -0,0 +1,365 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+
+#include <functional>
+#include <map>
+#include <list>
+#include <cmath>
+
+#include "common/Formatter.h"
+#include "common/OpQueue.h"
+
+#include "dmclock/src/dmclock_server.h"
+
+// the following is done to unclobber _ASSERT_H so it returns to the
+// way ceph likes it
+#include "include/ceph_assert.h"
+
+
+namespace ceph {
+
+  namespace dmc = crimson::dmclock;
+
+  template <typename T, typename K>
+  class mClockQueue : public OpQueue <T, K> {
+
+    using priority_t = unsigned;
+    using cost_t = unsigned;
+
+    typedef std::list<std::pair<cost_t, T> > ListPairs;
+
+    static void filter_list_pairs(ListPairs *l,
+                                  std::function<bool (T&&)> f) {
+      // Offer every item to f, walking from the back of the list to the
+      // front, and drop the ones for which f returns true.
+      auto boundary = l->end();   // everything from here on was kept
+      while (boundary != l->begin()) {
+        auto candidate = std::prev(boundary);
+        if (!f(std::move(candidate->second))) {
+          boundary = candidate;
+          continue;
+        }
+        // erasing a list node leaves `boundary` valid
+        l->erase(candidate);
+      }
+    }
+
+    struct SubQueue {
+    private:
+      typedef std::map<K, ListPairs> Classes;
+      // client-class to ordered queue
+      Classes q;
+
+      unsigned tokens, max_tokens;
+
+      typename Classes::iterator cur;
+
+    public:
+
+      SubQueue(const SubQueue &other)
+	: q(other.q),
+	  tokens(other.tokens),
+	  max_tokens(other.max_tokens),
+	  cur(q.begin()) {}
+
+      SubQueue()
+	: tokens(0),
+	  max_tokens(0),
+	  cur(q.begin()) {}
+
+      void set_max_tokens(unsigned mt) {
+	max_tokens = mt;
+      }
+
+      unsigned get_max_tokens() const {
+	return max_tokens;
+      }
+
+      unsigned num_tokens() const {
+	return tokens;
+      }
+
+      void put_tokens(unsigned t) {
+	tokens += t;
+	if (tokens > max_tokens) {
+	  tokens = max_tokens;
+	}
+      }
+
+      void take_tokens(unsigned t) {
+	if (tokens > t) {
+	  tokens -= t;
+	} else {
+	  tokens = 0;
+	}
+      }
+
+      void enqueue(K cl, cost_t cost, T&& item) {
+	q[cl].emplace_back(cost, std::move(item));
+	if (cur == q.end())
+	  cur = q.begin();
+      }
+
+      void enqueue_front(K cl, cost_t cost, T&& item) {
+	q[cl].emplace_front(cost, std::move(item));
+	if (cur == q.end())
+	  cur = q.begin();
+      }
+
+      const std::pair<cost_t, T>& front() const {
+	ceph_assert(!(q.empty()));
+	ceph_assert(cur != q.end());
+	return cur->second.front();
+      }
+
+      std::pair<cost_t, T>& front() {
+	ceph_assert(!(q.empty()));
+	ceph_assert(cur != q.end());
+	return cur->second.front();
+      }
+
+      void pop_front() { // drop the front item of the current class, then move cur on to the next class
+	ceph_assert(!(q.empty()));
+	ceph_assert(cur != q.end());
+	cur->second.pop_front();
+	if (cur->second.empty()) { // class drained: step past it first, then erase it
+	  auto i = cur;
+	  ++cur;
+	  q.erase(i);
+	} else {
+	  ++cur;
+	}
+	if (cur == q.end()) { // wrap around to the first class
+	  cur = q.begin();
+	}
+      }
+
+      unsigned get_size_slow() const {
+	unsigned count = 0;
+	for (const auto& cls : q) {
+	  count += cls.second.size();
+	}
+	return count;
+      }
+
+      bool empty() const {
+	return q.empty();
+      }
+
+      void remove_by_filter(std::function<bool (T&&)> f) {
+	for (typename Classes::iterator i = q.begin();
+	     i != q.end();
+	     /* no-inc */) {
+	  filter_list_pairs(&(i->second), f);
+	  if (i->second.empty()) {
+	    if (cur == i) {
+	      ++cur;
+	    }
+	    i = q.erase(i);
+	  } else {
+	    ++i;
+	  }
+	}
+	if (cur == q.end()) cur = q.begin();
+      }
+
+      void remove_by_class(K k, std::list<T> *out) {
+        // Drop all items of class k, prepending them (in queue order) to out if given.
+        auto doomed = q.find(k);
+        if (doomed != q.end()) {
+          if (cur == doomed)
+            ++cur;              // step off the class before it disappears
+          if (out != nullptr) {
+            ListPairs &items = doomed->second;
+            for (auto it = items.rbegin(); it != items.rend(); ++it)
+              out->push_front(std::move(it->second));
+          }
+          q.erase(doomed);
+          if (cur == q.end())
+            cur = q.begin();
+        }
+      }
+
+      void dump(ceph::Formatter *f) const {
+	f->dump_int("size", get_size_slow());
+	f->dump_int("num_keys", q.size());
+      }
+    };
+
+    using SubQueues = std::map<priority_t, SubQueue>;
+
+    SubQueues high_queue;
+
+    using Queue = dmc::PullPriorityQueue<K,T,false>;
+    Queue queue;
+
+    // when enqueue_front is called, rather than try to re-calc tags
+    // to put in mClock priority queue, we'll just keep a separate
+    // list from which we dequeue items first, and only when it's
+    // empty do we use queue.
+    std::list<std::pair<K,T>> queue_front;
+
+  public:
+
+    mClockQueue(
+      const typename Queue::ClientInfoFunc& info_func,
+      double anticipation_timeout = 0.0) :
+      queue(info_func, dmc::AtLimit::Allow, anticipation_timeout)
+    {
+      // empty
+    }
+
+    unsigned get_size_slow() const { // items in queue_front + mClock queue + every strict sub-queue
+      unsigned total = queue_front.size() + queue.request_count();
+      auto i = high_queue.cbegin();
+      while (i != high_queue.cend()) {
+        // a strict sub-queue is expected to be non-empty
+        ceph_assert(i->second.get_size_slow());
+        total += i->second.get_size_slow();
+        ++i;
+      }
+      return total;
+    }
+
+    // be sure to do things in reverse priority order and push_front
+    // to the list so items end up on list in front-to-back priority
+    // order
+    void remove_by_filter(std::function<bool (T&&)> filter_accum) {
+      queue.remove_by_req_filter([&] (std::unique_ptr<T>&& r) {
+          return filter_accum(std::move(*r));
+        }, true);
+
+      for (auto i = queue_front.rbegin(); i != queue_front.rend(); /* no-inc */) {
+	if (filter_accum(std::move(i->second))) {
+	  i = decltype(i){ queue_front.erase(std::next(i).base()) };
+	} else {
+	  ++i;
+	}
+      }
+
+      for (typename SubQueues::iterator i = high_queue.begin();
+	   i != high_queue.end();
+	   /* no-inc */ ) {
+	i->second.remove_by_filter(filter_accum);
+	if (i->second.empty()) {
+	  i = high_queue.erase(i);
+	} else {
+	  ++i;
+	}
+      }
+    }
+
+    void remove_by_class(K k, std::list<T> *out = nullptr) override final {
+      if (out) {
+	queue.remove_by_client(k,
+			       true,
+			       [&out] (std::unique_ptr<T>&& t) {
+				 out->push_front(std::move(*t));
+			       });
+      } else {
+	queue.remove_by_client(k, true);
+      }
+
+      for (auto i = queue_front.rbegin(); i != queue_front.rend(); /* no-inc */) {
+	if (k == i->first) {
+	  if (nullptr != out) out->push_front(std::move(i->second));
+	  i = decltype(i){ queue_front.erase(std::next(i).base()) };
+	} else {
+	  ++i;
+	}
+      }
+
+      for (auto i = high_queue.begin(); i != high_queue.end(); /* no-inc */) {
+	i->second.remove_by_class(k, out);
+	if (i->second.empty()) {
+	  i = high_queue.erase(i);
+	} else {
+	  ++i;
+	}
+      }
+    }
+
+    void enqueue_strict(K cl, unsigned priority, T&& item) override final {
+      high_queue[priority].enqueue(cl, 1, std::move(item));
+    }
+
+    void enqueue_strict_front(K cl, unsigned priority, T&& item) override final {
+      high_queue[priority].enqueue_front(cl, 1, std::move(item));
+    }
+
+    void enqueue(K cl, unsigned priority, unsigned cost, T&& item) override final {
+      // priority is ignored
+      queue.add_request(std::move(item), cl, cost);
+    }
+
+    void enqueue_front(K cl,
+		       unsigned priority,
+		       unsigned cost,
+		       T&& item) override final {
+      queue_front.emplace_front(std::pair<K,T>(cl, std::move(item)));
+    }
+
+    bool empty() const override final {
+      return queue.empty() && high_queue.empty() && queue_front.empty();
+    }
+
+    T dequeue() override final {
+      // Pop the next item: strict queues first (highest priority value
+      // first), then items pushed back with enqueue_front, then mClock.
+      ceph_assert(!empty());
+      if (!high_queue.empty()) {
+        auto top = std::prev(high_queue.end());
+        SubQueue &sub = top->second;
+        T ret = std::move(sub.front().second);
+        sub.pop_front();
+        if (sub.empty()) {
+          high_queue.erase(top->first);
+        }
+        return ret;
+      }
+      if (!queue_front.empty()) {
+        T ret = std::move(queue_front.front().second);
+        queue_front.pop_front();
+        return ret;
+      }
+      auto pr = queue.pull_request();
+      ceph_assert(pr.is_retn());
+      return std::move(*(pr.get_retn().request));
+    }
+
+    void dump(ceph::Formatter *f) const override final {
+      // Emit three sections: "high_queues" (one "subqueue" object per
+      // strict priority), then the sizes of "queue_front" and "queue".
+      f->open_array_section("high_queues");
+      for (const auto& prio_queue : high_queue) {
+        f->open_object_section("subqueue");
+        f->dump_int("priority", prio_queue.first);
+        prio_queue.second.dump(f);
+        f->close_section();
+      }
+      f->close_section();
+
+      f->open_object_section("queue_front");
+      f->dump_int("size", queue_front.size());
+      f->close_section();
+
+      f->open_object_section("queue");
+      f->dump_int("size", queue.request_count());
+      f->close_section();
+    } // dump
+  };
+
+} // namespace ceph
diff --git a/src/common/map_cacher.hpp b/src/common/map_cacher.hpp
new file mode 100644
index 00000000..e95edfb5
--- /dev/null
+++ b/src/common/map_cacher.hpp
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef MAPCACHER_H
+#define MAPCACHER_H
+
+#include "include/Context.h"
+#include "common/sharedptr_registry.hpp"
+
+namespace MapCacher {
+/**
+ * Abstraction for ordering key updates
+ */
+template<typename K, typename V>
+class Transaction {
+public:
+  /// Set keys according to map
+  virtual void set_keys(
+    const std::map<K, V> &keys ///< [in] keys/values to set
+    ) = 0;
+
+  /// Remove keys
+  virtual void remove_keys(
+    const std::set<K> &to_remove ///< [in] keys to remove
+    ) = 0;
+
+  /// Add context to fire when data is readable
+  virtual void add_callback(
+    Context *c ///< [in] Context to fire on readable
+    ) = 0;
+  virtual ~Transaction() {}
+};
+
+/**
+ * Abstraction for fetching keys
+ */
+template<typename K, typename V>
+class StoreDriver {
+public:
+  /// Returns requested key values
+  virtual int get_keys(
+    const std::set<K> &keys,   ///< [in] keys requested
+    std::map<K, V> *got  ///< [out] values for keys obtained
+    ) = 0; ///< @return error value
+
+  /// Returns next key
+  virtual int get_next(
+    const K &key,       ///< [in] key after which to get next
+    pair<K, V> *next    ///< [out] first key after key
+    ) = 0; ///< @return 0 on success, -ENOENT if there is no next
+
+  virtual ~StoreDriver() {}
+};
+
+/**
+ * Uses SharedPtrRegistry to cache objects of in progress writes
+ * allowing the user to read/write a consistent view of the map
+ * without flushing writes.
+ */
+template<typename K, typename V>
+class MapCacher {
+private:
+  StoreDriver<K, V> *driver;
+
+  SharedPtrRegistry<K, boost::optional<V> > in_progress;
+  typedef typename SharedPtrRegistry<K, boost::optional<V> >::VPtr VPtr;
+  typedef ContainerContext<set<VPtr> > TransHolder;
+
+public:
+  MapCacher(StoreDriver<K, V> *driver) : driver(driver) {}
+
+  /// Fetch first key/value pair after specified key, merging in-progress writes with the store
+  int get_next(
+    K key,               ///< [in] key after which to get next
+    pair<K, V> *next     ///< [out] next key and its value; may be NULL
+    ) {
+    while (true) {
+      pair<K, boost::optional<V> > cached;
+      pair<K, V> store;
+      bool got_cached = in_progress.get_next(key, &cached);
+      // the store is asked on every pass, also after skipping a removed key
+      bool got_store = false;
+      int r = driver->get_next(key, &store);
+      if (r < 0 && r != -ENOENT) {
+	return r;
+      } else if (r == 0) {
+	got_store = true;
+      }
+
+      if (!got_cached && !got_store) {
+	return -ENOENT;
+      } else if (
+	got_cached &&
+	(!got_store || store.first >= cached.first)) { // the cached entry comes first, or ties with the store
+	if (cached.second) {
+	  if (next)
+	    *next = make_pair(cached.first, cached.second.get());
+	  return 0;
+	} else {
+	  key = cached.first;
+	  continue; // value was cached as removed: search again after that key
+	}
+      } else {
+	if (next)
+	  *next = store;
+	return 0;
+      }
+    }
+    ceph_abort(); // not reachable
+    return -EINVAL;
+  } ///< @return error value, 0 on success, -ENOENT if no more entries
+
+  /// Queue a write of the given keys on the Transaction
+  void set_keys(
+    const map<K, V> &keys,  ///< [in] keys/values to set
+    Transaction<K, V> *t    ///< [out] transaction to use
+    ) {
+    // publish each new value in in_progress; the TransHolder handed to the
+    // transaction carries the VPtrs (presumably keeping the entries alive)
+    std::set<VPtr> pinned;
+    for (const auto &kv : keys) {
+      VPtr cached = in_progress.lookup_or_create(kv.first, kv.second);
+      *cached = kv.second;
+      pinned.insert(cached);
+    }
+    t->set_keys(keys);
+    t->add_callback(new TransHolder(pinned));
+  }
+
+  /// Queue a removal of the given keys on the Transaction
+  void remove_keys(
+    const set<K> &keys,  ///< [in] keys to remove
+    Transaction<K, V> *t ///< [out] transaction to use
+    ) {
+    // an empty optional stored in in_progress marks the key as removed;
+    // the same empty value serves every key
+    boost::optional<V> removed;
+    std::set<VPtr> pinned;
+    for (const K &key : keys) {
+      VPtr cached = in_progress.lookup_or_create(key, removed);
+      *cached = removed;
+      pinned.insert(cached);
+    }
+    t->remove_keys(keys);
+    t->add_callback(new TransHolder(pinned));
+  }
+
+  /// Fetch the requested keys, preferring values of writes still in progress
+  int get_keys(
+    const set<K> &keys_to_get, ///< [in] set of keys to fetch
+    map<K, V> *got             ///< [out] receives the key/value pairs found
+    ) {
+    // split the request: keys with an in-progress entry are answered from
+    // the cache, everything else has to come from the store
+    set<K> uncached;
+    for (const K &wanted : keys_to_get) {
+      VPtr pending = in_progress.lookup(wanted);
+      if (!pending) {
+        uncached.insert(wanted);
+        continue;
+      }
+      // an empty optional is a pending removal: the key does not exist
+      if (*pending)
+        got->insert(make_pair(wanted, pending->get()));
+    }
+
+    map<K, V> from_store;
+    int r = driver->get_keys(uncached, &from_store);
+    if (r < 0)
+      return r;
+    // insert() never overwrites, so pairs already taken from the cache win
+    for (const auto &kv : from_store) {
+      got->insert(kv);
+    }
+    return 0;
+  } ///< @return error value, 0 on success
+};
+} // namespace
+
+#endif
diff --git a/src/common/mempool.cc b/src/common/mempool.cc
new file mode 100644
index 00000000..e4ab5d4a
--- /dev/null
+++ b/src/common/mempool.cc
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/mempool.h"
+#include "include/demangle.h"
+
+
+// default to debug_mode off
+bool mempool::debug_mode = false;
+
+// --------------------------------------------------------------
+
+mempool::pool_t& mempool::get_pool(mempool::pool_index_t ix)
+{
+  // The pool table lives in a function-local static so that it exists
+  // by the time this function first returns, even when the caller is a
+  // constructor in another compilation unit that is initialized before
+  // this one.
+  static mempool::pool_t pools[num_pools];
+  return pools[ix];
+}
+
+// Return the name of pool `ix` as a C string.
+//
+// The name table is generated by expanding DEFINE_MEMORY_POOLS_HELPER with
+// a macro that stringifies each entry, so names[] is indexed in the order
+// the helper lists the pools.  `ix` is not range-checked here.
+const char *mempool::get_pool_name(mempool::pool_index_t ix) {
+#define P(x) #x,
+  static const char *names[num_pools] = {
+    DEFINE_MEMORY_POOLS_HELPER(P)
+  };
+#undef P
+  return names[ix];
+}
+
+// Dump every pool under "mempool"/"by_pool"/<pool name>, followed by the
+// accumulated "total" across all pools.
+void mempool::dump(ceph::Formatter *f)
+{
+  stats_t grand_total;
+  // A (dummy?) topmost section is needed for the JSON Formatter to print
+  // pool names; it omits them otherwise.
+  f->open_object_section("mempool");
+  f->open_object_section("by_pool");
+  for (size_t n = 0; n < num_pools; ++n) {
+    const pool_index_t ix = (pool_index_t)n;
+    const pool_t &pool = mempool::get_pool(ix);
+    f->open_object_section(get_pool_name(ix));
+    pool.dump(f, &grand_total);
+    f->close_section();
+  }
+  f->close_section(); // by_pool
+  f->dump_object("total", grand_total);
+  f->close_section(); // mempool
+}
+
+// Set the global debug flag; pool_t::get_stats() only fills in the
+// per-type breakdown while it is true.
+void mempool::set_debug_mode(bool d)
+{
+  mempool::debug_mode = d;
+}
+
+// --------------------------------------------------------------
+// pool_t
+
+// Sum the byte counters of all shards; a negative sum is reported as 0.
+size_t mempool::pool_t::allocated_bytes() const
+{
+  ssize_t sum = 0;
+  for (size_t s = 0; s < num_shards; ++s) {
+    sum += shard[s].bytes;
+  }
+  // A negative sum means we raced with some unbalanced
+  // allocations/deallocations; clamp it.
+  return (size_t) (sum < 0 ? 0 : sum);
+}
+
+// Sum the item counters of all shards; a negative sum is reported as 0.
+size_t mempool::pool_t::allocated_items() const
+{
+  ssize_t sum = 0;
+  for (size_t s = 0; s < num_shards; ++s) {
+    sum += shard[s].items;
+  }
+  // A negative sum means we raced with some unbalanced
+  // allocations/deallocations; clamp it.
+  return (size_t) (sum < 0 ? 0 : sum);
+}
+
+// Apply signed item/byte deltas to the shard returned by pick_a_shard().
+void mempool::pool_t::adjust_count(ssize_t items, ssize_t bytes)
+{
+  shard_t *target = pick_a_shard();
+  target->items += items;
+  target->bytes += bytes;
+}
+
+// Add this pool's per-shard counters to *total.  In debug mode, also
+// record one entry per registered type in *by_type, keyed by the
+// demangled type name.
+void mempool::pool_t::get_stats(
+  stats_t *total,
+  std::map<std::string, stats_t> *by_type) const
+{
+  for (size_t n = 0; n < num_shards; ++n) {
+    total->items += shard[n].items;
+    total->bytes += shard[n].bytes;
+  }
+  if (!debug_mode) {
+    return;
+  }
+  // type_map is only walked with the pool lock held
+  std::lock_guard shard_lock(lock);
+  for (auto &entry : type_map) {
+    auto &info = entry.second;
+    stats_t &slot = (*by_type)[ceph_demangle(info.type_name)];
+    slot.bytes = info.items * info.item_size;
+    slot.items = info.items;
+  }
+}
+
+// Dump this pool's totals and, when get_stats() produced any, a "by_type"
+// section with one object per type.  If ptotal is non-null, the pool's
+// totals are also added to it.
+void mempool::pool_t::dump(ceph::Formatter *f, stats_t *ptotal) const
+{
+  stats_t pool_total;
+  std::map<std::string, stats_t> per_type;
+  get_stats(&pool_total, &per_type);
+  if (ptotal) {
+    *ptotal += pool_total;
+  }
+  pool_total.dump(f);
+  if (per_type.empty()) {
+    return;
+  }
+  f->open_object_section("by_type");
+  for (auto &[type_name, type_stats] : per_type) {
+    f->open_object_section(type_name.c_str());
+    type_stats.dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
diff --git a/src/common/mime.c b/src/common/mime.c
new file mode 100644
index 00000000..fe45123c
--- /dev/null
+++ b/src/common/mime.c
@@ -0,0 +1,132 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+
+#include <errno.h>
+#include <stdio.h>
+
+/*
+ * Encode the NUL-terminated string `input` as quoted-printable.
+ *
+ * Bytes with the high bit set, '=' and control characters (as decided by
+ * is_control_character()) are written as "=XX" with two upper-case hex
+ * digits; every other byte is copied through.
+ *
+ * At most `outlen` bytes, including the terminating NUL, are stored in
+ * `output`.  Whenever outlen > 0 the output is NUL-terminated, and a
+ * truncated result never ends in a partial "=X" escape.  `output` may be
+ * NULL when outlen is 0.
+ *
+ * Returns the buffer size (including the NUL) needed for the complete
+ * encoding, regardless of how much was actually stored.
+ */
+int mime_encode_as_qp(const char *input, char *output, int outlen)
+{
+	int ret = 1; /* room for the terminating NUL */
+	char *o = output;
+	const unsigned char *i = (const unsigned char*)input;
+
+	/* Terminate up front so the result is a valid string even when
+	 * nothing at all gets written below. */
+	if (outlen > 0)
+		*o = '\0';
+
+	for (; *i != '\0'; ++i) {
+		int c = *i;
+		if ((c & 0x80) || (c == '=') || (is_control_character(c))) {
+			/* "=XX" plus the NUL snprintf appends: with fewer
+			 * than 4 bytes left the escape would be cut short. */
+			if (outlen >= 4) {
+				snprintf(o, outlen, "=%02X", c);
+				outlen -= 3;
+				o += 3;
+			}
+			else
+				outlen = 0; /* stop writing; keep counting */
+			ret += 3;
+		}
+		else {
+			/* one character plus the terminating NUL */
+			if (outlen >= 2) {
+				o[0] = (char)c;
+				o[1] = '\0';
+				outlen -= 1;
+				o += 1;
+			}
+			else
+				outlen = 0; /* stop writing; keep counting */
+			ret += 1;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Convert one hexadecimal digit (either case) to its value 0..15.
+ * Returns -EDOM for anything else, including '\0'.
+ */
+static inline signed int hexchar_to_int(unsigned int c)
+{
+	if (c >= '0' && c <= '9')
+		return (signed int)(c - '0');
+	if (c >= 'A' && c <= 'F')
+		return (signed int)(c - 'A') + 10;
+	if (c >= 'a' && c <= 'f')
+		return (signed int)(c - 'a') + 10;
+	return -EDOM;
+}
+
+/*
+ * Decode the NUL-terminated quoted-printable string `input`.
+ *
+ * "=XX" sequences are replaced by the byte they denote; all other
+ * characters are copied through.  At most `outlen` bytes, including the
+ * terminating NUL, are stored in `output`; whenever outlen > 0 the output
+ * is NUL-terminated.  `output` may be NULL when outlen is 0.
+ *
+ * Returns the buffer size (including the NUL) needed for the complete
+ * decoding, -EDOM if an input byte has its high bit set, or -EINVAL if
+ * '=' is not followed by two hex digits.  On error the output may hold a
+ * partial result.
+ */
+int mime_decode_from_qp(const char *input, char *output, int outlen)
+{
+	int ret = 1; /* room for the terminating NUL */
+	char *o = output;
+	const unsigned char *i = (const unsigned char*)input;
+
+	/* Terminate up front so the result is a valid string even when
+	 * the input is empty. */
+	if (outlen > 0)
+		*o = '\0';
+
+	while (1) {
+		unsigned int c = *i;
+		if (c == '\0') {
+			break;
+		}
+		else if (c & 0x80) {
+			/* The high bit is never set in quoted-printable encoding! */
+			return -EDOM;
+		}
+		else if (c == '=') {
+			/* hexchar_to_int() rejects '\0', so a truncated
+			 * escape stops here without reading past the end. */
+			int high = hexchar_to_int(*++i);
+			if (high < 0)
+				return -EINVAL;
+			int low = hexchar_to_int(*++i);
+			if (low < 0)
+				return -EINVAL;
+			c = (high << 4) + low;
+		}
+		++i;
+
+		/* one character plus the terminating NUL */
+		if (outlen >= 2) {
+			o[0] = (char)c;
+			o[1] = '\0';
+			outlen -= 1;
+			o += 1;
+		}
+		else
+			outlen = 0; /* stop writing; keep counting */
+		ret += 1;
+	}
+	return ret;
+}
diff --git a/src/common/mime.h b/src/common/mime.h
new file mode 100644
index 00000000..f62040a2
--- /dev/null
+++ b/src/common/mime.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_MIME_H
+#define CEPH_COMMON_MIME_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Encode a buffer as quoted-printable.
+ *
+ * The input is a null-terminated string.
+ * The output is a null-terminated string representing the input encoded as
+ * a MIME quoted-printable.
+ *
+ * Returns the length of the buffer we would need to do the encoding.
+ * If we don't have enough buffer space, the output will be truncated.
+ *
+ * You may call mime_encode_as_qp(input, NULL, 0) to find the size of the
+ * buffer you will need.
+ */
+signed int mime_encode_as_qp(const char *input, char *output, int outlen);
+
+/* Decode a quoted-printable buffer.
+ *
+ * The input is a null-terminated string encoded as a MIME quoted-printable.
+ * The output is a null-terminated string representing the input decoded.
+ *
+ * Returns a negative error code if the input is not a valid quoted-printable
+ * buffer.
+ * Returns the length of the buffer we would need to do the decoding.
+ * If we don't have enough buffer space, the output will be truncated.
+ *
+ * You may call mime_decode_from_qp(input, NULL, 0) to find the size of the
+ * buffer you will need. The output will never be longer than the input for
+ * this function.
+ */
+signed int mime_decode_from_qp(const char *input, char *output, int outlen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/module.c b/src/common/module.c
new file mode 100644
index 00000000..f19f7432
--- /dev/null
+++ b/src/common/module.c
@@ -0,0 +1,83 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(__FreeBSD__)
+#include <sys/wait.h>
+#endif 
+
+/*
+ * TODO: Switch to libkmod when we abandon older platforms.  The APIs
+ * we want are:
+ *
+ * - kmod_module_new_from_name() for obtaining handles;
+ * - kmod_module_probe_insert_module() for module_load();
+ * - kmod_module_get_info(), kmod_module_info_get_{key,value}() for
+ *   module_has_param().
+ */
+
+/*
+ * Run `command` through system(3).
+ *
+ * Returns the command's exit status if it exited normally.  Returns -1
+ * (after printing a diagnostic to stderr) if the command could not be
+ * run, was killed by a signal, or ended in some other way.
+ */
+static int run_command(const char *command)
+{
+	int status;
+
+	status = system(command);
+	if (status >= 0 && WIFEXITED(status))
+		return WEXITSTATUS(status);
+
+	if (status < 0) {
+		/* capture errno before any later call can overwrite it */
+		int err = errno;
+		char error_buf[80];
+		const char *msg = error_buf;
+#ifdef STRERROR_R_CHAR_P
+		/* GNU strerror_r() returns the message pointer and may
+		 * leave error_buf untouched, so print what it returns. */
+		msg = strerror_r(err, error_buf, sizeof(error_buf));
+#else
+		/* XSI strerror_r() fills error_buf and returns nonzero on
+		 * failure; fall back to the raw number in that case. */
+		if (strerror_r(err, error_buf, sizeof(error_buf)) != 0)
+			snprintf(error_buf, sizeof(error_buf), "errno %d",
+				 err);
+#endif
+		fprintf(stderr, "couldn't run '%s': %s\n", command,
+			msg);
+	} else if (WIFSIGNALED(status)) {
+		fprintf(stderr, "'%s' killed by signal %d\n", command,
+			WTERMSIG(status));
+	} else {
+		fprintf(stderr, "weird status from '%s': %d\n", command,
+			status);
+	}
+
+	return -1;
+}
+
+/*
+ * Check whether kernel module `module` advertises parameter `param`, by
+ * grepping the output of `modinfo -F parm`.
+ *
+ * Returns 1 if the pipeline exits with status 0, and 0 otherwise
+ * (including when the command line does not fit the local buffer).
+ *
+ * NOTE: `module` and `param` are interpolated unescaped into a shell
+ * command line; callers must not pass untrusted strings.
+ */
+int module_has_param(const char *module, const char *param)
+{
+	char command[128];
+	int len;
+
+	len = snprintf(command, sizeof(command),
+		 "/sbin/modinfo -F parm %s | /bin/grep -q ^%s:",
+		 module, param);
+	/* Refuse to run a truncated, and therefore different, command. */
+	if (len < 0 || (size_t)len >= sizeof(command)) {
+		fprintf(stderr, "module_has_param: command too long\n");
+		return 0;
+	}
+
+	return run_command(command) == 0;
+}
+
+/*
+ * Load kernel module `module` with `/sbin/modprobe`, passing `options`
+ * (may be NULL) as additional arguments.
+ *
+ * Returns the value of run_command(): modprobe's exit status, or -1 if it
+ * could not be run.  Also returns -1 when the command line does not fit
+ * the local buffer.
+ *
+ * NOTE: `module` and `options` are interpolated unescaped into a shell
+ * command line; callers must not pass untrusted strings.
+ */
+int module_load(const char *module, const char *options)
+{
+	char command[128];
+	int len;
+
+	len = snprintf(command, sizeof(command), "/sbin/modprobe %s %s",
+		 module, (options ? options : ""));
+	/* Refuse to run a truncated, and therefore different, command. */
+	if (len < 0 || (size_t)len >= sizeof(command)) {
+		fprintf(stderr, "module_load: command too long\n");
+		return -1;
+	}
+
+	return run_command(command);
+}
diff --git a/src/common/module.h b/src/common/module.h
new file mode 100644
index 00000000..d5fa6a1a
--- /dev/null
+++ b/src/common/module.h
@@ -0,0 +1,27 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MODULE_H
+#define CEPH_MODULE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int module_has_param(const char *module, const char *param);
+int module_load(const char *module, const char *options);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_MODULE_H */
diff --git a/src/common/mutex_debug.cc b/src/common/mutex_debug.cc
new file mode 100644
index 00000000..5660826e
--- /dev/null
+++ b/src/common/mutex_debug.cc
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/mutex_debug.h"
+#include "common/perf_counters.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+
+namespace ceph {
+namespace mutex_debug_detail {
+enum {
+  l_mutex_first = 999082,
+  l_mutex_wait,
+  l_mutex_last
+};
+
+// Both constructors start with an unregistered id (-1), no holder and a
+// zero lock count; the mutex is registered with lockdep only when
+// g_lockdep is set at construction time.
+mutex_debugging_base::mutex_debugging_base(const std::string &n, bool bt)
+  : name(n), id(-1), backtrace(bt), nlock(0), locked_by(thread::id())
+{
+  if (g_lockdep) {
+    _register();
+  }
+}
+mutex_debugging_base::mutex_debugging_base(const char *n, bool bt)
+  : name(n), id(-1), backtrace(bt), nlock(0), locked_by(thread::id())
+{
+  if (g_lockdep) {
+    _register();
+  }
+}
+
+// The mutex must not be held when it is destroyed.
+mutex_debugging_base::~mutex_debugging_base() {
+  ceph_assert(nlock == 0);
+  if (g_lockdep) {
+    lockdep_unregister(id);
+  }
+}
+
+// The hooks below forward to lockdep and store the id it hands back.
+void mutex_debugging_base::_register() {
+  id = lockdep_register(name.c_str());
+}
+// about to lock
+void mutex_debugging_base::_will_lock(bool recursive) {
+  id = lockdep_will_lock(name.c_str(), id, backtrace, recursive);
+}
+// just locked
+void mutex_debugging_base::_locked() {
+  id = lockdep_locked(name.c_str(), id, backtrace);
+}
+// about to unlock
+void mutex_debugging_base::_will_unlock() {
+  id = lockdep_will_unlock(name.c_str(), id);
+}
+
+} // namespace mutex_debug_detail
+} // namespace ceph
diff --git a/src/common/mutex_debug.h b/src/common/mutex_debug.h
new file mode 100644
index 00000000..247233fd
--- /dev/null
+++ b/src/common/mutex_debug.h
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_MUTEX_DEBUG_H
+#define CEPH_COMMON_MUTEX_DEBUG_H
+
+#include <system_error>
+#include <thread>
+
+#include <pthread.h>
+
+#include "include/ceph_assert.h"
+
+#include "ceph_time.h"
+#include "likely.h"
+#include "lockdep.h"
+
+class CephContext;
+class PerfCounters;
+
+namespace ceph {
+namespace mutex_debug_detail {
+
+// Bookkeeping shared by the debugging mutexes: the lockdep name/id, how
+// many times the mutex is currently held and by which thread.
+class mutex_debugging_base
+{
+protected:
+  std::string name;
+  int id;
+  bool backtrace; // gather backtrace on lock acquisition
+
+  int nlock;                  // current hold count
+  std::thread::id locked_by;  // holder, meaningful while nlock > 0
+
+
+  void _register();
+  void _will_lock(bool recursive=false); // about to lock
+  void _locked(); // just locked
+  void _will_unlock(); // about to unlock
+
+  mutex_debugging_base(const std::string &n, bool bt = false);
+  mutex_debugging_base(const char *n, bool bt = false);
+  ~mutex_debugging_base();
+
+public:
+  // true if any thread holds the mutex
+  bool is_locked() const {
+    return (nlock > 0);
+  }
+  // true if the calling thread holds the mutex
+  bool is_locked_by_me() const {
+    return nlock > 0 && locked_by == std::this_thread::get_id();
+  }
+  // same test as is_locked_by_me()
+  operator bool() const {
+    return is_locked_by_me();
+  }
+};
+
+// Since this is a /debugging/ mutex just define it in terms of the
+// pthread error check mutex.
+//
+// Recursive == false: PTHREAD_MUTEX_ERRORCHECK mutex, tracked by lockdep
+//                     when g_lockdep is set and the caller does not pass
+//                     no_lockdep.
+// Recursive == true:  PTHREAD_MUTEX_RECURSIVE mutex; lockdep is never
+//                     consulted (see enable_lockdep()).
+template<bool Recursive>
+class mutex_debug_impl : public mutex_debugging_base
+{
+private:
+  pthread_mutex_t m;
+
+  // Initialise `m` with the mutex type selected by `Recursive`.
+  void _init() {
+    pthread_mutexattr_t a;
+    pthread_mutexattr_init(&a);
+    int r;
+    if (recursive)
+      r = pthread_mutexattr_settype(&a, PTHREAD_MUTEX_RECURSIVE);
+    else
+      r = pthread_mutexattr_settype(&a, PTHREAD_MUTEX_ERRORCHECK);
+    ceph_assert(r == 0);
+    r = pthread_mutex_init(&m, &a);
+    ceph_assert(r == 0);
+    // the attribute object is no longer needed once the mutex exists
+    pthread_mutexattr_destroy(&a);
+  }
+
+  // Whether the lockdep hooks should run for this operation.
+  bool enable_lockdep(bool no_lockdep) const {
+    if (recursive) {
+      return false;
+    } else if (no_lockdep) {
+      return false;
+    } else {
+      return g_lockdep;
+    }
+  }
+
+public:
+  static constexpr bool recursive = Recursive;
+
+  // Constructible from a name only (the name is what lockdep reports);
+  // bt enables backtrace gathering on lock acquisition.
+  mutex_debug_impl(const std::string &n, bool bt = false)
+    : mutex_debugging_base(n, bt) {
+    _init();
+  }
+  mutex_debug_impl(const char *n, bool bt = false)
+    : mutex_debugging_base(n, bt) {
+    _init();
+  }
+
+  // Mutex is Destructible
+  ~mutex_debug_impl() {
+    int r = pthread_mutex_destroy(&m);
+    ceph_assert(r == 0);
+  }
+
+  // Mutex concept is non-Copyable
+  mutex_debug_impl(const mutex_debug_impl&) = delete;
+  mutex_debug_impl& operator =(const mutex_debug_impl&) = delete;
+
+  // Mutex concept is non-Movable
+  mutex_debug_impl(mutex_debug_impl&&) = delete;
+  mutex_debug_impl& operator =(mutex_debug_impl&&) = delete;
+
+  // Raw blocking lock; no bookkeeping.  Throws std::system_error for
+  // EPERM/EDEADLK/EBUSY and asserts on any other failure.
+  void lock_impl() {
+    int r = pthread_mutex_lock(&m);
+    // Allowed error codes for Mutex concept
+    if (unlikely(r == EPERM ||
+		 r == EDEADLK ||
+		 r == EBUSY)) {
+      throw std::system_error(r, std::generic_category());
+    }
+    ceph_assert(r == 0);
+  }
+
+  // Raw unlock; no bookkeeping.
+  void unlock_impl() noexcept {
+    int r = pthread_mutex_unlock(&m);
+    ceph_assert(r == 0);
+  }
+
+  // Raw non-blocking lock; no bookkeeping.  Returns false when the mutex
+  // is busy and throws std::system_error for any other failure.
+  bool try_lock_impl() {
+    int r = pthread_mutex_trylock(&m);
+    switch (r) {
+    case 0:
+      return true;
+    case EBUSY:
+      return false;
+    default:
+      throw std::system_error(r, std::generic_category());
+    }
+  }
+  pthread_mutex_t* native_handle() {
+    return &m;
+  }
+
+  // Record that the calling thread now holds the mutex.
+  void _post_lock() {
+    if (!recursive)
+      ceph_assert(nlock == 0);
+    locked_by = std::this_thread::get_id();
+    nlock++;
+  }
+
+  // Record that the calling thread is about to release one hold.
+  void _pre_unlock() {
+    ceph_assert(nlock > 0);
+    --nlock;
+    ceph_assert(locked_by == std::this_thread::get_id());
+    if (!recursive)
+      ceph_assert(nlock == 0);
+    if (nlock == 0)
+      locked_by = std::thread::id();
+  }
+
+  bool try_lock(bool no_lockdep = false) {
+    bool locked = try_lock_impl();
+    if (locked) {
+      if (enable_lockdep(no_lockdep))
+	_locked();
+      _post_lock();
+    }
+    return locked;
+  }
+
+  void lock(bool no_lockdep = false) {
+    if (enable_lockdep(no_lockdep))
+      _will_lock(recursive);
+
+    // Fast path.  Forward no_lockdep so that a caller who opted out of
+    // lockdep does not get _locked() invoked behind its back.
+    if (try_lock(no_lockdep))
+      return;
+
+    lock_impl();
+    if (enable_lockdep(no_lockdep))
+      _locked();
+    _post_lock();
+  }
+
+  void unlock(bool no_lockdep = false) {
+    _pre_unlock();
+    if (enable_lockdep(no_lockdep))
+      _will_unlock();
+    unlock_impl();
+  }
+
+};
+
+
+} // namespace mutex_debug_detail
+typedef mutex_debug_detail::mutex_debug_impl<false> mutex_debug;
+typedef mutex_debug_detail::mutex_debug_impl<true> mutex_recursive_debug;
+} // namespace ceph
+
+#endif
diff --git a/src/common/numa.cc b/src/common/numa.cc
new file mode 100644
index 00000000..dc80d0f3
--- /dev/null
+++ b/src/common/numa.cc
@@ -0,0 +1,221 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "numa.h"
+
+#include <cstring>
+#include <errno.h>
+#include <iostream>
+
+#include "include/stringify.h"
+#include "common/safe_io.h"
+
+
+// list
+#if defined(__linux__)
+// Parse a CPU list such as "0-3,8,10-11" into *cpu_set.
+//
+// On success returns 0 and sets *cpu_set_size to one more than the
+// highest CPU number listed (0 for an empty list).  Returns -EINVAL for
+// malformed input, a reversed range, or a CPU number outside
+// [0, CPU_SETSIZE), which a cpu_set_t cannot represent.
+int parse_cpu_set_list(const char *s,
+		       size_t *cpu_set_size,
+		       cpu_set_t *cpu_set)
+{
+  CPU_ZERO(cpu_set);
+  *cpu_set_size = 0;
+  while (*s) {
+    char *end;
+    long first = strtol(s, &end, 10);
+    if (end == s || first < 0 || first >= CPU_SETSIZE) {
+      return -EINVAL;
+    }
+    long last = first;
+    if (*end == '-') {
+      s = end + 1;
+      last = strtol(s, &end, 10);
+      if (end == s || last < first || last >= CPU_SETSIZE) {
+	return -EINVAL;
+      }
+    }
+    for (long cpu = first; cpu <= last; ++cpu) {
+      CPU_SET(cpu, cpu_set);
+    }
+    // Ranges may appear in any order, so track the maximum rather than
+    // the value from the last token.
+    if (static_cast<size_t>(last) + 1 > *cpu_set_size) {
+      *cpu_set_size = static_cast<size_t>(last) + 1;
+    }
+    if (*end == 0) {
+      break;
+    }
+    if (*end != ',') {
+      return -EINVAL;
+    }
+    s = end + 1;
+  }
+  return 0;
+}
+
+// Render the CPUs set in the first cpu_set_size positions of *cpu_set as
+// a comma-separated list, collapsing runs of consecutive CPUs into
+// "first-last".
+std::string cpu_set_to_str_list(size_t cpu_set_size,
+				const cpu_set_t *cpu_set)
+{
+  std::string out;
+  unsigned first = 0;
+  for (;;) {
+    // find the start of the next run
+    for (; first < cpu_set_size && !CPU_ISSET(first, cpu_set); ++first) {
+    }
+    if (first >= cpu_set_size) {
+      break;
+    }
+    // find one past the end of that run
+    unsigned past = first + 1;
+    for (; past < cpu_set_size && CPU_ISSET(past, cpu_set); ++past) {
+    }
+    if (!out.empty()) {
+      out += ",";
+    }
+    if (past > first + 1) {
+      out += stringify(first) + "-" + stringify(past - 1);
+    } else {
+      out += stringify(first);
+    }
+    first = past;
+  }
+  return out;
+}
+
+// Return the numbers of all CPUs set in the first cpu_set_size positions
+// of *cpu_set.
+std::set<int> cpu_set_to_set(size_t cpu_set_size,
+			     const cpu_set_t *cpu_set)
+{
+  // Spelled std::set so this does not depend on a using-directive leaking
+  // in from some other header.
+  std::set<int> r;
+  for (unsigned cpu = 0; cpu < cpu_set_size; ++cpu) {
+    if (CPU_ISSET(cpu, cpu_set)) {
+      r.insert(cpu);
+    }
+  }
+  return r;
+}
+
+
+// Read /sys/devices/system/node/node<node>/cpulist and parse it with
+// parse_cpu_set_list().
+//
+// Returns 0 on success, or a negative value: -errno if the file cannot be
+// opened, the error from safe_read(), or the error from
+// parse_cpu_set_list().
+int get_numa_node_cpu_set(
+  int node,
+  size_t *cpu_set_size,
+  cpu_set_t *cpu_set)
+{
+  std::string fn = "/sys/devices/system/node/node";
+  fn += stringify(node);
+  fn += "/cpulist";
+  int fd = ::open(fn.c_str(), O_RDONLY);
+  if (fd < 0) {
+    return -errno;
+  }
+  char buf[1024];
+  // Leave room for the terminator: reading sizeof(buf) bytes would make
+  // buf[r] below land one past the end of the array.
+  int r = safe_read(fd, buf, sizeof(buf) - 1);
+  if (r < 0) {
+    goto out;
+  }
+  buf[r] = 0;
+  // strip trailing whitespace (the cast keeps isspace() well-defined for
+  // bytes >= 0x80)
+  while (r > 0 && ::isspace(static_cast<unsigned char>(buf[--r]))) {
+    buf[r] = 0;
+  }
+  r = parse_cpu_set_list(buf, cpu_set_size, cpu_set);
+  if (r < 0) {
+    goto out;
+  }
+  r = 0;
+ out:
+  ::close(fd);
+  return r;
+}
+
+// Insert the names of all entries of `dir`, except "." and "..", into
+// *out.  Returns 0 on success or -errno if the directory cannot be opened.
+static int easy_readdir(const std::string& dir, std::set<std::string> *out)
+{
+  DIR *handle = ::opendir(dir.c_str());
+  if (handle == nullptr) {
+    return -errno;
+  }
+  for (struct dirent *entry = ::readdir(handle);
+       entry != nullptr;
+       entry = ::readdir(handle)) {
+    const std::string name(entry->d_name);
+    if (name == "." || name == "..") {
+      continue;
+    }
+    out->insert(name);
+  }
+  closedir(handle);
+  return 0;
+}
+
+// Apply the given affinity mask to the calling process and to every
+// thread listed under /proc/<pid>/task.
+//
+// Returns 0 on success, or a negative errno from sched_setaffinity() or
+// from listing the task directory.
+//
+// NOTE(review): cpu_set_size is passed straight through as
+// sched_setaffinity()'s size argument -- confirm callers supply the value
+// that call expects.
+int set_cpu_affinity_all_threads(size_t cpu_set_size, cpu_set_t *cpu_set)
+{
+  // first set my affinity
+  int r = sched_setaffinity(getpid(), cpu_set_size, cpu_set);
+  if (r < 0) {
+    return -errno;
+  }
+
+  // make 2 passes here so that we (hopefully) catch racing threads creating
+  // threads.
+  for (unsigned pass = 0; pass < 2; ++pass) {
+    // enumerate all child threads from /proc
+    std::set<std::string> ls;
+    std::string path = "/proc/"s + stringify(getpid()) + "/task";
+    r = easy_readdir(path, &ls);
+    if (r < 0) {
+      return r;
+    }
+    for (auto& i : ls) {
+      pid_t tid = atoll(i.c_str());
+      if (!tid) {
+	continue;  // wtf
+      }
+      r = sched_setaffinity(tid, cpu_set_size, cpu_set);
+      if (r < 0) {
+	if (errno == ESRCH) {
+	  // the thread exited after we listed it; nothing left to pin
+	  continue;
+	}
+	return -errno;
+      }
+    }
+  }
+  return 0;
+}
+
+#elif defined(__FreeBSD__)
+
+// FreeBSD: none of the helpers below are implemented.  The int-returning
+// ones report -ENOTSUP and the others return an empty value; the
+// arguments are ignored.
+
+// Not supported: always returns -ENOTSUP.
+int parse_cpu_set_list(const char *s,
+		       size_t *cpu_set_size,
+		       cpu_set_t *cpu_set)
+{
+  return -ENOTSUP;
+}
+
+// Not supported: always returns an empty string.
+std::string cpu_set_to_str_list(size_t cpu_set_size,
+				const cpu_set_t *cpu_set)
+{
+  return {};
+}
+
+// Not supported: always returns an empty set.
+std::set<int> cpu_set_to_set(size_t cpu_set_size,
+			     const cpu_set_t *cpu_set)
+{
+  return {};
+}
+
+// Not supported: always returns -ENOTSUP.
+int get_numa_node_cpu_set(int node,
+                          size_t *cpu_set_size,
+                          cpu_set_t *cpu_set)
+{
+  return -ENOTSUP;
+}
+
+// Not supported: always returns -ENOTSUP.
+int set_cpu_affinity_all_threads(size_t cpu_set_size,
+				 cpu_set_t *cpu_set)
+{
+  return -ENOTSUP;
+}
+
+#endif
diff --git a/src/common/numa.h b/src/common/numa.h
new file mode 100644
index 00000000..78851dee
--- /dev/null
+++ b/src/common/numa.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <include/compat.h>
+#include <sched.h>
+#include <ostream>
+#include <set>
+
+int parse_cpu_set_list(const char *s,
+		       size_t *cpu_set_size,
+		       cpu_set_t *cpu_set);
+std::string cpu_set_to_str_list(size_t cpu_set_size,
+				const cpu_set_t *cpu_set);
+std::set<int> cpu_set_to_set(size_t cpu_set_size,
+			     const cpu_set_t *cpu_set);
+
+int get_numa_node_cpu_set(int node,
+			  size_t *cpu_set_size,
+			  cpu_set_t *cpu_set);
+
+int set_cpu_affinity_all_threads(size_t cpu_set_size,
+				 cpu_set_t *cpu_set);
diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
new file mode 100644
index 00000000..de73e70b
--- /dev/null
+++ b/src/common/obj_bencher.cc
@@ -0,0 +1,1506 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ * Series of functions to test your rados installation. Notice
+ * that this code is not terribly robust -- for instance, if you
+ * try and bench on a pool you don't have permission to access
+ * it will just loop forever.
+ */
+#include "include/compat.h"
+#include <pthread.h>
+#include "common/Cond.h"
+#include "obj_bencher.h"
+
+// Name of the metadata object used when the caller gives no run name.
+const std::string BENCH_LASTRUN_METADATA = "benchmark_last_metadata";
+// Common prefix of every benchmark object name.
+const std::string BENCH_PREFIX = "benchmark_data";
+// printf-style format for object names: hostname, pid, object number.
+const std::string BENCH_OBJ_NAME = BENCH_PREFIX + "_%s_%d_object%d";
+
+// Hostname and pid are looked up once and cached here; an empty hostname
+// or a zero pid means "not cached yet".
+static char cached_hostname[30] = {0};
+int cached_pid = 0;
+
+// Build "<BENCH_PREFIX>_<hostname>", looking the hostname up on first use
+// and caching it in cached_hostname.
+static std::string generate_object_prefix_nopid() {
+  if (!cached_hostname[0]) {
+    gethostname(cached_hostname, sizeof(cached_hostname)-1);
+    // gethostname() need not terminate a truncated name
+    cached_hostname[sizeof(cached_hostname)-1] = 0;
+  }
+
+  return BENCH_PREFIX + "_" + cached_hostname;
+}
+
+// Build "<BENCH_PREFIX>_<hostname>_<pid>".  A non-zero `pid` replaces the
+// cached pid; otherwise the cached pid is used, falling back to getpid()
+// the first time.
+static std::string generate_object_prefix(int pid = 0) {
+  if (pid != 0) {
+    cached_pid = pid;
+  } else if (cached_pid == 0) {
+    cached_pid = getpid();
+  }
+
+  std::ostringstream prefix;
+  prefix << generate_object_prefix_nopid() << '_' << cached_pid;
+  return prefix.str();
+}
+
+// this is 8x faster than previous impl based on chained, deduped functions call
+//
+// Formats BENCH_OBJ_NAME with the cached hostname, the cached pid and
+// `objnum`.  A non-zero `pid` replaces the cached pid first.
+static std::string generate_object_name_fast(int objnum, int pid = 0)
+{
+  if (!cached_hostname[0]) {
+    gethostname(cached_hostname, sizeof(cached_hostname)-1);
+    // gethostname() need not terminate a truncated name
+    cached_hostname[sizeof(cached_hostname)-1] = 0;
+  }
+
+  if (pid != 0) {
+    cached_pid = pid;
+  } else if (cached_pid == 0) {
+    cached_pid = getpid();
+  }
+
+  char buf[512];
+  const int len = snprintf(buf, sizeof(buf), BENCH_OBJ_NAME.c_str(),
+			   cached_hostname, cached_pid, objnum);
+  // the formatted name must be non-empty and must have fit the buffer
+  ceph_assert(len > 0 && len < (int)sizeof(buf));
+  return std::string(buf, (size_t)len);
+}
+
+// Fill the first `length` bytes of data->object_contents with 'z'.
+static void sanitize_object_contents (bench_data *data, size_t length) {
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  char *contents = data->object_contents;
+  memset(contents, 'z', length);
+}
+
+// Return `os`, first writing `t` as local time plus a space when
+// show_time is set.
+ostream& ObjBencher::out(ostream& os, utime_t& t)
+{
+  if (!show_time)
+    return os;
+  return t.localtime(os) << " ";
+}
+
+// Same as out(os, t) with t set to the current time.
+ostream& ObjBencher::out(ostream& os)
+{
+  utime_t now = ceph_clock_now();
+  return out(os, now);
+}
+
+// Thread entry point that reports benchmark progress once per second
+// until bencher->data.done is set.
+//
+// _bencher is the ObjBencher being reported on.  bencher->lock is held
+// for the whole loop except while waiting on the local condition
+// variable.  Each cycle updates the running min/max/average bandwidth and
+// IOPS in data.idata and prints one row, either as plain text on cout or
+// as a "data" object through the formatter.  Always returns NULL.
+void *ObjBencher::status_printer(void *_bencher) {
+  ObjBencher *bencher = static_cast<ObjBencher *>(_bencher);
+  bench_data& data = bencher->data;
+  Formatter *formatter = bencher->formatter;
+  ostream *outstream = bencher->outstream;
+  Cond cond;
+  int i = 0;
+  int previous_writes = 0;
+  int cycleSinceChange = 0;
+  double bandwidth;
+  int iops = 0;
+  mono_clock::duration ONE_SECOND = std::chrono::seconds(1);
+  bencher->lock.lock();
+  if (formatter)
+    formatter->open_array_section("datas");
+  while(!data.done) {
+    mono_time cur_time = mono_clock::now();
+    utime_t t = ceph_clock_now();
+
+    // re-print the column header every 20 rows (plain-text output only)
+    if (i % 20 == 0 && !formatter) {
+      if (i > 0)
+        t.localtime(cout)
+          << " min lat: " << data.min_latency
+          << " max lat: " << data.max_latency
+          << " avg lat: " << data.avg_latency << std::endl;
+      //I'm naughty and don't reset the fill
+      bencher->out(cout, t) << setfill(' ')
+          << setw(5) << "sec"
+          << setw(8) << "Cur ops"
+          << setw(10) << "started"
+          << setw(10) << "finished"
+          << setw(10) << "avg MB/s"
+          << setw(10) << "cur MB/s"
+          << setw(12) << "last lat(s)"
+          << setw(12) << "avg lat(s)" << std::endl;
+    }
+    // bandwidth since the last cycle in which `finished` changed;
+    // -1 means "no sample this cycle"
+    if (cycleSinceChange)
+      bandwidth = (double)(data.finished - previous_writes)
+        * (data.op_size)
+        / (1024*1024)
+        / cycleSinceChange;
+    else
+      bandwidth = -1;
+
+    if (!std::isnan(bandwidth) && bandwidth > -1) {
+      if (bandwidth > data.idata.max_bandwidth)
+        data.idata.max_bandwidth = bandwidth;
+      if (bandwidth < data.idata.min_bandwidth)
+        data.idata.min_bandwidth = bandwidth;
+
+      // incremental update of the mean and of the sum of squared
+      // differences from it
+      ++data.idata.bandwidth_cycles;
+      double delta = bandwidth - data.idata.avg_bandwidth;
+      data.idata.avg_bandwidth += delta / data.idata.bandwidth_cycles;
+      data.idata.bandwidth_diff_sum += delta * (bandwidth - data.idata.avg_bandwidth);
+    }
+
+    if (cycleSinceChange)
+      iops = (double)(data.finished - previous_writes)
+        / cycleSinceChange;
+    else
+      iops = -1;
+
+    if (!std::isnan(iops) && iops > -1) {
+      if (iops > data.idata.max_iops)
+        data.idata.max_iops = iops;
+      if (iops < data.idata.min_iops)
+        data.idata.min_iops = iops;
+
+      ++data.idata.iops_cycles;
+      double delta = iops - data.idata.avg_iops;
+      data.idata.avg_iops += delta / data.idata.iops_cycles;
+      data.idata.iops_diff_sum += delta * (iops - data.idata.avg_iops);
+    }
+    
+    if (formatter)
+      formatter->open_object_section("data");
+
+    // elapsed will be in seconds, by default
+    std::chrono::duration<double> elapsed = cur_time - data.start_time;
+    double avg_bandwidth = (double) (data.op_size) * (data.finished)
+      / elapsed.count() / (1024*1024);
+    if (previous_writes != data.finished) {
+      previous_writes = data.finished;
+      cycleSinceChange = 0;
+      if (!formatter) {
+        bencher->out(cout, t)
+	  << setfill(' ')
+          << setw(5) << i
+	  << ' ' << setw(7) << data.in_flight
+          << ' ' << setw(9) << data.started
+          << ' ' << setw(9) << data.finished
+          << ' ' << setw(9) << avg_bandwidth
+          << ' ' << setw(9) << bandwidth
+          << ' ' << setw(11) << (double)data.cur_latency.count()
+          << ' ' << setw(11) << data.avg_latency << std::endl;
+      } else {
+        formatter->dump_format("sec", "%d", i);
+        formatter->dump_format("cur_ops", "%d", data.in_flight);
+        formatter->dump_format("started", "%d", data.started);
+        formatter->dump_format("finished", "%d", data.finished);
+        formatter->dump_format("avg_bw", "%f", avg_bandwidth);
+        formatter->dump_format("cur_bw", "%f", bandwidth);
+        formatter->dump_format("last_lat", "%f", (double)data.cur_latency.count());
+        formatter->dump_format("avg_lat", "%f", data.avg_latency);
+      }
+    }
+    else {
+      // nothing finished since the previous cycle
+      if (!formatter) {
+        bencher->out(cout, t)
+	  << setfill(' ')
+          << setw(5) << i
+	  << ' ' << setw(7) << data.in_flight
+          << ' ' << setw(9) << data.started
+          << ' ' << setw(9) << data.finished
+          << ' ' << setw(9) << avg_bandwidth
+	  << ' ' << setw(9) << '0'
+          << ' ' << setw(11) << '-'
+          << ' '<< setw(11) << data.avg_latency << std::endl;
+      } else {
+        formatter->dump_format("sec", "%d", i);
+        formatter->dump_format("cur_ops", "%d", data.in_flight);
+        formatter->dump_format("started", "%d", data.started);
+        formatter->dump_format("finished", "%d", data.finished);
+        formatter->dump_format("avg_bw", "%f", avg_bandwidth);
+        // "%f" consumes a double: a literal int 0 here would be a
+        // mismatched argument type
+        formatter->dump_format("cur_bw", "%f", 0.0);
+        formatter->dump_format("last_lat", "%f", 0.0);
+        formatter->dump_format("avg_lat", "%f", data.avg_latency);
+      }
+    }
+    if (formatter) {
+      formatter->close_section(); // data
+      formatter->flush(*outstream);
+    }
+    ++i;
+    ++cycleSinceChange;
+    // wait up to one second on the local condition variable before the
+    // next cycle
+    cond.WaitInterval(bencher->lock, ONE_SECOND);
+  }
+  if (formatter)
+    formatter->close_section(); //datas
+  // iops is still -1 when the last cycle took no sample; report the
+  // overall rate as both min and max in that case
+  if (iops < 0) {
+    std::chrono::duration<double> runtime = mono_clock::now() - data.start_time;
+    data.idata.min_iops = data.idata.max_iops = data.finished / runtime.count();
+  }
+  bencher->lock.unlock();
+  return NULL;
+}
+
+int ObjBencher::aio_bench(
+  int operation, int secondsToRun,
+  int concurrentios,
+  uint64_t op_size, uint64_t object_size,
+  unsigned max_objects,
+  bool cleanup, bool hints,
+  const std::string& run_name, bool reuse_bench, bool no_verify) {
+  // Benchmark driver: primes the shared `data` state, runs the requested
+  // operation and, for writes with `cleanup` set, deletes what was written.
+  // Returns -EINVAL for a non-positive concurrentios, else the last step's result.
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  // metadata object name: the caller's choice, or the default when left empty
+  const std::string run_name_meta = (run_name.empty() ? BENCH_LASTRUN_METADATA : run_name);
+
+  int ret = 0;
+  int stored_pid = 0;
+  int stored_objects = 0;
+  std::chrono::duration<double> cleanup_elapsed;
+
+  // anything but a fresh write takes sizes, object count and pid from stored metadata
+  const bool needs_previous_run = (operation != OP_WRITE || reuse_bench);
+  if (needs_previous_run) {
+    uint64_t stored_op_size, stored_object_size;
+    ret = fetch_bench_metadata(run_name_meta, &stored_op_size, &stored_object_size,
+                               &stored_objects, &stored_pid);
+    if (ret < 0) {
+      if (ret == -ENOENT) {
+        cerr << (reuse_bench
+                 ? "Must write data before using reuse_bench for a write benchmark!"
+                 : "Must write data before running a read benchmark!") << std::endl;
+      }
+      return ret;
+    }
+    op_size = stored_op_size;
+    object_size = stored_object_size;
+  }
+
+  // scratch buffer published through data.object_contents; freed at `out`
+  char* payload = new char[op_size];
+
+  lock.lock();
+  data.object_contents = payload;
+  data.op_size = op_size;
+  data.object_size = object_size;
+  data.hints = hints;
+  data.done = false;
+  data.started = 0;
+  data.finished = 0;
+  data.in_flight = 0;
+  data.avg_latency = 0;
+  data.max_latency = 0;
+  data.min_latency = 9999.0; // this better be higher than initial latency!
+  data.latency_diff_sum = 0;
+  lock.unlock();
+
+  // give the buffer deterministic contents so reads can be checked against it
+  sanitize_object_contents(&data, data.op_size);
+
+  if (formatter)
+    formatter->open_object_section("bench");
+
+  if (operation == OP_WRITE) {
+    ret = write_bench(secondsToRun, concurrentios, run_name_meta, max_objects, stored_pid);
+  } else if (operation == OP_SEQ_READ) {
+    ret = seq_read_bench(secondsToRun, stored_objects, concurrentios, stored_pid, no_verify);
+  } else if (operation == OP_RAND_READ) {
+    ret = rand_read_bench(secondsToRun, stored_objects, concurrentios, stored_pid, no_verify);
+  }
+  if (ret != 0) goto out; // an operation matching none of the above leaves ret at 0
+
+  if (cleanup && operation == OP_WRITE) {
+    ret = fetch_bench_metadata(run_name_meta, &op_size, &object_size,
+                               &stored_objects, &stored_pid);
+    if (ret < 0) {
+      if (ret == -ENOENT)
+        cerr << "Should never happen: bench metadata missing for current run!" << std::endl;
+      goto out;
+    }
+
+    data.start_time = mono_clock::now();
+    out(cout) << "Cleaning up (deleting benchmark objects)" << std::endl;
+
+    ret = clean_up(stored_objects, stored_pid, concurrentios);
+    if (ret != 0) goto out;
+
+    cleanup_elapsed = mono_clock::now() - data.start_time;
+    out(cout) << "Clean up completed and total clean up time :" << cleanup_elapsed.count() << std::endl;
+
+    // lastly remove the metadata (lastrun) object itself
+    ret = sync_remove(run_name_meta);
+    if (ret != 0) goto out;
+  }
+
+ out:
+  if (formatter) {
+    formatter->close_section(); // bench
+    formatter->flush(*outstream);
+    *outstream << std::endl;
+  }
+  delete[] payload;
+  return ret;
+}
+
+struct lock_cond { // bundles a borrowed mutex pointer with its own condition variable
+  Mutex *lock; // stored as given; this struct never deletes it
+  Cond cond;
+  explicit lock_cond(Mutex *mutex) : lock(mutex) {}
+};
+
+void _aio_cb(void *cb, void *arg) { // completion hook: arg is the lock_cond to signal
+  lock_cond *const waiter = static_cast<lock_cond *>(arg);
+  waiter->lock->lock();
+  waiter->cond.Signal(); // signalled while holding the mutex
+  waiter->lock->unlock();
+}
+
+int ObjBencher::fetch_bench_metadata(const std::string& metadata_file,
+				     uint64_t *op_size, uint64_t* object_size,
+				     int* num_objects, int* prevPid) {
+  // Reads the run metadata object and decodes object size, object count, pid
+  // and (when present) op size. Returns 0, -ENOENT if empty, or the read error.
+  bufferlist raw;
+  const int got = sync_read(metadata_file, raw,
+                            sizeof(int) * 2 + sizeof(size_t) * 2);
+  if (got < 0) {
+    return got;
+  }
+  if (got == 0) {
+    // an empty object is treated the same as a missing one
+    return -ENOENT;
+  }
+
+  auto cursor = raw.cbegin();
+  decode(*object_size, cursor);
+  decode(*num_objects, cursor);
+  decode(*prevPid, cursor);
+  if (cursor.end()) {
+    *op_size = *object_size; // no op size stored after the pid: use the object size
+  } else {
+    decode(*op_size, cursor);
+  }
+  return 0;
+}
+
+int ObjBencher::write_bench(int secondsToRun,
+			    int concurrentios, const string& run_name_meta,
+			    unsigned max_objects, int prev_pid) {
+  // Write benchmark: keeps concurrentios aio writes in flight until
+  // secondsToRun elapses or the max_objects cap is reached, prints the results
+  // and stores the run metadata. Returns 0, -EINVAL, or the failing call's result.
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  if (!formatter) {
+    out(cout) << "Maintaining " << concurrentios << " concurrent writes of "
+	      << data.op_size << " bytes to objects of size "
+	      << data.object_size << " for up to "
+	      << secondsToRun << " seconds or "
+	      << max_objects << " objects"
+	      << std::endl;
+  } else {
+    formatter->dump_format("concurrent_ios", "%d", concurrentios);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("op_size", "%d", data.op_size);
+    formatter->dump_format("seconds_to_run", "%d", secondsToRun);
+    formatter->dump_format("max_objects", "%d", max_objects);
+  }
+  bufferlist* newContents = 0;
+
+  // a non-zero prev_pid selects the prefix generated for that pid
+  std::string prefix = prev_pid ? generate_object_prefix(prev_pid) : generate_object_prefix();
+  if (!formatter)
+    out(cout) << "Object prefix: " << prefix << std::endl;
+  else
+    formatter->dump_string("object_prefix", prefix);
+
+  std::vector<string> name(concurrentios); // object name each slot is writing
+  std::string newName;
+  unique_ptr<bufferlist> contents[concurrentios]; // one payload buffer per slot
+  int r = 0;
+  bufferlist b_write; // encoded run metadata, written at the end
+  lock_cond lc(&lock); // handed to create_completion() so _aio_cb can signal lc.cond
+  double total_latency = 0;
+  std::vector<mono_time> start_times(concurrentios); // issue time of each slot's current op
+  mono_time stopTime;
+  std::chrono::duration<double> timePassed;
+
+  unsigned writes_per_object = 1; // stays 1 when op_size is 0
+  if (data.op_size)
+    writes_per_object = data.object_size / data.op_size;
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r; // status thread not started yet, so a plain return is enough
+
+  //set up writes so I can start them together
+  for (int i = 0; i<concurrentios; ++i) {
+    name[i] = generate_object_name_fast(i / writes_per_object);
+    contents[i] = std::make_unique<bufferlist>();
+    snprintf(data.object_contents, data.op_size, "I'm the %16dth op!", i);
+    contents[i]->append(data.object_contents, data.op_size);
+  }
+
+  pthread_t print_thread;
+  pthread_create(&print_thread, NULL, ObjBencher::status_printer, (void *)this);
+  ceph_pthread_setname(print_thread, "write_stat");
+  lock.lock();
+  data.finished = 0;
+  data.start_time = mono_clock::now();
+  lock.unlock();
+  for (int i = 0; i<concurrentios; ++i) {
+    start_times[i] = mono_clock::now();
+    r = create_completion(i, _aio_cb, (void *)&lc);
+    if (r < 0)
+      goto ERR;
+    r = aio_write(name[i], i, *contents[i], data.op_size,
+		  data.op_size * (i % writes_per_object));
+    if (r < 0)
+      goto ERR;
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+  }
+
+  //keep on adding new writes as old ones complete until we've passed minimum time
+  int slot;
+  int num_objects;
+
+  //don't need locking for reads because other thread doesn't write
+  stopTime = data.start_time + std::chrono::seconds(secondsToRun);
+  slot = 0;
+  lock.lock(); // the loop below is entered, and re-entered, with the lock held
+  while (secondsToRun && mono_clock::now() < stopTime) {
+    bool found = false;
+    while (1) {
+      int old_slot = slot;
+      do { // round-robin scan for a slot whose completion has finished
+        if (completion_is_done(slot)) {
+            found = true;
+            break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found)
+        break;
+      lc.cond.Wait(lock); // nothing finished yet: wait for _aio_cb to signal, then rescan
+    }
+    lock.unlock();
+    //reuse the finished slot's buffer: pick the next name and write the next op tag into it
+    newName = generate_object_name_fast(data.started / writes_per_object);
+    newContents = contents[slot].get();
+    snprintf(newContents->c_str(), data.op_size, "I'm the %16dth op!", data.started);
+    // we wrote to buffer, going around internal crc cache, so invalidate it now.
+    newContents->invalidate_crc();
+
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r != 0) {
+      lock.unlock();
+      goto ERR;
+    }
+    data.cur_latency = mono_clock::now() - start_times[slot];
+    total_latency += data.cur_latency.count();
+    if( data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    double delta = data.cur_latency.count() - data.avg_latency; // distance from the previous mean
+    data.avg_latency = total_latency / data.finished;
+    data.latency_diff_sum += delta * (data.cur_latency.count() - data.avg_latency);
+    --data.in_flight;
+    lock.unlock();
+    release_completion(slot);
+
+    //write new stuff to backend
+    start_times[slot] = mono_clock::now();
+    r = create_completion(slot, _aio_cb, &lc);
+    if (r < 0)
+      goto ERR;
+    r = aio_write(newName, slot, *newContents, data.op_size,
+		  data.op_size * (data.started % writes_per_object));
+    if (r < 0)
+      goto ERR;
+    name[slot] = newName;
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    if (data.op_size) {
+      if (max_objects && // stop once enough ops were started to cover max_objects objects
+	  data.started >= (int)((data.object_size * max_objects + data.op_size - 1) /
+			       data.op_size))
+        break;
+    }
+  }
+  lock.unlock();
+
+  while (data.finished < data.started) { // drain the writes that are still outstanding
+    slot = data.finished % concurrentios;
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r != 0) {
+      lock.unlock();
+      goto ERR;
+    }
+    data.cur_latency = mono_clock::now() - start_times[slot];
+    total_latency += data.cur_latency.count();
+    if (data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    double delta = data.cur_latency.count() - data.avg_latency;
+    data.avg_latency = total_latency / data.finished;
+    data.latency_diff_sum += delta * (data.cur_latency.count() - data.avg_latency);
+    --data.in_flight;
+    lock.unlock();
+    release_completion(slot);
+  }
+
+  timePassed = mono_clock::now() - data.start_time;
+  lock.lock();
+  data.done = true; // set before joining the status thread
+  lock.unlock();
+
+  pthread_join(print_thread, NULL);
+
+  double bandwidth;
+  bandwidth = ((double)data.finished)*((double)data.op_size) / timePassed.count();
+  bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
+
+  double bandwidth_stddev;
+  double iops_stddev;
+  double latency_stddev;
+  if (data.idata.bandwidth_cycles > 1) {
+    bandwidth_stddev = std::sqrt(data.idata.bandwidth_diff_sum / (data.idata.bandwidth_cycles - 1));
+  } else {
+    bandwidth_stddev = 0;
+  }
+  if (data.idata.iops_cycles > 1) {
+    iops_stddev = std::sqrt(data.idata.iops_diff_sum / (data.idata.iops_cycles - 1));
+  } else {
+    iops_stddev = 0;
+  }
+  if (data.finished > 1) {
+    latency_stddev = std::sqrt(data.latency_diff_sum / (data.finished - 1));
+  } else {
+    latency_stddev = 0;
+  }
+
+  if (!formatter) {
+    out(cout) << "Total time run:         " << timePassed.count() << std::endl
+       << "Total writes made:      " << data.finished << std::endl
+       << "Write size:             " << data.op_size << std::endl
+       << "Object size:            " << data.object_size << std::endl
+       << "Bandwidth (MB/sec):     " << setprecision(6) << bandwidth << std::endl
+       << "Stddev Bandwidth:       " << bandwidth_stddev << std::endl
+       << "Max bandwidth (MB/sec): " << data.idata.max_bandwidth << std::endl
+       << "Min bandwidth (MB/sec): " << data.idata.min_bandwidth << std::endl
+       << "Average IOPS:           " << (int)(data.finished/timePassed.count()) << std::endl
+       << "Stddev IOPS:            " << iops_stddev << std::endl
+       << "Max IOPS:               " << data.idata.max_iops << std::endl
+       << "Min IOPS:               " << data.idata.min_iops << std::endl
+       << "Average Latency(s):     " << data.avg_latency << std::endl
+       << "Stddev Latency(s):      " << latency_stddev << std::endl
+       << "Max latency(s):         " << data.max_latency << std::endl
+       << "Min latency(s):         " << data.min_latency << std::endl;
+  } else {
+    formatter->dump_format("total_time_run", "%f", timePassed.count());
+    formatter->dump_format("total_writes_made", "%d", data.finished);
+    formatter->dump_format("write_size", "%d", data.op_size);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("stddev_bandwidth", "%f", bandwidth_stddev);
+    formatter->dump_format("max_bandwidth", "%f", data.idata.max_bandwidth);
+    formatter->dump_format("min_bandwidth", "%f", data.idata.min_bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/timePassed.count()));
+    formatter->dump_format("stddev_iops", "%f", iops_stddev); // a double needs %f, as in the read benchmarks
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("stddev_latency", "%f", latency_stddev);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
+  //write object size/number data for read benchmarks
+  encode(data.object_size, b_write);
+  num_objects = (data.finished + writes_per_object - 1) / writes_per_object; // rounded up
+  encode(num_objects, b_write);
+  encode(prev_pid ? prev_pid : getpid(),  b_write);
+  encode(data.op_size, b_write);
+
+  // persist meta-data for further cleanup or read
+  sync_write(run_name_meta, b_write, sizeof(int)*3); // NOTE(review): result is not checked
+
+  completions_done();
+
+  return 0;
+
+ ERR: // error exit: stop and join the status thread; completions_done() is not called here
+  lock.lock();
+  data.done = 1;
+  lock.unlock();
+  pthread_join(print_thread, NULL);
+  return r;
+}
+
+int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurrentios, int pid, bool no_verify) { // sequential-read benchmark; returns 0, -EIO on content mismatches, -EINVAL, or the failing call's result
+  lock_cond lc(&lock); // handed to create_completion() so _aio_cb can signal lc.cond
+
+  if (concurrentios <= 0) 
+    return -EINVAL;
+
+  std::vector<string> name(concurrentios); // object name each slot is currently reading
+  std::string newName;
+  unique_ptr<bufferlist> contents[concurrentios]; // one receive buffer per slot
+  int index[concurrentios]; // op number each slot is reading; used to rebuild the expected payload
+  int errors = 0; // number of reads whose contents failed verification
+  double total_latency = 0;
+  int r = 0;
+  std::vector<mono_time> start_times(concurrentios); // issue time of each slot's current read
+  mono_clock::duration time_to_run = std::chrono::seconds(seconds_to_run);
+  std::chrono::duration<double> timePassed;
+  sanitize_object_contents(&data, data.op_size); //clean it up once; subsequent
+  //changes will be safe because string length should remain the same
+
+  unsigned reads_per_object = 1; // stays 1 when op_size is 0
+  if (data.op_size)
+    reads_per_object = data.object_size / data.op_size;
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r;
+
+  //set up initial reads
+  for (int i = 0; i < concurrentios; ++i) {
+    name[i] = generate_object_name_fast(i / reads_per_object, pid);
+    contents[i] = std::make_unique<bufferlist>();
+  }
+
+  lock.lock();
+  data.finished = 0;
+  data.start_time = mono_clock::now();
+  lock.unlock();
+
+  pthread_t print_thread; // status thread; joined after data.done is set
+  pthread_create(&print_thread, NULL, status_printer, (void *)this);
+  ceph_pthread_setname(print_thread, "seq_read_stat");
+
+  mono_time finish_time = data.start_time + time_to_run;
+  //start initial reads
+  for (int i = 0; i < concurrentios; ++i) {
+    index[i] = i;
+    start_times[i] = mono_clock::now();
+    create_completion(i, _aio_cb, (void *)&lc); // NOTE(review): return value ignored here, unlike write_bench
+    r = aio_read(name[i], i, contents[i].get(), data.op_size,
+		 data.op_size * (i % reads_per_object));
+    if (r < 0) {
+      cerr << "r = " << r << std::endl;
+      goto ERR;
+    }
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+  }
+
+  //keep on adding new reads as old ones complete
+  int slot;
+  bufferlist *cur_contents;
+
+  slot = 0;
+  while ((seconds_to_run && mono_clock::now() < finish_time) &&
+	 num_objects > data.started) { // run until the deadline or until data.started reaches num_objects
+    lock.lock();
+    int old_slot = slot;
+    bool found = false;
+    while (1) {
+      do { // round-robin scan for a slot whose completion has finished
+        if (completion_is_done(slot)) {
+          found = true;
+          break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found) {
+        break;
+      }
+      lc.cond.Wait(lock); // nothing finished yet: wait for _aio_cb to signal, then rescan
+    }
+
+    // calculate latency here, so memcmp doesn't inflate it
+    data.cur_latency = mono_clock::now() - start_times[slot];
+
+    cur_contents = contents[slot].get();
+    int current_index = index[slot];
+    
+    // invalidate internal crc cache
+    cur_contents->invalidate_crc();
+  
+    if (!no_verify) { // rebuild the payload expected for this op and compare it with what was read
+      snprintf(data.object_contents, data.op_size, "I'm the %16dth op!", current_index);
+      if ( (cur_contents->length() != data.op_size) || 
+           (memcmp(data.object_contents, cur_contents->c_str(), data.op_size) != 0) ) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    }
+
+    newName = generate_object_name_fast(data.started / reads_per_object, pid);
+    index[slot] = data.started; // op number of the replacement read issued below
+    lock.unlock();
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r < 0) {
+      cerr << "read got " << r << std::endl;
+      lock.unlock();
+      goto ERR;
+    }
+    total_latency += data.cur_latency.count();
+    if (data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    data.avg_latency = total_latency / data.finished;
+    --data.in_flight;
+    lock.unlock();
+    release_completion(slot);
+
+    //start new read and check data if requested
+    start_times[slot] = mono_clock::now();
+    create_completion(slot, _aio_cb, (void *)&lc);
+    r = aio_read(newName, slot, contents[slot].get(), data.op_size,
+		 data.op_size * (data.started % reads_per_object));
+    if (r < 0) {
+      goto ERR;
+    }
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+    name[slot] = newName;
+  }
+
+  //wait for final reads to complete
+  while (data.finished < data.started) {
+    slot = data.finished % concurrentios; // slot is derived from the finished count
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r < 0) {
+      cerr << "read got " << r << std::endl;
+      lock.unlock();
+      goto ERR;
+    }
+    data.cur_latency = mono_clock::now() - start_times[slot];
+    total_latency += data.cur_latency.count();
+    if (data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    data.avg_latency = total_latency / data.finished;
+    --data.in_flight;
+    release_completion(slot);
+    if (!no_verify) {
+      snprintf(data.object_contents, data.op_size, "I'm the %16dth op!", index[slot]);
+      lock.unlock(); // the comparison below runs without the lock
+      if ((contents[slot]->length() != data.op_size) || 
+         (memcmp(data.object_contents, contents[slot]->c_str(), data.op_size) != 0)) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    } else {
+        lock.unlock();
+    }
+  }
+
+  timePassed = mono_clock::now() - data.start_time;
+  lock.lock();
+  data.done = true; // set before joining the status thread
+  lock.unlock();
+
+  pthread_join(print_thread, NULL);
+
+  double bandwidth;
+  bandwidth = ((double)data.finished)*((double)data.op_size)/timePassed.count();
+  bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
+  
+  double iops_stddev;
+  if (data.idata.iops_cycles > 1) {
+    iops_stddev = std::sqrt(data.idata.iops_diff_sum / (data.idata.iops_cycles - 1));
+  } else {
+    iops_stddev = 0;
+  }
+
+  if (!formatter) { // plain-text summary; otherwise the same figures go to the formatter
+    out(cout) << "Total time run:       " << timePassed.count() << std::endl
+       << "Total reads made:     " << data.finished << std::endl
+       << "Read size:            " << data.op_size << std::endl
+       << "Object size:          " << data.object_size << std::endl
+       << "Bandwidth (MB/sec):   " << setprecision(6) << bandwidth << std::endl
+       << "Average IOPS:         " << (int)(data.finished/timePassed.count()) << std::endl
+       << "Stddev IOPS:          " << iops_stddev << std::endl
+       << "Max IOPS:             " << data.idata.max_iops << std::endl
+       << "Min IOPS:             " << data.idata.min_iops << std::endl
+       << "Average Latency(s):   " << data.avg_latency << std::endl
+       << "Max latency(s):       " << data.max_latency << std::endl
+       << "Min latency(s):       " << data.min_latency << std::endl;
+  } else {
+    formatter->dump_format("total_time_run", "%f", timePassed.count());
+    formatter->dump_format("total_reads_made", "%d", data.finished);
+    formatter->dump_format("read_size", "%d", data.op_size);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/timePassed.count()));
+    formatter->dump_format("stddev_iops", "%f", iops_stddev);
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
+
+  completions_done();
+
+  return (errors > 0 ? -EIO : 0); // any verification mismatch is reported as -EIO
+
+ ERR: // error exit: stop and join the status thread; completions_done() is not called here
+  lock.lock();
+  data.done = 1;
+  lock.unlock();
+  pthread_join(print_thread, NULL);
+  return r;
+}
+
+int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurrentios, int pid, bool no_verify) // random-read benchmark; returns 0, -EIO on content mismatches, -EINVAL, or the failing call's result
+{
+  lock_cond lc(&lock); // handed to create_completion() so _aio_cb can signal lc.cond
+
+  if (concurrentios <= 0)
+    return -EINVAL;
+
+  std::vector<string> name(concurrentios); // object name each slot is currently reading
+  std::string newName;
+  unique_ptr<bufferlist> contents[concurrentios]; // one receive buffer per slot
+  int index[concurrentios]; // id each slot is reading; used to rebuild the expected payload
+  int errors = 0; // number of reads whose contents failed verification
+  int r = 0;
+  double total_latency = 0;
+  std::vector<mono_time> start_times(concurrentios); // issue time of each slot's current read
+  mono_clock::duration time_to_run = std::chrono::seconds(seconds_to_run);
+  std::chrono::duration<double> timePassed;
+  sanitize_object_contents(&data, data.op_size); //clean it up once; subsequent
+  //changes will be safe because string length should remain the same
+
+  unsigned reads_per_object = 1; // stays 1 when op_size is 0
+  if (data.op_size)
+    reads_per_object = data.object_size / data.op_size;
+
+  srand (time(NULL)); // seed rand() for the random picks made in the main loop
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r;
+
+  //set up initial reads
+  for (int i = 0; i < concurrentios; ++i) {
+    name[i] = generate_object_name_fast(i / reads_per_object, pid);
+    contents[i] = std::make_unique<bufferlist>();
+  }
+
+  lock.lock();
+  data.finished = 0;
+  data.start_time = mono_clock::now();
+  lock.unlock();
+
+  pthread_t print_thread; // status thread; joined after data.done is set
+  pthread_create(&print_thread, NULL, status_printer, (void *)this);
+  ceph_pthread_setname(print_thread, "rand_read_stat");
+
+  mono_time finish_time = data.start_time + time_to_run;
+  //start initial reads
+  for (int i = 0; i < concurrentios; ++i) {
+    index[i] = i;
+    start_times[i] = mono_clock::now();
+    create_completion(i, _aio_cb, (void *)&lc); // NOTE(review): return value ignored here, unlike write_bench
+    r = aio_read(name[i], i, contents[i].get(), data.op_size,
+		 data.op_size * (i % reads_per_object));
+    if (r < 0) {
+      cerr << "r = " << r << std::endl;
+      goto ERR;
+    }
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+  }
+
+  //keep on adding new reads as old ones complete
+  int slot;
+  bufferlist *cur_contents;
+  int rand_id;
+
+  slot = 0;
+  while ((seconds_to_run && mono_clock::now() < finish_time)) { // only the deadline ends this loop
+    lock.lock();
+    int old_slot = slot;
+    bool found = false;
+    while (1) {
+      do { // round-robin scan for a slot whose completion has finished
+        if (completion_is_done(slot)) {
+          found = true;
+          break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found) {
+        break;
+      }
+      lc.cond.Wait(lock); // nothing finished yet: wait for _aio_cb to signal, then rescan
+    }
+
+    // calculate latency here, so memcmp doesn't inflate it
+    data.cur_latency = mono_clock::now() - start_times[slot];
+
+    lock.unlock();
+
+    int current_index = index[slot];
+    cur_contents = contents[slot].get();
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r < 0) {
+      cerr << "read got " << r << std::endl;
+      lock.unlock();
+      goto ERR;
+    }
+
+    total_latency += data.cur_latency.count();
+    if (data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    data.avg_latency = total_latency / data.finished;
+    --data.in_flight;
+    lock.unlock();
+    
+    if (!no_verify) { // rebuild the payload expected for this id and compare it with what was read
+      snprintf(data.object_contents, data.op_size, "I'm the %16dth op!", current_index);
+      if ((cur_contents->length() != data.op_size) || 
+          (memcmp(data.object_contents, cur_contents->c_str(), data.op_size) != 0)) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    } 
+
+    rand_id = rand() % num_objects; // NOTE(review): divides by num_objects; assumes callers pass a value > 0 — confirm
+    newName = generate_object_name_fast(rand_id / reads_per_object, pid);
+    index[slot] = rand_id; // id of the replacement read issued below
+    release_completion(slot);
+
+    // invalidate internal crc cache
+    cur_contents->invalidate_crc();
+
+    //start new read and check data if requested
+    start_times[slot] = mono_clock::now();
+    create_completion(slot, _aio_cb, (void *)&lc);
+    r = aio_read(newName, slot, contents[slot].get(), data.op_size,
+		 data.op_size * (rand_id % reads_per_object));
+    if (r < 0) {
+      goto ERR;
+    }
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+    name[slot] = newName;
+  }
+
+
+  //wait for final reads to complete
+  while (data.finished < data.started) {
+    slot = data.finished % concurrentios; // slot is derived from the finished count
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r < 0) {
+      cerr << "read got " << r << std::endl;
+      lock.unlock();
+      goto ERR;
+    }
+    data.cur_latency = mono_clock::now() - start_times[slot];
+    total_latency += data.cur_latency.count();
+    if (data.cur_latency.count() > data.max_latency)
+      data.max_latency = data.cur_latency.count();
+    if (data.cur_latency.count() < data.min_latency)
+      data.min_latency = data.cur_latency.count();
+    ++data.finished;
+    data.avg_latency = total_latency / data.finished;
+    --data.in_flight;
+    release_completion(slot);
+    if (!no_verify) {
+      snprintf(data.object_contents, data.op_size, "I'm the %16dth op!", index[slot]);
+      lock.unlock(); // the comparison below runs without the lock
+      if ((contents[slot]->length() != data.op_size) || 
+          (memcmp(data.object_contents, contents[slot]->c_str(), data.op_size) != 0)) {
+        cerr << name[slot] << " is not correct!" << std::endl;
+        ++errors;
+      }
+    } else {
+        lock.unlock();
+    }
+  }
+
+  timePassed = mono_clock::now() - data.start_time;
+  lock.lock();
+  data.done = true; // set before joining the status thread
+  lock.unlock();
+
+  pthread_join(print_thread, NULL);
+
+  double bandwidth;
+  bandwidth = ((double)data.finished)*((double)data.op_size)/timePassed.count();
+  bandwidth = bandwidth/(1024*1024); // we want it in MB/sec
+  
+  double iops_stddev;
+  if (data.idata.iops_cycles > 1) {
+    iops_stddev = std::sqrt(data.idata.iops_diff_sum / (data.idata.iops_cycles - 1));
+  } else {
+    iops_stddev = 0;
+  }
+
+  if (!formatter) { // plain-text summary; otherwise the same figures go to the formatter
+    out(cout) << "Total time run:       " << timePassed.count() << std::endl
+       << "Total reads made:     " << data.finished << std::endl
+       << "Read size:            " << data.op_size << std::endl
+       << "Object size:          " << data.object_size << std::endl
+       << "Bandwidth (MB/sec):   " << setprecision(6) << bandwidth << std::endl
+       << "Average IOPS:         " << (int)(data.finished/timePassed.count()) << std::endl
+       << "Stddev IOPS:          " << iops_stddev << std::endl
+       << "Max IOPS:             " << data.idata.max_iops << std::endl
+       << "Min IOPS:             " << data.idata.min_iops << std::endl
+       << "Average Latency(s):   " << data.avg_latency << std::endl
+       << "Max latency(s):       " << data.max_latency << std::endl
+       << "Min latency(s):       " << data.min_latency << std::endl;
+  } else {
+    formatter->dump_format("total_time_run", "%f", timePassed.count());
+    formatter->dump_format("total_reads_made", "%d", data.finished);
+    formatter->dump_format("read_size", "%d", data.op_size);
+    formatter->dump_format("object_size", "%d", data.object_size);
+    formatter->dump_format("bandwidth", "%f", bandwidth);
+    formatter->dump_format("average_iops", "%d", (int)(data.finished/timePassed.count()));
+    formatter->dump_format("stddev_iops", "%f", iops_stddev);
+    formatter->dump_format("max_iops", "%d", data.idata.max_iops);
+    formatter->dump_format("min_iops", "%d", data.idata.min_iops);
+    formatter->dump_format("average_latency", "%f", data.avg_latency);
+    formatter->dump_format("max_latency", "%f", data.max_latency);
+    formatter->dump_format("min_latency", "%f", data.min_latency);
+  }
+  completions_done();
+
+  return (errors > 0 ? -EIO : 0); // any verification mismatch is reported as -EIO
+
+ ERR: // error exit: stop and join the status thread; completions_done() is not called here
+  lock.lock();
+  data.done = 1;
+  lock.unlock();
+  pthread_join(print_thread, NULL);
+  return r;
+}
+
+int ObjBencher::clean_up(const std::string& orig_prefix, int concurrentios, const std::string& run_name) {
+  // Removes benchmark objects whose names start with the given prefix from
+  // every namespace get_objects() reports them in. Namespaces that also hold
+  // the metadata object are cleaned from its recorded count and pid; the rest
+  // fall back to clean_up_slow(). Returns 0 or the first failing step's result.
+  // default meta object if user does not specify one
+  const std::string run_name_meta = (run_name.empty() ? BENCH_LASTRUN_METADATA : run_name);
+  const std::string prefix = (orig_prefix.empty() ? generate_object_prefix_nopid() : orig_prefix);
+
+  if (prefix.substr(0, BENCH_PREFIX.length()) != BENCH_PREFIX) {
+    cerr << "Specified --prefix invalid, it must begin with \"" << BENCH_PREFIX << "\"" << std::endl;
+    return -EINVAL;
+  }
+
+  // Pass 1: walk the listing in batches, noting which namespaces contain
+  // benchmark objects and which contain the metadata object.
+  // (If caller set all_nspaces this will be searching across multiple namespaces.)
+  std::set<std::string> with_meta, with_objects;
+  std::list<Object> batch;
+  while (get_objects(&batch, 20)) {
+    for (const Object& entry : batch) {
+      if (entry.first == run_name_meta) {
+        with_meta.insert(entry.second);
+      }
+      if (entry.first.substr(0, prefix.length()) == prefix) {
+        with_objects.insert(entry.second);
+      }
+    }
+  }
+
+  // Pass 2: clean each namespace that holds benchmark objects.
+  uint64_t op_size, object_size;
+  int num_objects;
+  int prevPid;
+
+  for (const std::string& ns : with_objects) {
+    set_namespace(ns);
+
+    if (with_meta.count(ns) == 0) {
+      // no metadata object here: do a linear search on the prefix instead
+      const int slow_ret = clean_up_slow(prefix, concurrentios);
+      if (slow_ret < 0) {
+        cerr << "clean_up_slow error r= " << slow_ret << std::endl;
+        return slow_ret;
+      }
+      continue;
+    }
+
+    int r = fetch_bench_metadata(run_name_meta, &op_size, &object_size, &num_objects, &prevPid);
+    if (r < 0) {
+      return r;
+    }
+
+    r = clean_up(num_objects, prevPid, concurrentios);
+    if (r != 0)
+      return r;
+
+    r = sync_remove(run_name_meta);
+    if (r != 0)
+      return r;
+  }
+
+  return 0;
+}
+
+/**
+ * Remove the objects of an earlier benchmark run whose names can be
+ * regenerated from an index and the run's pid.
+ *
+ * Names are produced with generate_object_name_fast(index, prevPid) for
+ * index 0 .. num_objects-1, and at most concurrentios asynchronous removes
+ * are kept outstanding at any time. A remove that completes with -ENOENT is
+ * counted as finished rather than treated as an error.
+ *
+ * @param num_objects number of object names to generate and remove
+ * @param prevPid pid handed to generate_object_name_fast() for every name
+ * @param concurrentios maximum number of outstanding removes; must be > 0
+ * @returns 0 on success (also when num_objects == 0), -EINVAL when
+ * concurrentios <= 0, otherwise the failing result of completions_init(),
+ * aio_remove() or a completion
+ */
+int ObjBencher::clean_up(int num_objects, int prevPid, int concurrentios) {
+  // lc is built from the bencher lock and carries the condition variable
+  // waited on below; its address is the callback argument of every completion.
+  lock_cond lc(&lock);
+  
+  if (concurrentios <= 0) 
+    return -EINVAL;
+
+  std::vector<string> name(concurrentios);
+  std::string newName;
+  int r = 0;
+  int slot = 0;
+
+  // reset the shared progress counters for this run
+  lock.lock();
+  data.done = false;
+  data.in_flight = 0;
+  data.started = 0;
+  data.finished = 0;
+  lock.unlock();
+
+  // don't start more completions than files
+  if (num_objects == 0) {
+    return 0;
+  } else if (num_objects < concurrentios) {
+    concurrentios = num_objects;
+  }
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r;
+
+  //set up initial removes
+  for (int i = 0; i < concurrentios; ++i) {
+    name[i] = generate_object_name_fast(i, prevPid);
+  }
+
+  //start initial removes
+  for (int i = 0; i < concurrentios; ++i) {
+    // NOTE(review): _aio_cb is defined outside this view; presumably it
+    // signals lc.cond when the operation completes - confirm.
+    create_completion(i, _aio_cb, (void *)&lc);
+    r = aio_remove(name[i], i);
+    if (r < 0) { //naughty, doesn't clean up heap
+      cerr << "r = " << r << std::endl;
+      goto ERR;
+    }
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+  }
+
+  //keep on adding new removes as old ones complete
+  while (data.started < num_objects) {
+    lock.lock();
+    int old_slot = slot;
+    bool found = false;
+    // Scan the slots round-robin, starting at the last used one, for a
+    // completion that is done; if a full pass finds none, wait on the
+    // condition variable and scan again.
+    while (1) {
+      do {
+        if (completion_is_done(slot)) {
+          found = true;
+          break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found) {
+        break;
+      }
+      lc.cond.Wait(lock);
+    }
+    lock.unlock();
+    // data.started doubles as the index of the next object to remove
+    newName = generate_object_name_fast(data.started, prevPid);
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r != 0 && r != -ENOENT) { // file does not exist
+      cerr << "remove got " << r << std::endl;
+      lock.unlock();
+      goto ERR;
+    }
+    ++data.finished;
+    --data.in_flight;
+    lock.unlock();
+    release_completion(slot);
+
+    //start a new remove in the slot that was just released
+    create_completion(slot, _aio_cb, (void *)&lc);
+    r = aio_remove(newName, slot);
+    if (r < 0) {
+      goto ERR;
+    }
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+    name[slot] = newName;
+  }
+
+  //wait for final removes to complete
+  // Each slot holds exactly one outstanding remove here, so walking
+  // data.finished % concurrentios visits every slot once.
+  while (data.finished < data.started) {
+    slot = data.finished % concurrentios;
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r != 0 && r != -ENOENT) { // file does not exist
+      cerr << "remove got " << r << std::endl;
+      lock.unlock();
+      goto ERR;
+    }
+    ++data.finished;
+    --data.in_flight;
+    release_completion(slot);
+    lock.unlock();
+  }
+
+  lock.lock();
+  data.done = true;
+  lock.unlock();
+
+  completions_done();
+
+  out(cout) << "Removed " << data.finished << " object" << (data.finished != 1 ? "s" : "") << std::endl;
+
+  return 0;
+
+  // Error exit: marks the run as done and returns r. Outstanding
+  // completions are not released and completions_done() is not called here.
+ ERR:
+  lock.lock();
+  data.done = 1;
+  lock.unlock();
+  return r;
+}
+
+/**
+ * Fetch the next non-empty batch of objects whose name begins with a prefix.
+ *
+ * The output list is emptied first. Batches of up to 20 objects are then
+ * pulled from get_objects() until one of them contributes at least one
+ * matching object, so a true return always comes with a non-empty list.
+ *
+ * @param prefix the prefix object names are compared against
+ * @param objects [out] receives the matching objects of the batch
+ * @returns true if matching objects were stored in the list, false once
+ * get_objects() reports that nothing is left
+ */
+bool ObjBencher::more_objects_matching_prefix(const std::string& prefix, std::list<Object>* objects) {
+  std::list<Object> batch;
+
+  objects->clear();
+
+  do {
+    // a false return from the listing means there is nothing more to scan
+    if (!get_objects(&batch, 20)) {
+      return false;
+    }
+
+    for (const Object& candidate : batch) {
+      // Object is <oid, namespace>; only the oid is matched against the prefix
+      if (candidate.first.compare(0, prefix.length(), prefix) == 0) {
+        objects->push_back(candidate);
+      }
+    }
+  } while (objects->empty());
+
+  return true;
+}
+
+/**
+ * Remove every listed object whose name starts with prefix.
+ *
+ * Unlike clean_up(num_objects, prevPid, concurrentios) the object names are
+ * not regenerated; they are discovered by listing the store through
+ * more_objects_matching_prefix(). Up to concurrentios asynchronous removes
+ * are kept outstanding, and the namespace of each object is selected with
+ * set_namespace() before its remove is issued. A remove that completes with
+ * -ENOENT is counted as finished rather than treated as an error.
+ *
+ * @param prefix name prefix of the objects to remove
+ * @param concurrentios maximum number of outstanding removes; must be > 0
+ * @returns 0 on success, -EINVAL when concurrentios <= 0, the negative
+ * result of completions_init() if that fails, and -EIO for any failure
+ * after that point
+ */
+int ObjBencher::clean_up_slow(const std::string& prefix, int concurrentios) {
+  // lc is built from the bencher lock and carries the condition variable
+  // waited on below; its address is the callback argument of every completion.
+  lock_cond lc(&lock);
+
+  if (concurrentios <= 0) 
+    return -EINVAL;
+
+  std::vector<Object> name(concurrentios);
+  Object newName;
+  int r = 0;
+  int slot = 0;
+  std::list<Object> objects;
+  bool objects_remain = true;
+
+  // reset the shared progress counters for this run
+  lock.lock();
+  data.done = false;
+  data.in_flight = 0;
+  data.started = 0;
+  data.finished = 0;
+  lock.unlock();
+
+  out(cout) << "Warning: using slow linear search" << std::endl;
+
+  r = completions_init(concurrentios);
+  if (r < 0)
+    return r;
+
+  //set up initial removes
+  for (int i = 0; i < concurrentios; ++i) {
+    if (objects.empty()) {
+      // if there are fewer objects than concurrent ios, don't generate extras
+      bool objects_found = more_objects_matching_prefix(prefix, &objects);
+      if (!objects_found) {
+        // shrink the window to the number of objects actually found
+        concurrentios = i;
+        objects_remain = false;
+        break;
+      }
+    }
+
+    name[i] = objects.front();
+    objects.pop_front();
+  }
+
+  //start initial removes
+  for (int i = 0; i < concurrentios; ++i) {
+    // NOTE(review): _aio_cb is defined outside this view; presumably it
+    // signals lc.cond when the operation completes - confirm.
+    create_completion(i, _aio_cb, (void *)&lc);
+    set_namespace(name[i].second);
+    r = aio_remove(name[i].first, i);
+    if (r < 0) { //naughty, doesn't clean up heap
+      cerr << "r = " << r << std::endl;
+      goto ERR;
+    }
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+  }
+
+  //keep on adding new removes as old ones complete
+  while (objects_remain) {
+    lock.lock();
+    int old_slot = slot;
+    bool found = false;
+    // Scan the slots round-robin, starting at the last used one, for a
+    // completion that is done; if a full pass finds none, wait on the
+    // condition variable and scan again.
+    while (1) {
+      do {
+        if (completion_is_done(slot)) {
+          found = true;
+          break;
+        }
+        slot++;
+        if (slot == concurrentios) {
+          slot = 0;
+        }
+      } while (slot != old_slot);
+      if (found) {
+        break;
+      }
+      lc.cond.Wait(lock);
+    }
+    lock.unlock();
+
+    // get more objects if necessary
+    if (objects.empty()) {
+      objects_remain = more_objects_matching_prefix(prefix, &objects);
+      // quit if there are no more
+      // (the slot found above is left untouched; the final loop below
+      // collects its result)
+      if (!objects_remain) {
+        break;
+      }
+    }
+
+    // get the next object
+    newName = objects.front();
+    objects.pop_front();
+
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r != 0 && r != -ENOENT) { // file does not exist
+      cerr << "remove got " << r << std::endl;
+      lock.unlock();
+      goto ERR;
+    }
+    ++data.finished;
+    --data.in_flight;
+    lock.unlock();
+    release_completion(slot);
+
+    //start a new remove in the slot that was just released
+    create_completion(slot, _aio_cb, (void *)&lc);
+    set_namespace(newName.second);
+    r = aio_remove(newName.first, slot);
+    if (r < 0) {
+      goto ERR;
+    }
+    lock.lock();
+    ++data.started;
+    ++data.in_flight;
+    lock.unlock();
+    name[slot] = newName;
+  }
+
+  //wait for final removes to complete
+  // Each slot holds exactly one outstanding remove here, so walking
+  // data.finished % concurrentios visits every slot once. The loop is not
+  // entered when nothing was started, so concurrentios == 0 is never used
+  // as a divisor.
+  while (data.finished < data.started) {
+    slot = data.finished % concurrentios;
+    completion_wait(slot);
+    lock.lock();
+    r = completion_ret(slot);
+    if (r != 0 && r != -ENOENT) { // file does not exist
+      cerr << "remove got " << r << std::endl;
+      lock.unlock();
+      goto ERR;
+    }
+    ++data.finished;
+    --data.in_flight;
+    release_completion(slot);
+    lock.unlock();
+  }
+
+  lock.lock();
+  data.done = true;
+  lock.unlock();
+
+  completions_done();
+
+  out(cout) << "Removed " << data.finished << " object" << (data.finished != 1 ? "s" : "") << std::endl;
+
+  return 0;
+
+  // Error exit: marks the run as done and reports -EIO regardless of the
+  // value in r. Outstanding completions are not released and
+  // completions_done() is not called here.
+ ERR:
+  lock.lock();
+  data.done = 1;
+  lock.unlock();
+  return -EIO;
+}
diff --git a/src/common/obj_bencher.h b/src/common/obj_bencher.h
new file mode 100644
index 00000000..8e41fb5b
--- /dev/null
+++ b/src/common/obj_bencher.h
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OBJ_BENCHER_H
+#define CEPH_OBJ_BENCHER_H
+
+#include "common/ceph_context.h"
+#include "common/Formatter.h"
+#include "ceph_time.h"
+#include <cfloat>
+
+using ceph::mono_clock;
+
+// Bandwidth and IOPS statistics of a benchmark run. The minimum fields start
+// at the largest representable value and the maximum fields at zero, so the
+// first sample compared against them replaces both.
+struct bench_interval_data {
+  // bandwidth statistics
+  double min_bandwidth{DBL_MAX};
+  double max_bandwidth{0};
+  double avg_bandwidth{0};
+  int bandwidth_cycles{0};
+  double bandwidth_diff_sum{0};
+  // IOPS statistics
+  int min_iops{INT_MAX};
+  int max_iops{0};
+  double avg_iops{0};
+  int iops_cycles{0};
+  double iops_diff_sum{0};
+};
+
+// State of one benchmark or cleanup run. The clean_up routines reset and
+// update done/in_flight/started/finished while holding ObjBencher::lock.
+struct bench_data {
+  bool done; //whether the benchmark is done
+  uint64_t object_size; //the size of the objects
+  uint64_t op_size;     // the size of the read/write ops
+  bool hints;
+  // same as object_size for write tests
+  // NOTE(review): the remark above presumably describes op_size - confirm
+  int in_flight; //number of reads/writes being waited on
+  int started; //number of operations issued so far
+  int finished; //number of operations whose result has been collected
+  double min_latency;
+  double max_latency;
+  double avg_latency;
+  struct bench_interval_data idata; // data that is updated by time intervals and not by events
+  double latency_diff_sum;
+  std::chrono::duration<double> cur_latency; //latency of last completed transaction - in seconds by default
+  mono_time start_time; //start time for benchmark - use the monotonic clock as we'll measure the passage of time
+  char *object_contents; //pointer to the contents written to each object
+};
+
+// Benchmark operation codes.
+// NOTE(review): presumably the values accepted by the `operation` argument
+// of ObjBencher::aio_bench - confirm against its implementation.
+constexpr int OP_WRITE     = 1;
+constexpr int OP_SEQ_READ  = 2;
+constexpr int OP_RAND_READ = 3;
+
+// An Object is the pair <oid, namespace>.
+using Object = std::pair<std::string, std::string>;
+
+// Abstract benchmark driver. The benchmark and cleanup logic lives in this
+// class; a concrete backend supplies the completion handling and the
+// object I/O primitives by overriding the pure virtual functions below.
+class ObjBencher {
+  bool show_time;
+  Formatter *formatter = NULL;
+  ostream *outstream = NULL;
+public:
+  CephContext *cct;
+protected:
+  // guards the counters in `data`; also the mutex waited on while a
+  // cleanup loop looks for a finished completion slot
+  Mutex lock;
+
+  static void *status_printer(void *bencher);
+
+  struct bench_data data;
+
+  int fetch_bench_metadata(const std::string& metadata_file, uint64_t* op_size,
+			   uint64_t* object_size, int* num_objects, int* prev_pid);
+
+  int write_bench(int secondsToRun, int concurrentios, const string& run_name_meta, unsigned max_objects, int prev_pid);
+  int seq_read_bench(int secondsToRun, int num_objects, int concurrentios, int writePid, bool no_verify=false);
+  int rand_read_bench(int secondsToRun, int num_objects, int concurrentios, int writePid, bool no_verify=false);
+
+  // remove num_objects objects whose names are regenerated from prevPid
+  int clean_up(int num_objects, int prevPid, int concurrentios);
+  // fill *name with the next batch of listed objects matching prefix;
+  // returns false once the listing is exhausted
+  bool more_objects_matching_prefix(const std::string& prefix, std::list<Object>* name);
+
+  // Completion slots: callers initialise `concurrentios` slots, create a
+  // completion in a slot before issuing an aio_* call on it, and release
+  // the slot after collecting its result. A negative return from
+  // completions_init() is treated as an error by the cleanup routines.
+  virtual int completions_init(int concurrentios) = 0;
+  virtual void completions_done() = 0;
+
+  virtual int create_completion(int i, void (*cb)(void *, void*), void *arg) = 0;
+  virtual void release_completion(int slot) = 0;
+
+  virtual bool completion_is_done(int slot) = 0;
+  virtual int completion_wait(int slot) = 0;
+  virtual int completion_ret(int slot) = 0;
+
+  // Object I/O primitives; the aio_* variants take the completion slot the
+  // operation is attached to. The cleanup routines treat a negative return
+  // from aio_remove() and a non-zero return from sync_remove() as failure.
+  virtual int aio_read(const std::string& oid, int slot, bufferlist *pbl, size_t len, size_t offset) = 0;
+  virtual int aio_write(const std::string& oid, int slot, bufferlist& bl, size_t len, size_t offset) = 0;
+  virtual int aio_remove(const std::string& oid, int slot) = 0;
+  virtual int sync_read(const std::string& oid, bufferlist& bl, size_t len) = 0;
+  virtual int sync_write(const std::string& oid, bufferlist& bl, size_t len) = 0;
+  virtual int sync_remove(const std::string& oid) = 0;
+
+  // list up to `num` <oid, namespace> pairs into *objects; callers stop
+  // listing when this returns false
+  virtual bool get_objects(std::list< std::pair<std::string, std::string> >* objects, int num) = 0;
+  // select the namespace for subsequent operations; a no-op by default
+  virtual void set_namespace(const std::string&) {}
+
+  ostream& out(ostream& os);
+  ostream& out(ostream& os, utime_t& t);
+public:
+  explicit ObjBencher(CephContext *cct_) : show_time(false), cct(cct_), lock("ObjBencher::lock"), data() {}
+  virtual ~ObjBencher() {}
+  int aio_bench(
+    int operation, int secondsToRun,
+    int concurrentios, uint64_t op_size, uint64_t object_size, unsigned max_objects,
+    bool cleanup, bool hints, const std::string& run_name, bool reuse_bench, bool no_verify=false);
+  // remove the objects of earlier runs that match `prefix`, using run
+  // metadata where present and a linear listing otherwise
+  int clean_up(const std::string& prefix, int concurrentios, const std::string& run_name);
+
+  void set_show_time(bool dt) {
+    show_time = dt;
+  }
+  void set_formatter(Formatter *f) {
+    formatter = f;
+  }
+  void set_outstream(ostream& os) {
+    outstream = &os;
+  }
+  // remove every listed object whose name starts with `prefix`
+  int clean_up_slow(const std::string& prefix, int concurrentios);
+};
+
+
+#endif
diff --git a/src/common/options.cc b/src/common/options.cc
new file mode 100644
index 00000000..768d6505
--- /dev/null
+++ b/src/common/options.cc
@@ -0,0 +1,8633 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "acconfig.h"
+#include "options.h"
+#include "common/Formatter.h"
+
+// Helpers for validators
+#include "include/stringify.h"
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+#include <regex>
+
+// Definitions for enums
+#include "common/perf_counters.h"
+
+// rbd feature validation
+#include "librbd/Features.h"
+
+namespace {
+// Visitor that writes the alternative currently held by an Option::value_t
+// to an output stream.
+class printer : public boost::static_visitor<> {
+  ostream& stream;
+public:
+  explicit printer(ostream& os) : stream(os) {}
+
+  // an unset value prints nothing
+  void operator()(boost::blank) const {}
+
+  // booleans are spelled out instead of printed as 0/1
+  void operator()(bool flag) const {
+    if (flag) {
+      stream << "true";
+    } else {
+      stream << "false";
+    }
+  }
+
+  // doubles are written in fixed notation; the stream is switched back to
+  // the default float format afterwards
+  void operator()(double number) const {
+    stream << std::fixed << number << std::defaultfloat;
+  }
+
+  // sizes print their raw value
+  void operator()(const Option::size_t& sz) const {
+    stream << sz.value;
+  }
+
+  // durations print their tick count
+  void operator()(const std::chrono::seconds secs) const {
+    stream << secs.count();
+  }
+
+  // every other alternative relies on its own stream operator
+  template<typename T>
+  void operator()(const T& other) const {
+    stream << other;
+  }
+};
+}
+
+// Stream an option value by dispatching on the alternative it holds.
+ostream& operator<<(ostream& os, const Option::value_t& v) {
+  printer visitor{os};
+  boost::apply_visitor(visitor, v);
+  return os;
+}
+
+// Dump one value of this option under field_name, picking the Formatter
+// call that matches the option's declared type.
+void Option::dump_value(const char *field_name,
+    const Option::value_t &v, Formatter *f) const
+{
+  const bool unset = (boost::get<boost::blank>(&v) != nullptr);
+  if (unset) {
+    // This should be nil but Formatter doesn't allow it.
+    f->dump_string(field_name, "");
+    return;
+  }
+
+  if (type == TYPE_INT) {
+    f->dump_int(field_name, boost::get<int64_t>(v));
+  } else if (type == TYPE_UINT) {
+    f->dump_unsigned(field_name, boost::get<uint64_t>(v));
+  } else if (type == TYPE_STR) {
+    f->dump_string(field_name, boost::get<std::string>(v));
+  } else if (type == TYPE_FLOAT) {
+    f->dump_float(field_name, boost::get<double>(v));
+  } else if (type == TYPE_BOOL) {
+    f->dump_bool(field_name, boost::get<bool>(v));
+  } else {
+    // every remaining type goes through the value's stream operator
+    f->dump_stream(field_name) << v;
+  }
+}
+
+// Run the option's custom validator, if one is installed, on the raw string
+// value. Returns the validator's result, or 0 when there is no validator.
+int Option::pre_validate(std::string *new_value, std::string *err) const
+{
+  if (!validator) {
+    return 0;
+  }
+  return validator(new_value, err);
+}
+
+// Check a parsed value against the generic constraints of this option: the
+// minimum, the maximum and, for string options, the list of permitted
+// values. Returns 0 when the value passes; otherwise stores a message in
+// *err and returns -EINVAL.
+int Option::validate(const Option::value_t &new_value, std::string *err) const
+{
+  // a bound is only enforced when it holds something other than blank
+  const bool has_min = !boost::get<boost::blank>(&min);
+  if (has_min && new_value < min) {
+    std::ostringstream msg;
+    msg << "Value '" << new_value << "' is below minimum " << min;
+    *err = msg.str();
+    return -EINVAL;
+  }
+
+  const bool has_max = !boost::get<boost::blank>(&max);
+  if (has_max && new_value > max) {
+    std::ostringstream msg;
+    msg << "Value '" << new_value << "' exceeds maximum " << max;
+    *err = msg.str();
+    return -EINVAL;
+  }
+
+  // string options may be restricted to an enumerated set
+  if (type == Option::TYPE_STR && !enum_allowed.empty()) {
+    const auto last = enum_allowed.end();
+    if (std::find(enum_allowed.begin(), last,
+                  boost::get<std::string>(new_value)) == last) {
+      std::ostringstream msg;
+      msg << "'" << new_value << "' is not one of the permitted "
+                 "values: " << joinify(enum_allowed.begin(),
+                                       enum_allowed.end(),
+                                       std::string(", "));
+      *err = msg.str();
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Convert the textual form of a value into the typed variant of this option.
+ *
+ * The raw string is first passed through pre_validate() (which may rewrite
+ * it), then parsed according to the option's type, and finally checked with
+ * validate().
+ *
+ * @param raw_val the value as supplied by the user
+ * @param out [out] receives the parsed value
+ * @param error_message [out] receives a description when a parser or
+ * validator reports one; address and uuid parse failures set no message
+ * @param normalized_value [out] if non-null, receives the canonical string
+ * form of the parsed value
+ * @returns 0 on success, the non-zero result of pre_validate()/validate(),
+ * or -EINVAL when the string cannot be parsed
+ */
+int Option::parse_value(
+  const std::string& raw_val,
+  value_t *out,
+  std::string *error_message,
+  std::string *normalized_value) const
+{
+  std::string val = raw_val;
+
+  int r = pre_validate(&val, error_message);
+  if (r != 0) {
+    return r;
+  }
+
+  // The strict_* helpers report failure by filling *error_message, so a
+  // non-empty message after the call means the parse failed.
+  if (type == Option::TYPE_INT) {
+    int64_t f = strict_si_cast<int64_t>(val.c_str(), error_message);
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = f;
+  } else if (type == Option::TYPE_UINT) {
+    uint64_t f = strict_si_cast<uint64_t>(val.c_str(), error_message);
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = f;
+  } else if (type == Option::TYPE_STR) {
+    *out = val;
+  } else if (type == Option::TYPE_FLOAT) {
+    double f = strict_strtod(val.c_str(), error_message);
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = f;
+  } else if (type == Option::TYPE_BOOL) {
+    // accept "true"/"false" in any case, otherwise any integer (0 == false)
+    if (strcasecmp(val.c_str(), "false") == 0) {
+      *out = false;
+    } else if (strcasecmp(val.c_str(), "true") == 0) {
+      *out = true;
+    } else {
+      int b = strict_strtol(val.c_str(), 10, error_message);
+      if (!error_message->empty()) {
+        return -EINVAL;
+      }
+      *out = (b != 0);
+    }
+  } else if (type == Option::TYPE_ADDR) {
+    entity_addr_t addr;
+    if (!addr.parse(val.c_str())){
+      return -EINVAL;
+    }
+    *out = addr;
+  } else if (type == Option::TYPE_ADDRVEC) {
+    // This branch used to test TYPE_ADDR a second time, which made it
+    // unreachable and sent address-vector options to ceph_abort() below.
+    entity_addrvec_t addr;
+    if (!addr.parse(val.c_str())){
+      return -EINVAL;
+    }
+    *out = addr;
+  } else if (type == Option::TYPE_UUID) {
+    uuid_d uuid;
+    if (!uuid.parse(val.c_str())) {
+      return -EINVAL;
+    }
+    *out = uuid;
+  } else if (type == Option::TYPE_SIZE) {
+    Option::size_t sz{strict_iecstrtoll(val.c_str(), error_message)};
+    if (!error_message->empty()) {
+      return -EINVAL;
+    }
+    *out = sz;
+  } else if (type == Option::TYPE_SECS) {
+    try {
+      *out = parse_timespan(val);
+    } catch (const invalid_argument& e) {
+      *error_message = e.what();
+      return -EINVAL;
+    }
+  } else {
+    // an option type without a parser is a programming error
+    ceph_abort();
+  }
+
+  r = validate(*out, error_message);
+  if (r != 0) {
+    return r;
+  }
+
+  if (normalized_value) {
+    *normalized_value = to_str(*out);
+  }
+  return 0;
+}
+
+// Dump the full description of this option (name, type, level, texts,
+// defaults, tags, services, cross references, permitted values, bounds and
+// flags) into the given Formatter.
+void Option::dump(Formatter *f) const
+{
+  // writes an array section whose entries all share one key
+  auto dump_string_array = [f](const char *section, const char *key,
+                               const auto &entries) {
+    f->open_array_section(section);
+    for (const auto &entry : entries) {
+      f->dump_string(key, entry);
+    }
+    f->close_section();
+  };
+  // writes the label of a flag only when that flag is set on this option
+  auto dump_flag = [this, f](auto flag, const char *label) {
+    if (has_flag(flag)) {
+      f->dump_string("option", label);
+    }
+  };
+
+  f->dump_string("name", name);
+  f->dump_string("type", type_to_str(type));
+  f->dump_string("level", level_to_str(level));
+  f->dump_string("desc", desc);
+  f->dump_string("long_desc", long_desc);
+
+  dump_value("default", value, f);
+  dump_value("daemon_default", daemon_value, f);
+
+  dump_string_array("tags", "tag", tags);
+  dump_string_array("services", "service", services);
+  dump_string_array("see_also", "see_also", see_also);
+
+  // the permitted-values section is only emitted for string options
+  if (type == TYPE_STR) {
+    dump_string_array("enum_values", "enum_value", enum_allowed);
+  }
+
+  dump_value("min", min, f);
+  dump_value("max", max, f);
+
+  f->dump_bool("can_update_at_runtime", can_update_at_runtime());
+
+  f->open_array_section("flags");
+  dump_flag(FLAG_RUNTIME, "runtime");
+  dump_flag(FLAG_NO_MON_UPDATE, "no_mon_update");
+  dump_flag(FLAG_STARTUP, "startup");
+  dump_flag(FLAG_CLUSTER_CREATE, "cluster_create");
+  dump_flag(FLAG_CREATE, "create");
+  f->close_section();
+}
+
+// Return the string form of an option value, as produced by stringify().
+std::string Option::to_str(const Option::value_t& v)
+{
+  return stringify(v);
+}
+
+// Write a human-readable description of this option to *out: name and
+// short description, type and level, default(s), permitted values, bounds,
+// runtime updatability, services, tags, cross references and the long
+// description.
+void Option::print(ostream *out) const
+{
+  ostream& os = *out;
+
+  os << name << " - " << desc << "\n";
+  os << "  (" << type_to_str(type) << ", " << level_to_str(level) << ")\n";
+
+  // a separate daemon default is only listed when one has been set
+  const bool has_daemon_default = !boost::get<boost::blank>(&daemon_value);
+  if (has_daemon_default) {
+    os << "  Default (non-daemon): " << stringify(value) << "\n";
+    os << "  Default (daemon): " << stringify(daemon_value) << "\n";
+  } else {
+    os << "  Default: " << stringify(value) << "\n";
+  }
+
+  if (!enum_allowed.empty()) {
+    os << "  Possible values: ";
+    for (auto& allowed : enum_allowed) {
+      os << " " << stringify(allowed);
+    }
+    os << "\n";
+  }
+
+  // both bounds are printed whenever a minimum is set; the maximum is not
+  // tested on its own
+  const bool has_min = !boost::get<boost::blank>(&min);
+  if (has_min) {
+    os << "  Minimum: " << stringify(min) << "\n"
+       << "  Maximum: " << stringify(max) << "\n";
+  }
+
+  os << "  Can update at runtime: "
+     << (can_update_at_runtime() ? "true" : "false") << "\n";
+
+  if (!services.empty()) {
+    os << "  Services: " << services << "\n";
+  }
+  if (!tags.empty()) {
+    os << "  Tags: " << tags << "\n";
+  }
+  if (!see_also.empty()) {
+    os << "  See also: " << see_also << "\n";
+  }
+
+  if (long_desc.size()) {
+    os << "\n" << long_desc << "\n";
+  }
+}
+
+// Literal suffixes for the option table below.
+//
+// _min, _hr and _day turn a count of minutes, hours or days into seconds;
+// _K, _M and _G multiply by 2^10, 2^20 and 2^30.
+//
+// The suffix is written directly after the quotes (operator""_K). With a
+// space in between, the suffix is an ordinary identifier, and _K, _M and _G
+// (underscore followed by an uppercase letter) are names reserved to the
+// implementation; that spelling is also deprecated in newer C++ standards.
+// Uses such as 5_min or 4_K are unaffected.
+constexpr unsigned long long operator""_min(unsigned long long min) {
+  return min * 60;
+}
+constexpr unsigned long long operator""_hr(unsigned long long hr) {
+  return hr * 60 * 60;
+}
+constexpr unsigned long long operator""_day(unsigned long long day) {
+  return day * 60 * 60 * 24;
+}
+constexpr unsigned long long operator""_K(unsigned long long n) {
+  return n << 10;
+}
+constexpr unsigned long long operator""_M(unsigned long long n) {
+  return n << 20;
+}
+constexpr unsigned long long operator""_G(unsigned long long n) {
+  return n << 30;
+}
+
+std::vector<Option> get_global_options() {
+  return std::vector<Option>({
+    Option("host", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_description("local hostname")
+    .set_long_description("if blank, ceph assumes the short hostname (hostname -s)")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .add_service("common")
+    .add_tag("network"),
+
+    Option("fsid", Option::TYPE_UUID, Option::LEVEL_BASIC)
+    .set_description("cluster fsid (uuid)")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service("common")
+    .add_tag("service"),
+
+    Option("public_addr", Option::TYPE_ADDR, Option::LEVEL_BASIC)
+    .set_description("public-facing address to bind to")
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service({"mon", "mds", "osd", "mgr"}),
+
+    Option("public_bind_addr", Option::TYPE_ADDR, Option::LEVEL_ADVANCED)
+    .set_default(entity_addr_t())
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("cluster_addr", Option::TYPE_ADDR, Option::LEVEL_BASIC)
+    .set_description("cluster-facing address to bind to")
+    .add_service("osd")
+    .set_flag(Option::FLAG_STARTUP)
+    .add_tag("network"),
+
+    Option("public_network", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .add_service({"mon", "mds", "osd", "mgr"})
+    .set_flag(Option::FLAG_STARTUP)
+    .add_tag("network")
+    .set_description("Network(s) from which to choose a public address to bind to"),
+
+    Option("public_network_interface", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .add_service({"mon", "mds", "osd", "mgr"})
+    .add_tag("network")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Interface name(s) from which to choose an address from a public_network to bind to; public_network must also be specified.")
+    .add_see_also("public_network"),
+
+    Option("cluster_network", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .add_service("osd")
+    .set_flag(Option::FLAG_STARTUP)
+    .add_tag("network")
+    .set_description("Network(s) from which to choose a cluster address to bind to"),
+
+    Option("cluster_network_interface", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .add_service({"mon", "mds", "osd", "mgr"})
+    .set_flag(Option::FLAG_STARTUP)
+    .add_tag("network")
+    .set_description("Interface name(s) from which to choose an address from a cluster_network to bind to; cluster_network must also be specified.")
+    .add_see_also("cluster_network"),
+
+    Option("monmap", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_description("path to MonMap file")
+    .set_long_description("This option is normally used during mkfs, but can also "
+  			"be used to identify which monitors to connect to.")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .add_service("mon")
+    .set_flag(Option::FLAG_CREATE),
+
+    Option("mon_host", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_description("list of hosts or addresses to search for a monitor")
+    .set_long_description("This is a comma, whitespace, or semicolon separated "
+  			"list of IP addresses or hostnames. Hostnames are "
+  			"resolved via DNS and all A or AAAA records are "
+  			"included in the search list.")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service("common"),
+
+    Option("mon_host_override", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_description("monitor(s) to use overriding the MonMap")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service("common"),
+
+    Option("mon_dns_srv_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("ceph-mon")
+    .set_description("name of DNS SRV record to check for monitor addresses")
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service("common")
+    .add_tag("network")
+    .add_see_also("mon_host"),
+
+    // lockdep
+    Option("lockdep", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_description("enable lockdep lock dependency analyzer")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service("common"),
+
+    Option("lockdep_force_backtrace", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_description("always gather current backtrace at every lock")
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service("common")
+    .add_see_also("lockdep"),
+
+    Option("run_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/var/run/ceph")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("path for the 'run' directory for storing pid and socket files")
+    .add_service("common")
+    .add_see_also("admin_socket"),
+
+    Option("admin_socket", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_daemon_default("$run_dir/$cluster-$name.asok")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("path for the runtime control socket file, used by the 'ceph daemon' command")
+    .add_service("common"),
+
+    Option("admin_socket_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_description("file mode to set for the admin socket file, e.g, '0755'")
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service("common")
+    .add_see_also("admin_socket"),
+
+    // daemon
+    Option("daemonize", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_daemon_default(true)
+    .set_description("whether to daemonize (background) after startup")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .add_service({"mon", "mgr", "osd", "mds"})
+    .add_tag("service")
+    .add_see_also({"pid_file", "chdir"}),
+
+    Option("setuser", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("uid or user name to switch to on startup")
+    .set_long_description("This is normally specified by the systemd unit file.")
+    .add_service({"mon", "mgr", "osd", "mds"})
+    .add_tag("service")
+    .add_see_also("setgroup"),
+
+    Option("setgroup", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("gid or group name to switch to on startup")
+    .set_long_description("This is normally specified by the systemd unit file.")
+    .add_service({"mon", "mgr", "osd", "mds"})
+    .add_tag("service")
+    .add_see_also("setuser"),
+
+    Option("setuser_match_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("if set, setuser/setgroup is condition on this path matching ownership")
+    .set_long_description("If setuser or setgroup are specified, and this option is non-empty, then the uid/gid of the daemon will only be changed if the file or directory specified by this option has a matching uid and/or gid.  This exists primarily to allow switching to user ceph for OSDs to be conditional on whether the osd data contents have also been chowned after an upgrade.  This is normally specified by the systemd unit file.")
+    .add_service({"mon", "mgr", "osd", "mds"})
+    .add_tag("service")
+    .add_see_also({"setuser", "setgroup"}),
+
+    Option("pid_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("path to write a pid file (if any)")
+    .add_service({"mon", "mgr", "osd", "mds"})
+    .add_tag("service"),
+
+    Option("chdir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_description("path to chdir(2) to after daemonizing")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .add_service({"mon", "mgr", "osd", "mds"})
+    .add_tag("service")
+    .add_see_also("daemonize"),
+
+    Option("fatal_signal_handlers", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("whether to register signal handlers for SIGABRT etc that dump a stack trace")
+    .set_long_description("This is normally true for daemons and values for libraries.")
+    .add_service({"mon", "mgr", "osd", "mds"})
+    .add_tag("service"),
+
+    Option("crash_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_default("/var/lib/ceph/crash")
+    .set_description("Directory where crash reports are archived"),
+
+    // restapi
+    Option("restapi_log_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_description("default set by python code"),
+
+    Option("restapi_base_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_description("default set by python code"),
+
+    Option("erasure_code_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(CEPH_PKGLIBDIR"/erasure-code")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("directory where erasure-code plugins can be found")
+    .add_service({"mon", "osd"}),
+
+    // logging
+    Option("log_file", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("")
+    .set_daemon_default("/var/log/ceph/$cluster-$name.log")
+    .set_description("path to log file")
+    .add_see_also({"log_to_file",
+		   "log_to_stderr",
+                   "err_to_stderr",
+                   "log_to_syslog",
+                   "err_to_syslog"}),
+
+    Option("log_max_new", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("max unwritten log entries to allow before waiting to flush to the log")
+    .add_see_also("log_max_recent"),
+
+    Option("log_max_recent", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .set_daemon_default(10000)
+    .set_description("recent log entries to keep in memory to dump in the event of a crash")
+    .set_long_description("The purpose of this option is to log at a higher debug level only to the in-memory buffer, and write out the detailed log messages only if there is a crash.  Only log entries below the lower log level will be written unconditionally to the log.  For example, debug_osd=1/5 will write everything <= 1 to the log unconditionally but keep entries at levels 2-5 in memory.  If there is a seg fault or assertion failure, all entries will be dumped to the log."),
+
+    Option("log_to_file", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(true)
+    .set_description("send log lines to a file")
+    .add_see_also("log_file"),
+
+    Option("log_to_stderr", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(true)
+    .set_daemon_default(false)
+    .set_description("send log lines to stderr"),
+
+    Option("err_to_stderr", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(false)
+    .set_daemon_default(true)
+    .set_description("send critical error log lines to stderr"),
+
+    Option("log_stderr_prefix", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_description("String to prefix log messages with when sent to stderr")
+    .set_long_description("This is useful in container environments when combined with mon_cluster_log_to_stderr.  The mon log prefixes each line with the channel name (e.g., 'default', 'audit'), while log_stderr_prefix can be set to 'debug '.")
+    .add_see_also("mon_cluster_log_to_stderr"),
+
+    Option("log_to_syslog", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(false)
+    .set_description("send log lines to syslog facility"),
+
+    Option("err_to_syslog", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(false)
+    .set_description("send critical error log lines to syslog facility"),
+
+    Option("log_flush_on_exit", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("set a process exit handler to ensure the log is flushed on exit"),
+
+    Option("log_stop_at_utilization", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(.97)
+    .set_min_max(0.0, 1.0)
+    .set_description("stop writing to the log file when device utilization reaches this ratio")
+    .add_see_also("log_file"),
+
+    Option("log_to_graylog", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(false)
+    .set_description("send log lines to remote graylog server")
+    .add_see_also({"err_to_graylog",
+                   "log_graylog_host",
+                   "log_graylog_port"}),
+
+    Option("err_to_graylog", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(false)
+    .set_description("send critical error log lines to remote graylog server")
+    .add_see_also({"log_to_graylog",
+                   "log_graylog_host",
+                   "log_graylog_port"}),
+
+    Option("log_graylog_host", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("127.0.0.1")
+    .set_description("address or hostname of graylog server to log to")
+    .add_see_also({"log_to_graylog",
+                   "err_to_graylog",
+                   "log_graylog_port"}),
+
+    Option("log_graylog_port", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(12201)
+    .set_description("port number for the remote graylog server")
+    .add_see_also("log_graylog_host"),
+
+    Option("log_coarse_timestamps", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("timestamp log entries from coarse system clock "
+		     "to improve performance")
+    .add_service("common")
+    .add_tag("performance")
+    .add_tag("service"),
+
+
+    // unmodified
+    Option("clog_to_monitors", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("default=true")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Make daemons send cluster log messages to monitors"),
+
+    Option("clog_to_syslog", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("false")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Make daemons send cluster log messages to syslog"),
+
+    Option("clog_to_syslog_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("info")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Syslog level for cluster log messages")
+    .add_see_also("clog_to_syslog"),
+
+    Option("clog_to_syslog_facility", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("default=daemon audit=local0")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Syslog facility for cluster log messages")
+    .add_see_also("clog_to_syslog"),
+
+    Option("clog_to_graylog", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("false")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Make daemons send cluster log to graylog"),
+
+    Option("clog_to_graylog_host", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("127.0.0.1")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Graylog host for cluster log messages")
+    .add_see_also("clog_to_graylog"),
+
+    Option("clog_to_graylog_port", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("12201")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Graylog port number for cluster log messages")
+    .add_see_also("clog_to_graylog"),
+
+    Option("mon_cluster_log_to_stderr", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mon")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Make monitor send cluster log messages to stderr (prefixed by channel)")
+    .add_see_also("log_stderr_prefix"),
+
+    Option("mon_cluster_log_to_syslog", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("default=false")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("Make monitor send cluster log messages to syslog"),
+
+    Option("mon_cluster_log_to_syslog_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("info")
+    .add_service("mon")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Syslog level for cluster log messages")
+    .add_see_also("mon_cluster_log_to_syslog"),
+
+    Option("mon_cluster_log_to_syslog_facility", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("daemon")
+    .add_service("mon")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Syslog facility for cluster log messages")
+    .add_see_also("mon_cluster_log_to_syslog"),
+
+    Option("mon_cluster_log_to_file", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("Make monitor send cluster log messages to file")
+    .add_see_also("mon_cluster_log_file"),
+
+    Option("mon_cluster_log_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("default=/var/log/ceph/$cluster.$channel.log cluster=/var/log/ceph/$cluster.log")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("File(s) to write cluster log to")
+    .set_long_description("This can either be a simple file name to receive all messages, or a list of key/value pairs where the key is the log channel and the value is the filename, which may include $cluster and $channel metavariables")
+    .add_see_also("mon_cluster_log_to_file"),
+
+    Option("mon_cluster_log_file_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("debug")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("Lowest level to include in cluster log file")
+    .add_see_also("mon_cluster_log_file"),
+
+    Option("mon_cluster_log_to_graylog", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("false")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("Make monitor send cluster log to graylog"),
+
+    Option("mon_cluster_log_to_graylog_host", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("127.0.0.1")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("Graylog host for cluster log messages")
+    .add_see_also("mon_cluster_log_to_graylog"),
+
+    Option("mon_cluster_log_to_graylog_port", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("12201")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("Graylog port for cluster log messages")
+    .add_see_also("mon_cluster_log_to_graylog"),
+
+    Option("enable_experimental_unrecoverable_data_corrupting_features", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_default("")
+    .set_description("Enable named (or all with '*') experimental features that may be untested, dangerous, and/or cause permanent data loss"),
+
+    Option("plugin_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(CEPH_PKGLIBDIR)
+    .set_flag(Option::FLAG_STARTUP)
+    .add_service({"mon", "osd"})
+    .set_description("Base directory for dynamically loaded plugins"),
+
+    // XIO
+    Option("xio_trace_mempool", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("xio_trace_msgcnt", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("xio_trace_xcon", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("xio_queue_depth", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(128)
+    .set_description(""),
+
+    Option("xio_mp_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(128)
+    .set_description(""),
+
+    Option("xio_mp_max_64", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(65536)
+    .set_description(""),
+
+    Option("xio_mp_max_256", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(8192)
+    .set_description(""),
+
+    Option("xio_mp_max_1k", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(8192)
+    .set_description(""),
+
+    Option("xio_mp_max_page", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4096)
+    .set_description(""),
+
+    Option("xio_mp_max_hint", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4096)
+    .set_description(""),
+
+    Option("xio_portal_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description(""),
+
+    Option("xio_max_conns_per_portal", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description(""),
+
+    Option("xio_transport_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("rdma")
+    .set_description(""),
+
+    Option("xio_max_send_inline", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(512)
+    .set_description(""),
+
+    // Compressor
+    Option("compressor_zlib_isal", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Use Intel ISA-L accelerated zlib implementation if available"),
+
+    Option("compressor_zlib_level", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Zlib compression level to use"),
+
+    Option("compressor_zstd_level", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Zstd compression level to use"),
+
+    Option("qat_compressor_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Enable Intel QAT acceleration support for compression if available"),
+
+    Option("plugin_crypto_accelerator", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("crypto_isal")
+    .set_description("Crypto accelerator library to use"),
+
+    Option("mempool_debug", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_description(""),
+
+    Option("thp", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("enable transparent huge page (THP) support")
+    .set_long_description("Ceph is known to suffer from memory fragmentation due to THP use. This is indicated by RSS usage above configured memory targets. Enabling THP is currently discouraged until selective use of THP by Ceph is implemented."),
+
+    Option("key", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Authentication key")
+    .set_long_description("A CephX authentication key, base64 encoded.  It normally looks something like 'AQAtut9ZdMbNJBAAHz6yBAWyJyz2yYRyeMWDag=='.")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .add_see_also("keyfile")
+    .add_see_also("keyring"),
+
+    Option("keyfile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Path to a file containing a key")
+    .set_long_description("The file should contain a CephX authentication key and optionally a trailing newline, but nothing else.")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .add_see_also("key"),
+
+    Option("keyring", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(
+      "/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,"
+      "/etc/ceph/keyring,/etc/ceph/keyring.bin,"
+  #if defined(__FreeBSD__)  // same macro spelling as the other FreeBSD checks in this list; __FreeBSD left these paths compiled out
+      "/usr/local/etc/ceph/$cluster.$name.keyring,"
+      "/usr/local/etc/ceph/$cluster.keyring,"
+      "/usr/local/etc/ceph/keyring,/usr/local/etc/ceph/keyring.bin,"
+  #endif
+    )
+    .set_description("Path to a keyring file.")
+    .set_long_description("A keyring file is an INI-style formatted file where the section names are client or daemon names (e.g., 'osd.0') and each section contains a 'key' property with CephX authentication key as the value.")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .add_see_also("key")
+    .add_see_also("keyfile"),
+
+    Option("heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Frequency of internal heartbeat checks (seconds)"),
+
+    Option("heartbeat_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("File to touch on successful internal heartbeat")
+    .set_long_description("If set, this file will be touched every time an internal heartbeat check succeeds.")
+    .add_see_also("heartbeat_interval"),
+
+    Option("heartbeat_inject_failure", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("perf", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enable internal performance metrics")
+    .set_long_description("If enabled, collect and expose internal health metrics"),
+
+    Option("ms_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_default("async+posix")
+    .set_description("Messenger implementation to use for network communication"),
+
+    Option("ms_public_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Messenger implementation to use for the public network")
+    .set_long_description("If not specified, use ms_type")
+    .add_see_also("ms_type"),
+
+    Option("ms_cluster_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Messenger implementation to use for the internal cluster network")
+    .set_long_description("If not specified, use ms_type")
+    .add_see_also("ms_type"),
+
+    Option("ms_mon_cluster_mode", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("secure crc")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Connection modes (crc, secure) for intra-mon connections in order of preference")
+    .add_see_also("ms_mon_service_mode")
+    .add_see_also("ms_mon_client_mode")
+    .add_see_also("ms_service_mode")
+    .add_see_also("ms_cluster_mode")
+    .add_see_also("ms_client_mode"),
+
+    Option("ms_mon_service_mode", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("secure crc")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Allowed connection modes (crc, secure) for connections to mons")
+    .add_see_also("ms_service_mode")
+    .add_see_also("ms_mon_cluster_mode")
+    .add_see_also("ms_mon_client_mode")
+    .add_see_also("ms_cluster_mode")
+    .add_see_also("ms_client_mode"),
+
+    Option("ms_mon_client_mode", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("secure crc")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Connection modes (crc, secure) for connections from clients to monitors in order of preference")
+    .add_see_also("ms_mon_service_mode")
+    .add_see_also("ms_mon_cluster_mode")
+    .add_see_also("ms_service_mode")
+    .add_see_also("ms_cluster_mode")
+    .add_see_also("ms_client_mode"),
+
+    Option("ms_cluster_mode", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("crc secure")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Connection modes (crc, secure) for intra-cluster connections in order of preference")
+    .add_see_also("ms_service_mode")
+    .add_see_also("ms_client_mode"),
+
+    Option("ms_service_mode", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("crc secure")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Allowed connection modes (crc, secure) for connections to daemons")
+    .add_see_also("ms_cluster_mode")
+    .add_see_also("ms_client_mode"),
+
+    Option("ms_client_mode", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("crc secure")
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("Connection modes (crc, secure) for connections from clients in order of preference")
+    .add_see_also("ms_cluster_mode")
+    .add_see_also("ms_service_mode"),
+
+    Option("ms_learn_addr_from_peer", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Learn address from what IP our first peer thinks we connect from")
+    .set_long_description("Use the IP address our first peer (usually a monitor) sees that we are connecting from.  This is useful if a client is behind some sort of NAT and we want to see it identified by its local (not NATed) address."),
+
+    Option("ms_tcp_nodelay", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Disable Nagle's algorithm and send queued network traffic immediately"),
+
+    Option("ms_tcp_rcvbuf", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Size of TCP socket receive buffer"),
+
+    Option("ms_tcp_prefetch_max_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4_K)
+    .set_description("Maximum amount of data to prefetch out of the socket receive buffer"),
+
+    Option("ms_initial_backoff", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.2)
+    .set_description("Initial backoff after a network error is detected (seconds)"),
+
+    Option("ms_max_backoff", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(15.0)
+    .set_description("Maximum backoff after a network error before retrying (seconds)")
+    .add_see_also("ms_initial_backoff"),
+
+    Option("ms_crc_data", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("Set and/or verify crc32c checksum on data payload sent over network"),
+
+    Option("ms_crc_header", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("Set and/or verify crc32c checksum on header payload sent over network"),
+
+    Option("ms_die_on_bad_msg", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Induce a daemon crash/exit when a bad network message is received"),
+
+    Option("ms_die_on_unhandled_msg", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Induce a daemon crash/exit when an unrecognized message is received"),
+
+    Option("ms_die_on_old_message", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Induce a daemon crash/exit when an old, undecodable message is received"),
+
+    Option("ms_die_on_skipped_message", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Induce a daemon crash/exit if sender skips a message sequence number"),
+
+    Option("ms_die_on_bug", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Induce a crash/exit on various bugs (for testing purposes)"),
+
+    Option("ms_dispatch_throttle_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(100_M)
+    .set_description("Limit messages that are read off the network but still being processed"),
+
+    Option("ms_bind_ipv4", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Bind servers to IPv4 address(es)")
+    .add_see_also("ms_bind_ipv6"),
+
+    Option("ms_bind_ipv6", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Bind servers to IPv6 address(es)")
+    .add_see_also("ms_bind_ipv4"),
+
+    Option("ms_bind_prefer_ipv4", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Prefer IPV4 over IPV6 address(es)"),
+
+    Option("ms_bind_msgr1", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Bind servers to msgr1 (legacy) protocol address(es)")
+    .add_see_also("ms_bind_msgr2"),
+
+    Option("ms_bind_msgr2", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Bind servers to msgr2 (nautilus+) protocol address(es)")
+    .add_see_also("ms_bind_msgr1"),
+
+    Option("ms_bind_port_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(6800)
+    .set_description("Lowest port number to bind daemon(s) to"),
+
+    Option("ms_bind_port_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(7300)
+    .set_description("Highest port number to bind daemon(s) to"),
+
+    Option("ms_bind_retry_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+  #if !defined(__FreeBSD__)
+    .set_default(3)
+  #else
+    // FreeBSD does not use SO_REUSEADDR, so allow for a bit more time by default
+    .set_default(6)
+  #endif
+    .set_description("Number of attempts to make while bind(2)ing to a port"),
+
+    Option("ms_bind_retry_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+  #if !defined(__FreeBSD__)
+    .set_default(5)
+  #else
+    // FreeBSD does not use SO_REUSEADDR, so allow for a bit more time by default
+    .set_default(6)
+  #endif
+    .set_description("Delay between bind(2) attempts (seconds)"),
+
+    Option("ms_bind_before_connect", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Call bind(2) on client sockets"),
+
+    Option("ms_tcp_listen_backlog", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(512)
+    .set_description("Size of queue of incoming connections for accept(2)"),
+
+    Option("ms_rwthread_stack_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .set_description("Size of stack for SimpleMessenger read/write threads"),
+
+    Option("ms_connection_ready_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("Time before we declare a not yet ready connection as dead (seconds)"),
+
+    Option("ms_connection_idle_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(900)
+    .set_description("Time before an idle connection is closed (seconds)"),
+
+    Option("ms_pq_max_tokens_per_priority", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(16777216)
+    .set_description(""),
+
+    Option("ms_pq_min_cost", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(65536)
+    .set_description(""),
+
+    Option("ms_inject_socket_failures", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Inject a socket failure every Nth socket operation"),
+
+    Option("ms_inject_delay_type", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description("Entity type to inject delays for")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("ms_inject_delay_msg_type", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description("Message type to inject delays for"),
+
+    Option("ms_inject_delay_max", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1)
+    .set_description("Max delay to inject"),
+
+    Option("ms_inject_delay_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("ms_inject_internal_delays", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Inject various internal delays to induce races (seconds)"),
+
+    Option("ms_dump_on_send", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Hexdump message to debug log on message send"),
+
+    Option("ms_dump_corrupt_message_level", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Log level at which to hexdump corrupt messages we receive"),
+
+    Option("ms_async_op_threads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_min_max(1, 24)
+    .set_description("Threadpool size for AsyncMessenger (ms_type=async)"),
+
+    Option("ms_async_max_op_threads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Maximum threadpool size of AsyncMessenger")
+    .add_see_also("ms_async_op_threads"),
+
+    Option("ms_async_rdma_device_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("ms_async_rdma_enable_hugepage", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("ms_async_rdma_buffer_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128_K)
+    .set_description(""),
+
+    Option("ms_async_rdma_send_buffers", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1_K)
+    .set_description(""),
+
+    Option("ms_async_rdma_receive_buffers", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(32768)
+    .set_description(""),
+
+    Option("ms_async_rdma_receive_queue_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(4096)
+    .set_description(""),
+
+    Option("ms_async_rdma_support_srq", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("ms_async_rdma_port_num", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description(""),
+
+    Option("ms_async_rdma_polling_us", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description(""),
+
+    Option("ms_async_rdma_local_gid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("ms_async_rdma_roce_ver", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description(""),
+
+    Option("ms_async_rdma_sl", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_description(""),
+
+    Option("ms_async_rdma_dscp", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(96)
+    .set_description(""),
+
+    Option("ms_max_accept_failures", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("The maximum number of consecutive failed accept() calls before "
+                     "considering the daemon is misconfigured and abort it."),
+
+    Option("ms_async_rdma_cm", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("ms_async_rdma_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("ib")
+    .set_description(""),
+
+    Option("ms_dpdk_port_id", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("ms_dpdk_coremask", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("0xF")         //begin with 0x for the string
+    .set_description("")
+    .add_see_also("ms_async_op_threads"),
+
+    Option("ms_dpdk_memory_channel", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("4")
+    .set_description(""),
+
+    Option("ms_dpdk_hugepages", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("ms_dpdk_pmd", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("ms_dpdk_host_ipv4_addr", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("ms_dpdk_gateway_ipv4_addr", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("ms_dpdk_netmask_ipv4_addr", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("ms_dpdk_lro", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("ms_dpdk_hw_flow_control", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("ms_dpdk_hw_queue_weight", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description(""),
+
+    Option("ms_dpdk_debug_allow_loopback", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("ms_dpdk_rx_buffer_count_per_core", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(8192)
+    .set_description(""),
+
+    Option("inject_early_sigterm", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("send ourselves a SIGTERM early during startup"),
+
+    // MON
+    Option("mon_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("enable/disable MON op tracking"),
+
+    Option("mon_op_complaint_time", Option::TYPE_SECS, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .add_service("mon")
+    .set_description("time after which to consider a monitor operation blocked "
+                     "after no updates"),
+
+    Option("mon_op_log_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("max number of slow ops to display"),
+
+    Option("mon_op_history_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .add_service("mon")
+    .set_description("max number of completed ops to track"),
+
+    Option("mon_op_history_duration", Option::TYPE_SECS, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .add_service("mon")
+    .set_description("expiration time in seconds of historical MON OPS"),
+
+    Option("mon_op_history_slow_op_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .add_service("mon")
+    .set_description("max number of slow historical MON OPS to keep"),
+
+    Option("mon_op_history_slow_op_threshold", Option::TYPE_SECS, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .add_service("mon")
+    .set_description("duration of an op to be considered as a historical slow op"),
+
+    Option("mon_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_default("/var/lib/ceph/mon/$cluster-$id")
+    .add_service("mon")
+    .set_description("path to mon database"),
+
+    Option("mon_initial_members", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .add_service("mon")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description(""),
+
+    Option("mon_compact_on_start", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("mon_compact_on_bootstrap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("mon_compact_on_trim", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description(""),
+
+    /* -- mon: osdmap prune (begin) -- */
+    Option("mon_osdmap_full_prune_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("enables pruning full osdmap versions when we go over a given number of maps")
+    .add_see_also("mon_osdmap_full_prune_min")
+    .add_see_also("mon_osdmap_full_prune_interval")
+    .add_see_also("mon_osdmap_full_prune_txsize"),
+
+    Option("mon_osdmap_full_prune_min", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .add_service("mon")
+    .set_description("minimum number of versions in the store to trigger full map pruning")
+    .add_see_also("mon_osdmap_full_prune_enabled")
+    .add_see_also("mon_osdmap_full_prune_interval")
+    .add_see_also("mon_osdmap_full_prune_txsize"),
+
+    Option("mon_osdmap_full_prune_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .add_service("mon")
+    .set_description("interval between maps that will not be pruned; maps in the middle will be pruned.")
+    .add_see_also("mon_osdmap_full_prune_enabled")
+    .add_see_also("mon_osdmap_full_prune_min")
+    .add_see_also("mon_osdmap_full_prune_txsize"),
+
+    Option("mon_osdmap_full_prune_txsize", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .add_service("mon")
+    .set_description("number of maps we will prune per iteration")
+    .add_see_also("mon_osdmap_full_prune_enabled")
+    .add_see_also("mon_osdmap_full_prune_min")
+    .add_see_also("mon_osdmap_full_prune_interval"),
+    /* -- mon: osdmap prune (end) -- */
+
+    Option("mon_osd_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .add_service("mon")
+    .set_description("maximum number of OSDMaps to cache in memory"),
+
+    Option("mon_osd_cache_size_min", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128_M)
+    .add_service("mon")
+    .set_description("The minimum amount of bytes to be kept mapped in memory for osd monitor caches."),
+
+    Option("mon_memory_target", Option::TYPE_SIZE, Option::LEVEL_BASIC)
+    .set_default(2_G)
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("The amount of bytes pertaining to osd monitor caches and kv cache to be kept mapped in memory with cache auto-tuning enabled"),
+
+    Option("mon_memory_autotune", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(true)
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("Autotune the cache memory being used for osd monitors and kv database"),
+
+    Option("mon_cpu_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .add_service("mon")
+    .set_description("worker threads for CPU intensive background work"),
+
+    Option("mon_osd_mapping_pgs_per_chunk", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(4096)
+    .add_service("mon")
+    .set_description("granularity of PG placement calculation background work"),
+
+    Option("mon_clean_pg_upmaps_per_chunk", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(256)
+    .add_service("mon")
+    .set_description("granularity of PG upmap validation background work"),
+
+    Option("mon_osd_max_creating_pgs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .add_service("mon")
+    .set_description("maximum number of PGs the mon will create at once"),
+
+    Option("mon_osd_max_initial_pgs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .add_service("mon")
+    .set_description("maximum number of PGs a pool will be created with")
+    .set_long_description("If the user specifies more PGs than this, the cluster will subsequently split PGs after the pool is created in order to reach the target."),
+
+    Option("mon_tick_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("interval for internal mon background checks"),
+
+    Option("mon_session_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(300)
+    .add_service("mon")
+    .set_description("close inactive mon client connections after this many seconds"),
+
+    Option("mon_subscribe_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1_day)
+    .add_service("mon")
+    .set_description("subscribe interval for pre-jewel clients"),
+
+    Option("mon_delta_reset_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .add_service("mon")
+    .set_description("window duration for rate calculations in 'ceph status'"),
+
+    Option("mon_osd_laggy_halflife", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1_hr)
+    .add_service("mon")
+    .set_description("halflife of OSD 'lagginess' factor"),
+
+    Option("mon_osd_laggy_weight", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.3)
+    .set_min_max(0.0, 1.0)
+    .add_service("mon")
+    .set_description("how heavily to weight OSD marking itself back up in overall laggy_probability")
+    .set_long_description("1.0 means that an OSD marking itself back up (because it was marked down but not actually dead) means a 100% laggy_probability; 0.0 effectively disables tracking of laggy_probability."),
+
+    Option("mon_osd_laggy_max_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(300)
+    .add_service("mon")
+    .set_description("cap value for period for OSD to be marked for laggy_interval calculation"),
+
+    Option("mon_osd_adjust_heartbeat_grace", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("increase OSD heartbeat grace if peers appear to be laggy")
+    .set_long_description("If an OSD is marked down but then marks itself back up, it implies it wasn't actually down but was unable to respond to heartbeats.  If this option is true, we can use the laggy_probability and laggy_interval values calculated to model this situation to increase the heartbeat grace period for this OSD so that it isn't marked down again.  laggy_probability is an estimated probability that the given OSD is down because it is laggy (not actually down), and laggy_interval is an estimate on how long it stays down when it is laggy.")
+    .add_see_also("mon_osd_laggy_halflife")
+    .add_see_also("mon_osd_laggy_weight")
+    .add_see_also("mon_osd_laggy_max_interval"),
+
+    Option("mon_osd_adjust_down_out_interval", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("increase the mon_osd_down_out_interval if an OSD appears to be laggy")
+    .add_see_also("mon_osd_adjust_heartbeat_grace"),
+
+    Option("mon_osd_auto_mark_in", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("mark any OSD that comes up 'in'"),
+
+    Option("mon_osd_auto_mark_auto_out_in", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("mark any OSD that comes up that was automatically marked 'out' back 'in'")
+    .add_see_also("mon_osd_down_out_interval"),
+
+    Option("mon_osd_auto_mark_new_in", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("mark any new OSD that comes up 'in'"),
+
+    Option("mon_osd_destroyed_out_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .add_service("mon")
+    .set_description("mark any OSD 'out' that has been 'destroy'ed for this long (seconds)"),
+
+    Option("mon_osd_down_out_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .add_service("mon")
+    .set_description("mark any OSD 'out' that has been 'down' for this long (seconds)"),
+
+    Option("mon_osd_down_out_subtree_limit", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("rack")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon")
+    .set_description("do not automatically mark OSDs 'out' if an entire subtree of this size is down")
+    .add_see_also("mon_osd_down_out_interval"),
+
+    Option("mon_osd_min_up_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.3)
+    .add_service("mon")
+    .set_description("do not automatically mark OSDs 'out' if fewer than this many OSDs are 'up'")
+    .add_see_also("mon_osd_down_out_interval"),
+
+    Option("mon_osd_min_in_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.75)
+    .add_service("mon")
+    .set_description("do not automatically mark OSDs 'out' if fewer than this many OSDs are 'in'")
+    .add_see_also("mon_osd_down_out_interval"),
+
+    Option("mon_osd_warn_op_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .add_service("mgr")
+    .set_description("issue REQUEST_SLOW health warning if OSD ops are slower than this age (seconds)"),
+
+    Option("mon_osd_warn_num_repaired", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .add_service("mon")
+    .set_description("issue OSD_TOO_MANY_REPAIRS health warning if an OSD has more than this many read repairs"),
+
+    Option("mon_osd_err_op_age_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(128)
+    .add_service("mgr")
+    .set_description("issue REQUEST_STUCK health error if OSD ops are slower than this age (seconds)"),
+
+    Option("mon_osd_prime_pg_temp", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("minimize peering work by priming pg_temp values after a map change"),
+
+    Option("mon_osd_prime_pg_temp_max_time", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.5)
+    .add_service("mon")
+    .set_description("maximum time to spend precalculating PG mappings on map change (seconds)"),
+
+    Option("mon_osd_prime_pg_temp_max_estimate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.25)
+    .add_service("mon")
+    .set_description("calculate all PG mappings if estimated fraction of PGs that change is above this amount"),
+
+    Option("mon_stat_smooth_intervals", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(6)
+    .set_min(1)
+    .add_service("mgr")
+    .set_description("number of PGMaps stats over which we calc the average read/write throughput of the whole cluster"),
+
+    Option("mon_election_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("maximum time for a mon election (seconds)"),
+
+    Option("mon_lease", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("lease interval between quorum monitors (seconds)")
+    .set_long_description("This setting controls how sensitive your mon quorum is to intermittent network issues or other failures."),
+
+    Option("mon_lease_renew_interval_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.6)
+    .set_min_max(0.0, .9999999)
+    .add_service("mon")
+    .set_description("multiple of mon_lease for the lease renewal interval")
+    .set_long_description("Leases must be renewed before they time out.  A smaller value means frequent renewals, while a value close to 1 makes a lease expiration more likely.")
+    .add_see_also("mon_lease"),
+
+    Option("mon_lease_ack_timeout_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2.0)
+    .set_min_max(1.0001, 100.0)
+    .add_service("mon")
+    .set_description("multiple of mon_lease for the lease ack interval before calling new election")
+    .add_see_also("mon_lease"),
+
+    Option("mon_accept_timeout_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2.0)
+    .add_service("mon")
+    .set_description("multiple of mon_lease for follower mons to accept proposed state changes before calling a new election")
+    .add_see_also("mon_lease"),
+
+    Option("mon_clock_drift_allowed", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.050)
+    .add_service("mon")
+    .set_description("allowed clock drift (in seconds) between mons before issuing a health warning"),
+
+    Option("mon_clock_drift_warn_backoff", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("exponential backoff factor for logging clock drift warnings in the cluster log"),
+
+    Option("mon_timecheck_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(300.0)
+    .add_service("mon")
+    .set_description("frequency of clock synchronization checks between monitors (seconds)"),
+
+    Option("mon_timecheck_skew_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30.0)
+    .add_service("mon")
+    .set_description("frequency of clock synchronization (re)checks between monitors while clocks are believed to be skewed (seconds)")
+    .add_see_also("mon_timecheck_interval"),
+
+    Option("mon_pg_stuck_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .set_description("number of seconds after which pgs can be considered stuck inactive, unclean, etc")
+    .set_long_description("see doc/control.rst under dump_stuck for more info")
+    .add_service("mgr"),
+
+    Option("mon_pg_warn_min_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .add_service("mgr")
+    .set_description("minimal number PGs per (in) osd before we warn the admin"),
+
+    Option("mon_max_pg_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_min(1)
+    .set_default(250)
+    .add_service("mgr")
+    .set_description("Max number of PGs per OSD the cluster will allow")
+    .set_long_description("If the number of PGs per OSD exceeds this, a "
+        "health warning will be visible in `ceph status`.  This is also used "
+        "in automated PG management, as the threshold at which some pools' "
+        "pg_num may be shrunk in order to enable increasing the pg_num of "
+        "others."),
+
+    Option("mon_target_pg_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_min(1)
+    .set_default(100)
+    .set_description("Automated PG management creates this many PGs per OSD")
+    .set_long_description("When creating pools, the automated PG management "
+        "logic will attempt to reach this target.  In some circumstances, it "
+        "may exceed this target, up to the ``mon_max_pg_per_osd`` limit. "
+        "Conversely, a lower number of PGs per OSD may be created if the "
+        "cluster is not yet fully utilised"),
+
+    Option("mon_pg_warn_max_object_skew", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10.0)
+    .set_description("max skew from average in objects per pg")
+    .add_service("mgr"),
+
+    Option("mon_pg_warn_min_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("do not warn below this object #")
+    .add_service("mgr"),
+
+    Option("mon_pg_warn_min_pool_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("do not warn on pools below this object #")
+    .add_service("mgr"),
+
+    Option("mon_pg_check_down_all_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.5)
+    .set_description("threshold of down osds after which we check all pgs")
+    .add_service("mgr"),
+
+    Option("mon_cache_target_full_warn_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.66)
+    .add_service("mgr")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description("issue CACHE_POOL_NEAR_FULL health warning when cache pool utilization exceeds this ratio of usable space"),
+
+    Option("mon_osd_full_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.95)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description("full ratio of OSDs to be set during initial creation of the cluster"),
+
+    Option("mon_osd_backfillfull_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.90)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description(""),
+
+    Option("mon_osd_nearfull_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.85)
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description("nearfull ratio for OSDs to be set during initial creation of cluster"),
+
+    Option("mon_osd_initial_require_min_compat_client", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("jewel")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description(""),
+
+    Option("mon_allow_pool_delete", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("allow pool deletions"),
+
+    Option("mon_fake_pool_delete", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("fake pool deletions by renaming the rados pool"),
+
+    Option("mon_globalid_prealloc", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .add_service("mon")
+    .set_description("number of globalid values to preallocate")
+    .set_long_description("This setting caps how many new clients can authenticate with the cluster before the monitors have to perform a write to preallocate more.  Large values burn through the 64-bit ID space more quickly."),
+
+    Option("mon_osd_report_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(900)
+    .add_service("mon")
+    .set_description("time before OSDs who do not report to the mons are marked down (seconds)"),
+
+    Option("mon_warn_on_insecure_global_id_reclaim", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("issue AUTH_INSECURE_GLOBAL_ID_RECLAIM health warning if any connected clients are insecurely reclaiming global_id")
+    .add_see_also("mon_warn_on_insecure_global_id_reclaim_allowed")
+    .add_see_also("auth_allow_insecure_global_id_reclaim")
+    .add_see_also("auth_expose_insecure_global_id_reclaim"),
+
+    Option("mon_warn_on_insecure_global_id_reclaim_allowed", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("issue AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED health warning if insecure global_id reclaim is allowed")
+    .add_see_also("mon_warn_on_insecure_global_id_reclaim")
+    .add_see_also("auth_allow_insecure_global_id_reclaim")
+    .add_see_also("auth_expose_insecure_global_id_reclaim"),
+
+    Option("mon_warn_on_msgr2_not_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("issue MON_MSGR2_NOT_ENABLED health warning if monitors are all running Nautilus but not all binding to a msgr2 port")
+    .add_see_also("ms_bind_msgr2"),
+
+    Option("mon_warn_on_legacy_crush_tunables", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mgr")
+    .set_description("issue OLD_CRUSH_TUNABLES health warning if CRUSH tunables are older than mon_crush_min_required_version")
+    .add_see_also("mon_crush_min_required_version"),
+
+    Option("mon_crush_min_required_version", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("hammer")
+    .add_service("mgr")
+    .set_description("minimum ceph release to use for mon_warn_on_legacy_crush_tunables")
+    .add_see_also("mon_warn_on_legacy_crush_tunables"),
+
+    Option("mon_warn_on_crush_straw_calc_version_zero", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mgr")
+    .set_description("issue OLD_CRUSH_STRAW_CALC_VERSION health warning if the CRUSH map's straw_calc_version is zero"),
+
+    Option("mon_warn_on_osd_down_out_interval_zero", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mgr")
+    .set_description("issue OSD_NO_DOWN_OUT_INTERVAL health warning if mon_osd_down_out_interval is zero")
+    .set_long_description("Having mon_osd_down_out_interval set to 0 means that down OSDs are not marked out automatically and the cluster does not heal itself without administrator intervention.")
+    .add_see_also("mon_osd_down_out_interval"),
+
+    Option("mon_warn_on_cache_pools_without_hit_sets", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mgr")
+    .set_description("issue CACHE_POOL_NO_HIT_SET health warning for cache pools that do not have hit sets configured"),
+
+    Option("mon_warn_on_pool_no_app", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .add_service("mgr")
+    .set_description("issue POOL_APP_NOT_ENABLED health warning if pool has no application enabled"),
+
+    Option("mon_warn_on_pool_pg_num_not_power_of_two", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("issue POOL_PG_NUM_NOT_POWER_OF_TWO warning if pool has a non-power-of-two pg_num value"),
+
+    Option("mon_warn_on_pool_no_redundancy", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("Issue a health warning if any pool is configured with no replicas")
+    .add_see_also("osd_pool_default_size")
+    .add_see_also("osd_pool_default_min_size"),
+
+    Option("mon_warn_on_misplaced", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mgr")
+    .set_description("Issue a health warning if there are misplaced objects"),
+
+    Option("mon_warn_on_too_few_osds", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mgr")
+    .set_description("Issue a health warning if there are fewer OSDs than osd_pool_default_size"),
+
+    Option("mon_warn_on_slow_ping_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .add_service("mgr")
+    .set_description("Override mon_warn_on_slow_ping_ratio with specified threshold in milliseconds")
+    .add_see_also("mon_warn_on_slow_ping_ratio"),
+
+    Option("mon_warn_on_slow_ping_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.05)
+    .add_service("mgr")
+    .set_description("Issue a health warning if heartbeat ping longer than percentage of osd_heartbeat_grace")
+    .add_see_also("osd_heartbeat_grace")
+    .add_see_also("mon_warn_on_slow_ping_time"),
+
+    Option("mon_max_snap_prune_per_epoch", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .add_service("mon")
+    .set_description("max number of pruned snaps we will process in a single OSDMap epoch"),
+
+    Option("mon_min_osdmap_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .add_service("mon")
+    .set_description("min number of OSDMaps to store"),
+
+    Option("mon_max_log_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .add_service("mon")
+    .set_description("max number of past cluster log epochs to store"),
+
+    Option("mon_max_mdsmap_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .add_service("mon")
+    .set_description("max number of FSMaps/MDSMaps to store"),
+
+    Option("mon_max_mgrmap_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .add_service("mon")
+    .set_description("max number of MgrMaps to store"),
+
+    Option("mon_max_osd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .add_service("mon")
+    .set_description("max number of OSDs in a cluster"),
+
+    Option("mon_probe_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2.0)
+    .add_service("mon")
+    .set_description("timeout for querying other mons during bootstrap pre-election phase (seconds)"),
+
+    Option("mon_client_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(100ul << 20)
+    .add_service("mon")
+    .set_description("max bytes of outstanding client messages mon will read off the network"),
+
+    Option("mon_daemon_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(400ul << 20)
+    .add_service("mon")
+    .set_description("max bytes of outstanding mon messages mon will read off the network"),
+
+    Option("mon_mgr_proxy_client_bytes_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.3)
+    .add_service("mon")
+    .set_description("ratio of mon_client_bytes that can be consumed by "
+                     "proxied mgr commands before we error out to client"),
+
+    Option("mon_log_max_summary", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(50)
+    .add_service("mon")
+    .set_description("number of recent cluster log messages to retain"),
+
+    Option("mon_max_log_entries_per_event", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4096)
+    .add_service("mon")
+    .set_description("max cluster log entries per paxos event"),
+
+    Option("mon_reweight_min_pgs_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .add_service("mgr")
+    .set_description(""),
+
+    Option("mon_reweight_min_bytes_per_osd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(100_M)
+    .add_service("mgr")
+    .set_description(""),
+
+    Option("mon_reweight_max_osds", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .add_service("mgr")
+    .set_description(""),
+
+    Option("mon_reweight_max_change", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.05)
+    .add_service("mgr")
+    .set_description(""),
+
+    Option("mon_health_to_clog", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("log monitor health to cluster log"),
+
+    Option("mon_health_to_clog_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10_min)
+    .add_service("mon")
+    .set_description("frequency to log monitor health to cluster log")
+    .add_see_also("mon_health_to_clog"),
+
+    Option("mon_health_to_clog_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(60.0)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("mon_health_detail_to_clog", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("log health detail to cluster log"),
+
+    Option("mon_health_max_detail", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(50)
+    .add_service("mon")
+    .set_description("max detailed pgs to report in health detail"),
+
+    Option("mon_health_log_update_period", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("minimum time in seconds between log messages about "
+                     "each health check")
+    .set_min(0),
+
+    Option("mon_data_avail_crit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("issue MON_DISK_CRIT health error when mon available space below this percentage"),
+
+    Option("mon_data_avail_warn", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .add_service("mon")
+    .set_description("issue MON_DISK_LOW health warning when mon available space below this percentage"),
+
+    Option("mon_data_size_warn", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(15_G)
+    .add_service("mon")
+    .set_description("issue MON_DISK_BIG health warning when mon database is above this size"),
+
+    Option("mon_warn_pg_not_scrubbed_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.5)
+    .set_min(0)
+    .set_description("Percentage of the scrub max interval past the scrub max interval to warn")
+    .set_long_description("")
+    .add_see_also("osd_scrub_max_interval"),
+
+    Option("mon_warn_pg_not_deep_scrubbed_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.75)
+    .set_min(0)
+    .set_description("Percentage of the deep scrub interval past the deep scrub interval to warn")
+    .set_long_description("")
+    .add_see_also("osd_deep_scrub_interval"),
+
+    Option("mon_scrub_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1_day)
+    .add_service("mon")
+    .set_description("frequency for scrubbing mon database"),
+
+    Option("mon_scrub_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5_min)
+    .add_service("mon")
+    .set_description("timeout to restart scrub if a mon quorum participant does not respond for the latest chunk"),
+
+    Option("mon_scrub_max_keys", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .add_service("mon")
+    .set_description("max keys per scrub chunk/step"),
+
+    Option("mon_scrub_inject_crc_mismatch", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.0)
+    .add_service("mon")
+    .set_description("probability for injecting crc mismatches into mon scrub"),
+
+    Option("mon_scrub_inject_missing_keys", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.0)
+    .add_service("mon")
+    .set_description("probability for injecting missing keys into mon scrub"),
+
+    Option("mon_config_key_max_entry_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .add_service("mon")
+    .set_description("Defines the number of bytes allowed to be held in a "
+		     "single config-key entry"),
+
+    Option("mon_sync_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(60.0)
+    .add_service("mon")
+    .set_description("timeout before canceling sync if syncing mon does not respond"),
+
+    Option("mon_sync_max_payload_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .add_service("mon")
+    .set_description("target max message payload for mon sync"),
+
+    Option("mon_sync_max_payload_keys", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2000)
+    .add_service("mon")
+    .set_description("target max keys in message payload for mon sync"),
+
+    Option("mon_sync_debug", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("enable extra debugging during mon sync"),
+
+    Option("mon_inject_sync_get_chunk_delay", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .add_service("mon")
+    .set_description("inject delay during sync (seconds)"),
+
+    Option("mon_osd_min_down_reporters", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .add_service("mon")
+    .set_description("number of OSDs from different subtrees who need to report a down OSD for it to count")
+    .add_see_also("mon_osd_reporter_subtree_level"),
+
+    Option("mon_osd_reporter_subtree_level", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("host")
+    .add_service("mon")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("in which level of parent bucket the reporters are counted"),
+
+    Option("mon_osd_snap_trim_queue_warn_on", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(32768)
+    .add_service("mon")
+    .set_description("Warn when snap trim queue is that large (or larger).")
+    .set_long_description("Warn when snap trim queue length for at least one PG crosses this value, as this is indicator of snap trimmer not keeping up, wasting disk space"),
+
+    Option("mon_osd_force_trim_to", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .add_service("mon")
+    .set_description("force mons to trim osdmaps through this epoch"),
+
+    Option("mon_mds_force_trim_to", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .add_service("mon")
+    .set_description("force mons to trim mdsmaps/fsmaps through this epoch"),
+
+    Option("mon_mds_skip_sanity", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("skip sanity checks on fsmap/mdsmap"),
+
+    Option("mon_debug_extra_checks", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("Enable some additional monitor checks")
+    .set_long_description(
+        "Enable some additional monitor checks that would be too expensive "
+        "to run on production systems, or would only be relevant while "
+        "testing or debugging."),
+
+    Option("mon_debug_block_osdmap_trim", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("Block OSDMap trimming while the option is enabled.")
+    .set_long_description(
+        "Blocking OSDMap trimming may be quite helpful to easily reproduce "
+        "states in which the monitor keeps (hundreds of) thousands of "
+        "osdmaps."),
+
+    Option("mon_debug_deprecated_as_obsolete", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("treat deprecated mon commands as obsolete"),
+
+    Option("mon_debug_dump_transactions", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("dump paxos transactions to log")
+    .add_see_also("mon_debug_dump_location"),
+
+    Option("mon_debug_dump_json", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("dump paxos transactions to log as json")
+    .add_see_also("mon_debug_dump_transactions"),
+
+    Option("mon_debug_dump_location", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("/var/log/ceph/$cluster-$name.tdump")
+    .add_service("mon")
+    .set_description("file to dump paxos transactions to")
+    .add_see_also("mon_debug_dump_transactions"),
+
+    Option("mon_debug_no_require_mimic", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description("do not set mimic feature for new mon clusters"),
+
+    Option("mon_debug_no_require_nautilus", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description("do not set nautilus feature for new mon clusters"),
+
+    Option("mon_debug_no_require_bluestore_for_ec_overwrites", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("do not require bluestore OSDs to enable EC overwrites on a rados pool"),
+
+    Option("mon_debug_no_initial_persistent_features", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .set_description("do not set any monmap features for new mon clusters"),
+
+    Option("mon_inject_transaction_delay_max", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(10.0)
+    .add_service("mon")
+    .set_description("max duration of injected delay in paxos"),
+
+    Option("mon_inject_transaction_delay_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .add_service("mon")
+    .set_description("probability of injecting a delay in paxos"),
+
+    Option("mon_inject_pg_merge_bounce_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .add_service("mon")
+    .set_description("probability of failing and reverting a pg_num decrement"),
+
+    Option("mon_sync_provider_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .add_service("mon")
+    .set_description("kill mon sync provider at specific point"),
+
+    Option("mon_sync_requester_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .add_service("mon")
+    .set_description("kill mon sync requester at specific point"),
+
+    Option("mon_force_quorum_join", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .add_service("mon")
+    .set_description("force mon to rejoin quorum even though it was just removed"),
+
+    Option("mon_keyvaluedb", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("rocksdb")
+    .set_enum_allowed({"leveldb", "rocksdb"})
+    .set_flag(Option::FLAG_CREATE)
+    .add_service("mon")
+    .set_description("database backend to use for the mon database"),
+
+    Option("mon_debug_unsafe_allow_tier_with_nonempty_snaps", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("mon_osd_blacklist_default_expire", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1_hr)
+    .add_service("mon")
+    .set_description("Duration in seconds that blacklist entries for clients "
+                     "remain in the OSD map"),
+
+    Option("mon_mds_blacklist_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1_day)
+    .set_min(1_hr)
+    .add_service("mon")
+    .set_description("Duration in seconds that blacklist entries for MDS "
+                     "daemons remain in the OSD map"),
+
+    Option("mon_osd_crush_smoke_test", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .add_service("mon")
+    .set_description("perform a smoke test on any new CRUSH map before accepting changes"),
+
+    Option("mon_smart_report_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("Timeout (in seconds) for smartctl to run, default is set to 5"),
+
+
+    // PAXOS
+
+    Option("paxos_stash_full_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(25)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_max_join_drift", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_propose_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_min_wait", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.05)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_trim_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(250)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_trim_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_service_trim_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(250)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_service_trim_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .add_service("mon")
+    .set_description(""),
+
+    Option("paxos_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .add_service("mon")
+    .set_description(""),
+
+
+    // AUTH
+
+    Option("auth_cluster_required", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("cephx")
+    .set_description("authentication methods required by the cluster"),
+
+    Option("auth_service_required", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("cephx")
+    .set_description("authentication methods required by service daemons"),
+
+    Option("auth_client_required", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("cephx, none")
+    .set_flag(Option::FLAG_MINIMAL_CONF)
+    .set_description("authentication methods allowed by clients"),
+
+    Option("auth_supported", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("authentication methods required (deprecated)"),
+
+    Option("max_rotating_auth_attempts", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("number of attempts to initialize rotating keys before giving up"),
+
+    Option("rotating_keys_bootstrap_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("timeout for obtaining rotating keys during bootstrap phase (seconds)"),
+
+    Option("rotating_keys_renewal_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("timeout for updating rotating keys (seconds)"),
+
+    Option("cephx_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("cephx_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Cephx version required (1 = pre-mimic, 2 = mimic+)"),
+
+    Option("cephx_cluster_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("cephx_cluster_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Cephx version required by the cluster from clients (1 = pre-mimic, 2 = mimic+)"),
+
+    Option("cephx_service_require_signatures", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("cephx_service_require_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Cephx version required from ceph services (1 = pre-mimic, 2 = mimic+)"),
+
+    Option("cephx_sign_messages", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("auth_mon_ticket_ttl", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(72_hr)
+    .set_description(""),
+
+    Option("auth_service_ticket_ttl", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1_hr)
+    .set_description(""),
+
+    Option("auth_allow_insecure_global_id_reclaim", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Allow reclaiming global_id without presenting a valid ticket proving previous possession of that global_id")
+    .set_long_description("Allowing unauthorized global_id (re)use poses a security risk.  Unfortunately, older clients may omit their ticket on reconnects and therefore rely on this being allowed for preserving their global_id for the lifetime of the client instance.  Setting this value to false would immediately prevent new connections from those clients (assuming auth_expose_insecure_global_id_reclaim set to true) and eventually break existing sessions as well (regardless of auth_expose_insecure_global_id_reclaim setting).")
+    .add_see_also("mon_warn_on_insecure_global_id_reclaim")
+    .add_see_also("mon_warn_on_insecure_global_id_reclaim_allowed")
+    .add_see_also("auth_expose_insecure_global_id_reclaim"),
+
+    Option("auth_expose_insecure_global_id_reclaim", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Force older clients that may omit their ticket on reconnects to reconnect as part of establishing a session")
+    .set_long_description("In permissive mode (auth_allow_insecure_global_id_reclaim set to true), this helps with identifying clients that are not patched.  In enforcing mode (auth_allow_insecure_global_id_reclaim set to false), this is a fail-fast mechanism: don't establish a session that will almost inevitably be broken later.")
+    .add_see_also("mon_warn_on_insecure_global_id_reclaim")
+    .add_see_also("mon_warn_on_insecure_global_id_reclaim_allowed")
+    .add_see_also("auth_allow_insecure_global_id_reclaim"),
+
+    Option("auth_debug", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mon_client_hunt_parallel", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_description(""),
+
+    Option("mon_client_hunt_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(3.0)
+    .set_description(""),
+
+    Option("mon_client_ping_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10.0)
+    .set_description(""),
+
+    Option("mon_client_ping_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30.0)
+    .set_description(""),
+
+    Option("mon_client_hunt_interval_backoff", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.5)
+    .set_description(""),
+
+    Option("mon_client_hunt_interval_min_multiple", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description(""),
+
+    Option("mon_client_hunt_interval_max_multiple", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10.0)
+    .set_description(""),
+
+    Option("mon_client_max_log_entries_per_message", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description(""),
+
+    Option("mon_client_directed_command_retry", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(2)
+    .set_description("Number of times to try sending a command directed at a specific monitor"),
+
+    Option("mon_max_pool_pg_num", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(65536)
+    .set_description(""),
+
+    Option("mon_pool_quota_warn_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("percent of quota at which to issue warnings")
+    .add_service("mgr"),
+
+    Option("mon_pool_quota_crit_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("percent of quota at which to issue errors")
+    .add_service("mgr"),
+
+    Option("crush_location", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("crush_location_hook", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("crush_location_hook_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description(""),
+
+    Option("objecter_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(5.0)
+    .set_description(""),
+
+    Option("objecter_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10.0)
+    .set_description("Seconds before in-flight op is considered 'laggy' and we query mon for the latest OSDMap"),
+
+    Option("objecter_inflight_op_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(100_M)
+    .set_description("Max in-flight data in bytes (both directions)"),
+
+    Option("objecter_inflight_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("Max in-flight operations"),
+
+    Option("objecter_completion_locks_per_session", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(32)
+    .set_description(""),
+
+    Option("objecter_inject_no_watch_ping", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("objecter_retry_writes_after_first_reply", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("objecter_debug_inject_relock_delay", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filer_max_purge_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("Max in-flight operations for purging a striped range (e.g., MDS journal)"),
+
+    Option("filer_max_truncate_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(128)
+    .set_description("Max in-flight operations for truncating/deleting a striped sequence (e.g., MDS journal)"),
+
+    Option("journaler_write_head_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(15)
+    .set_description("Interval in seconds between journal header updates (to help bound replay time)"),
+
+    // * journal object size
+    Option("journaler_prefetch_periods", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_min(2)			// we need at least 2 periods to make progress.
+    .set_description("Number of striping periods to prefetch while reading MDS journal"),
+
+    // * journal object size
+    Option("journaler_prezero_periods", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    // we need to zero at least two periods, minimum, to ensure that we
+    // have a full empty object/period in front of us.
+    .set_min(2)
+    .set_description("Number of striping periods to zero head of MDS journal write position"),
+
+    // -- OSD --
+    Option("osd_calc_pg_upmaps_aggressively", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("try to calculate PG upmaps more aggressively, e.g., "
+                     "by doing a fairly exhaustive search of existing PGs "
+                     "that can be unmapped or upmapped"),
+
+    Option("osd_calc_pg_upmaps_local_fallback_retries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Maximum number of PGs we can attempt to unmap or upmap "
+                     "for a specific overfull or underfull osd per iteration"),
+
+    Option("osd_numa_prefer_iface", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("prefer IP on network interface on same numa node as storage")
+    .add_see_also("osd_numa_auto_affinity"),
+
+    Option("osd_numa_auto_affinity", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("automatically set affinity to numa node when storage and network match"),
+
+    Option("osd_numa_node", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(-1)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("set affinity to a numa node (-1 for none)")
+    .add_see_also("osd_numa_auto_affinity"),
+
+    Option("osd_smart_report_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Timeout (in seconds) for smartctl to run, default is set to 5"),
+
+    Option("osd_check_max_object_name_len_on_startup", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_max_backfills", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Maximum number of concurrent local and remote backfills or recoveries per OSD")
+    .set_long_description("There can be osd_max_backfills local reservations AND the same remote reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary in recovery and 1 shard of another recovering PG."),
+
+    Option("osd_min_recovery_priority", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Minimum priority below which recovery is not performed")
+    .set_long_description("The purpose here is to prevent the cluster from doing *any* lower priority work (e.g., rebalancing) below this threshold and focus solely on higher priority work (e.g., replicating degraded objects)."),
+
+    Option("osd_backfill_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30.0)
+    .set_description("how frequently to retry backfill reservations after being denied (e.g., due to a full OSD)"),
+
+    Option("osd_recovery_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30.0)
+    .set_description("how frequently to retry recovery reservations after being denied (e.g., due to a full OSD)"),
+
+    Option("osd_agent_max_ops", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("maximum concurrent tiering operations for tiering agent"),
+
+    Option("osd_agent_max_low_ops", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description("maximum concurrent low-priority tiering operations for tiering agent"),
+
+    Option("osd_agent_min_evict_effort", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.1)
+    .set_min_max(0.0, .99)
+    .set_description("minimum effort to expend evicting clean objects"),
+
+    Option("osd_agent_quantize_effort", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.1)
+    .set_description("size of quantize unit for eviction effort"),
+
+    Option("osd_agent_delay_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5.0)
+    .set_description("how long agent should sleep if it has no work to do"),
+
+    Option("osd_find_best_info_ignore_history_les", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("ignore last_epoch_started value when peering AND PROBABLY LOSE DATA")
+    .set_long_description("THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE DIRECTION OF A DEVELOPER.  It makes peering ignore the last_epoch_started value when peering, which can allow the OSD to believe an OSD has an authoritative view of a PG's contents even when it is in fact old and stale, typically leading to data loss (by believing a stale PG is up to date)."),
+
+    Option("osd_agent_hist_halflife", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("halflife of agent atime and temp histograms"),
+
+    Option("osd_agent_slop", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.02)
+    .set_description("slop factor to avoid switching tiering flush and eviction mode"),
+
+    Option("osd_uuid", Option::TYPE_UUID, Option::LEVEL_ADVANCED)
+    .set_default(uuid_d())
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("uuid label for a new OSD"),
+
+    Option("osd_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/var/lib/ceph/osd/$cluster-$id")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_description("path to OSD data"),
+
+    Option("osd_journal", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/var/lib/ceph/osd/$cluster-$id/journal")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_description("path to OSD journal (when FileStore backend is in use)"),
+
+    Option("osd_journal_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(5120)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("size of FileStore journal (in MiB)"),
+
+    Option("osd_journal_flush_on_shutdown", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("flush FileStore journal contents during clean OSD shutdown"),
+
+    Option("osd_os_flags", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("flags to skip filestore omap or journal initialization"),
+
+    Option("osd_max_write_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_min(4)
+    .set_default(90)
+    .set_description("Maximum size of a RADOS write operation in megabytes")
+    .set_long_description("This setting prevents clients from doing "
+        "very large writes to RADOS.  If you set this to a value "
+        "below what clients expect, they will receive an error "
+        "when attempting to write to the cluster."),
+
+    Option("osd_max_pgls", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("maximum number of results when listing objects in a pool"),
+
+    Option("osd_client_message_size_cap", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(500_M)
+    .set_description("maximum memory to devote to in-flight client requests")
+    .set_long_description("If this value is exceeded, the OSD will not read any new client data off of the network until memory is freed."),
+
+    Option("osd_client_message_cap", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("maximum number of in-flight client requests"),
+
+    Option("osd_crush_update_weight_set", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("update CRUSH weight-set weights when updating weights")
+    .set_long_description("If this setting is true, we will update the weight-set weights when adjusting an item's weight, effectively making changes take effect immediately, and discarding any previous optimization in the weight-set value.  Setting this value to false will leave it to the balancer to (slowly, presumably) adjust weights to approach the new target value."),
+
+    Option("osd_crush_chooseleaf_type", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(1)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("default chooseleaf type for osdmaptool --create"),
+
+    Option("osd_pool_use_gmt_hitset", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("use UTC for hitset timestamps")
+    .set_long_description("This setting only exists for compatibility with hammer (and older) clusters."),
+
+    Option("osd_crush_update_on_start", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("update OSD CRUSH location on startup"),
+
+    Option("osd_class_update_on_start", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("set OSD device class on startup"),
+
+    Option("osd_crush_initial_weight", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(-1)
+    .set_description("if >= 0, initial CRUSH weight for newly created OSDs")
+    .set_long_description("If this value is negative, the size of the OSD in TiB is used."),
+
+    Option("osd_pool_default_ec_fast_read", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("set ec_fast_read for new erasure-coded pools")
+    .add_service("mon"),
+
+    Option("osd_pool_default_crush_rule", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(-1)
+    .set_description("CRUSH rule for newly created pools")
+    .add_service("mon"),
+
+    Option("osd_pool_erasure_code_stripe_unit", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4_K)
+    .set_description("the amount of data (in bytes) in a data chunk, per stripe")
+    .add_service("mon"),
+
+    Option("osd_pool_default_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_min_max(0, 10)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("the number of copies of an object for new replicated pools")
+    .add_service("mon"),
+
+    Option("osd_pool_default_min_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_min_max(0, 255)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("the minimal number of copies allowed to write to a degraded pool for new replicated pools")
+    .set_long_description("0 means no specific default; ceph will use size-size/2")
+    .add_see_also("osd_pool_default_size")
+    .add_service("mon"),
+
+    Option("osd_pool_default_pg_num", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description("number of PGs for new pools")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon"),
+
+    Option("osd_pool_default_pgp_num", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("number of PGs for placement purposes (0 to match pg_num)")
+    .add_see_also("osd_pool_default_pg_num")
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_service("mon"),
+
+    Option("osd_pool_default_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("replicated")
+    .set_enum_allowed({"replicated", "erasure"})
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("default type of pool to create")
+    .add_service("mon"),
+
+    Option("osd_pool_default_erasure_code_profile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("plugin=jerasure technique=reed_sol_van k=2 m=1")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("default erasure code profile for new erasure-coded pools")
+    .add_service("mon"),
+
+    Option("osd_erasure_code_plugins", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("jerasure lrc"
+  #ifdef HAVE_BETTER_YASM_ELF64
+         " isa"
+  #endif
+        )
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("erasure code plugins to load")
+    .add_service("mon")
+    .add_service("osd"),
+
+    Option("osd_allow_recovery_below_min_size", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("allow replicated pools to recover with < min_size active members")
+    .add_service("osd"),
+
+    Option("osd_pool_default_flags", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("(integer) flags to set on new pools")
+    .add_service("mon"),
+
+    Option("osd_pool_default_flag_hashpspool", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("set hashpspool (better hashing scheme) flag on new pools")
+    .add_service("mon"),
+
+    Option("osd_pool_default_flag_nodelete", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("set nodelete flag on new pools")
+    .add_service("mon"),
+
+    Option("osd_pool_default_flag_nopgchange", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("set nopgchange flag on new pools")
+    .add_service("mon"),
+
+    Option("osd_pool_default_flag_nosizechange", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("set nosizechange flag on new pools")
+    .add_service("mon"),
+
+    Option("osd_pool_default_hit_set_bloom_fpp", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.05)
+    .set_description("")
+    .add_see_also("osd_tier_default_cache_hit_set_type")
+    .add_service("mon"),
+
+    Option("osd_pool_default_cache_target_dirty_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.4)
+    .set_description(""),
+
+    Option("osd_pool_default_cache_target_dirty_high_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.6)
+    .set_description(""),
+
+    Option("osd_pool_default_cache_target_full_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.8)
+    .set_description(""),
+
+    Option("osd_pool_default_cache_min_flush_age", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_pool_default_cache_min_evict_age", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_pool_default_cache_max_evict_check_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description(""),
+
+    Option("osd_pool_default_pg_autoscale_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("warn")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_enum_allowed({"off", "warn", "on"})
+    .set_description("Default PG autoscaling behavior for new pools"),
+
+    Option("osd_hit_set_min_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description(""),
+
+    Option("osd_hit_set_max_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(100000)
+    .set_description(""),
+
+    Option("osd_hit_set_namespace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(".ceph-internal")
+    .set_description(""),
+
+    Option("osd_tier_promote_max_objects_sec", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(25)
+    .set_description(""),
+
+    Option("osd_tier_promote_max_bytes_sec", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(5_M)
+    .set_description(""),
+
+    Option("osd_tier_default_cache_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("writeback")
+    .set_enum_allowed({"none", "writeback", "forward",
+	               "readonly", "readforward", "readproxy", "proxy"})
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description(""),
+
+    Option("osd_tier_default_cache_hit_set_count", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description(""),
+
+    Option("osd_tier_default_cache_hit_set_period", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1200)
+    .set_description(""),
+
+    Option("osd_tier_default_cache_hit_set_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("bloom")
+    .set_enum_allowed({"bloom", "explicit_hash", "explicit_object"})
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description(""),
+
+    Option("osd_tier_default_cache_min_read_recency_for_promote", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("number of recent HitSets the object must appear in to be promoted (on read)"),
+
+    Option("osd_tier_default_cache_min_write_recency_for_promote", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("number of recent HitSets the object must appear in to be promoted (on write)"),
+
+    Option("osd_tier_default_cache_hit_set_grade_decay_rate", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .set_description(""),
+
+    Option("osd_tier_default_cache_hit_set_search_last_n", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description(""),
+
+    Option("osd_objecter_finishers", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description(""),
+
+    Option("osd_map_dedup", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_map_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(50)
+    .set_description(""),
+
+    Option("osd_map_message_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(40)
+    .set_description("maximum number of OSDMaps to include in a single message"),
+
+    Option("osd_map_message_max_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(10_M)
+    .set_description("maximum number of bytes worth of OSDMaps to include in a single message"),
+
+    Option("osd_map_share_max_epochs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(40)
+    .set_description(""),
+
+    Option("osd_pg_epoch_max_lag_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2.0)
+    .set_description("Max multiple of the map cache that PGs can lag before we throttle map ingest")
+    .add_see_also("osd_map_cache_size"),
+
+    Option("osd_inject_bad_map_crc_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_inject_failure_on_pg_removal", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_max_markdown_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .set_description(""),
+
+    Option("osd_max_markdown_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description(""),
+
+    Option("osd_op_pq_max_tokens_per_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(4194304) // 4 * 1024 * 1024
+    .set_description(""),
+
+    Option("osd_op_pq_min_cost", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(65536) // 64 * 1024
+    .set_description(""),
+
+    Option("osd_recover_clone_overlap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_op_num_threads_per_shard", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description(""),
+
+    Option("osd_op_num_threads_per_shard_hdd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("")
+    .add_see_also("osd_op_num_threads_per_shard"),
+
+    Option("osd_op_num_threads_per_shard_ssd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("")
+    .add_see_also("osd_op_num_threads_per_shard"),
+
+    Option("osd_op_num_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description(""),
+
+    Option("osd_op_num_shards_hdd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("")
+    .add_see_also("osd_op_num_shards"),
+
+    Option("osd_op_num_shards_ssd", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(8)
+    .set_flag(Option::FLAG_STARTUP)
+    .set_description("")
+    .add_see_also("osd_op_num_shards"),
+
+    Option("osd_skip_data_digest", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Do not store full-object checksums if the backend (bluestore) does its own checksums.  Only usable with all BlueStore OSDs."),
+
+    Option("osd_op_queue", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("wpq")
+    .set_enum_allowed( { "wpq", "prioritized", "mclock_opclass", "mclock_client", "debug_random" } )
+    .set_description("which operation queue algorithm to use")
+    .set_long_description("which operation queue algorithm to use; mclock_opclass and mclock_client are currently experimental")
+    .set_flag(Option::FLAG_STARTUP)
+    .add_see_also("osd_op_queue_cut_off"),
+
+    Option("osd_op_queue_cut_off", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("low")
+    .set_enum_allowed( { "low", "high", "debug_random" } )
+    .set_description("the threshold between high priority ops and low priority ops")
+    .set_long_description("the threshold between high priority ops that use strict priority ordering and low priority ops that use a fairness algorithm that may or may not incorporate priority")
+    .add_see_also("osd_op_queue"),
+
+    Option("osd_op_queue_mclock_client_op_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1000.0)
+    .set_description("mclock reservation of client operator requests")
+    .set_long_description("mclock reservation of client operator requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the reservation")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_client_op_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(500.0)
+    .set_description("mclock weight of client operator requests")
+    .set_long_description("mclock weight of client operator requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the weight")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_client_op_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock limit of client operator requests")
+    .set_long_description("mclock limit of client operator requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the limit")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_osd_rep_op_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1000.0)
+    .set_description("mclock reservation of osd replication operation requests and replies")
+    .set_long_description("mclock reservation of replication operation requests and replies when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the reservation")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_osd_rep_op_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(500.0)
+    .set_description("mclock weight of osd replication operation requests and replies")
+    .set_long_description("mclock weight of osd replication operation requests and replies when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the weight")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_osd_rep_op_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock limit of osd replication operation requests and replies")
+    .set_long_description("mclock limit of osd replication operation requests and replies when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the limit")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_snap_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock reservation of snaptrim requests")
+    .set_long_description("mclock reservation of snaptrim requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the reservation")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_snap_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description("mclock weight of snaptrim requests")
+    .set_long_description("mclock weight of snaptrim requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the weight")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_snap_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.001)
+    .set_description("mclock limit of snaptrim requests")
+    .set_long_description("mclock limit of snaptrim requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the limit")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_recov_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock reservation of recovery requests")
+    .set_long_description("mclock reservation of recovery requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the reservation")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_recov_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description("mclock weight of recovery requests")
+    .set_long_description("mclock weight of recovery requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the weight")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_recov_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.001)
+    .set_description("mclock limit of recovery requests")
+    .set_long_description("mclock limit of recovery requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the limit")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_scrub_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock reservation of scrub requests")
+    .set_long_description("mclock reservation of scrub requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the reservation")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_scrub_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description("mclock weight of scrub requests")
+    .set_long_description("mclock weight of scrub requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the weight")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_lim")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_scrub_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.001)
+    .set_description("mclock limit of scrub requests")
+    .set_long_description("mclock limit of scrub requests when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the limit")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_anticipation_timeout"),
+
+    Option("osd_op_queue_mclock_anticipation_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock anticipation timeout in seconds")
+    .set_long_description("the amount of time that mclock waits until the unused resource is forfeited")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim"),
+
+    Option("osd_op_queue_mclock_pg_delete_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock reservation of pg delete work")
+    .set_long_description("mclock reservation of pg delete work when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the reservation")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim"),
+
+    Option("osd_op_queue_mclock_pg_delete_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description("mclock weight of pg delete work")
+    .set_long_description("mclock weight of pg delete work when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the weight")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_lim"),
+
+    Option("osd_op_queue_mclock_pg_delete_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.001)
+    .set_description("mclock limit of pg delete work")
+    .set_long_description("mclock limit of pg delete work when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the limit")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt"),
+
+    Option("osd_op_queue_mclock_peering_event_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock reservation of peering events")
+    .set_long_description("mclock reservation of peering events when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the reservation")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt")
+    .add_see_also("osd_op_queue_mclock_scrub_lim"),
+
+    Option("osd_op_queue_mclock_peering_event_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description("mclock weight of peering events")
+    .set_long_description("mclock weight of peering events when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the weight")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_lim"),
+
+    Option("osd_op_queue_mclock_peering_event_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.001)
+    .set_description("mclock limit of peering events")
+    .set_long_description("mclock limit of peering events when osd_op_queue is either 'mclock_opclass' or 'mclock_client'; higher values increase the limit")
+    .add_see_also("osd_op_queue")
+    .add_see_also("osd_op_queue_mclock_client_op_res")
+    .add_see_also("osd_op_queue_mclock_client_op_wgt")
+    .add_see_also("osd_op_queue_mclock_client_op_lim")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_res")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_wgt")
+    .add_see_also("osd_op_queue_mclock_osd_rep_op_lim")
+    .add_see_also("osd_op_queue_mclock_snap_res")
+    .add_see_also("osd_op_queue_mclock_snap_wgt")
+    .add_see_also("osd_op_queue_mclock_snap_lim")
+    .add_see_also("osd_op_queue_mclock_recov_res")
+    .add_see_also("osd_op_queue_mclock_recov_wgt")
+    .add_see_also("osd_op_queue_mclock_recov_lim")
+    .add_see_also("osd_op_queue_mclock_scrub_res")
+    .add_see_also("osd_op_queue_mclock_scrub_wgt"),
+
+    Option("osd_ignore_stale_divergent_priors", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_read_ec_check_for_errors", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_recover_clone_overlap_limit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description(""),
+
+    Option("osd_debug_feed_pullee", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(-1)
+    .set_description("Feed a pullee, and force primary to pull "
+                     "a currently missing object from it"),
+
+    Option("osd_backfill_scan_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(64)
+    .set_description(""),
+
+    Option("osd_backfill_scan_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(512)
+    .set_description(""),
+
+    Option("osd_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(15)
+    .set_description(""),
+
+    Option("osd_op_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(150)
+    .set_description(""),
+
+    Option("osd_recovery_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Time in seconds to sleep before next recovery or backfill op"),
+
+    Option("osd_recovery_sleep_hdd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.1)
+    .set_description("Time in seconds to sleep before next recovery or backfill op for HDDs"),
+
+    Option("osd_recovery_sleep_ssd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Time in seconds to sleep before next recovery or backfill op for SSDs")
+    .add_see_also("osd_recovery_sleep"),
+
+    Option("osd_recovery_sleep_hybrid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.025)
+    .set_description("Time in seconds to sleep before next recovery or backfill op when data is on HDD and journal is on SSD")
+    .add_see_also("osd_recovery_sleep"),
+
+    Option("osd_snap_trim_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Time in seconds to sleep before next snap trim (overrides values below)"),
+
+    Option("osd_snap_trim_sleep_hdd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Time in seconds to sleep before next snap trim for HDDs"),
+
+    Option("osd_snap_trim_sleep_ssd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Time in seconds to sleep before next snap trim for SSDs"),
+
+    Option("osd_snap_trim_sleep_hybrid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description("Time in seconds to sleep before next snap trim when data is on HDD and journal is on SSD"),
+
+    Option("osd_scrub_invalid_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_command_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10_min)
+    .set_description(""),
+
+    Option("osd_command_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(15_min)
+    .set_description(""),
+
+    Option("osd_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(6)
+    .set_min_max(1, 60)
+    .set_description("Interval (in seconds) between peer pings"),
+
+    Option("osd_heartbeat_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .set_description(""),
+
+    Option("osd_heartbeat_stale", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .set_description("Interval (in seconds) we mark an unresponsive heartbeat peer as stale.")
+    .set_long_description("Automatically mark unresponsive heartbeat sessions as stale and tear them down. "
+		          "The primary benefit is that OSD doesn't need to keep a flood of "
+			  "blocked heartbeat messages around in memory."),
+
+    Option("osd_heartbeat_min_peers", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description(""),
+
+    Option("osd_heartbeat_use_min_delay_socket", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_heartbeat_min_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(2000)
+    .set_description("Minimum heartbeat packet size in bytes. Will add dummy payload if heartbeat packet is smaller than this."),
+
+    Option("osd_pg_max_concurrent_snap_trims", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description(""),
+
+    Option("osd_max_trimming_pgs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description(""),
+
+    Option("osd_heartbeat_min_healthy_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.33)
+    .set_description(""),
+
+    Option("osd_mon_heartbeat_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description(""),
+
+    Option("osd_mon_heartbeat_stat_stale", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1_hr)
+    .set_description("Stop reporting on heartbeat ping times not updated for this many seconds.")
+    .set_long_description("Stop reporting on old heartbeat information unless this is set to zero"),
+
+    Option("osd_mon_report_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Frequency of OSD reports to mon for peer failures, fullness status changes"),
+
+    Option("osd_mon_report_max_in_flight", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description(""),
+
+    Option("osd_beacon_report_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(300)
+    .set_description(""),
+
+    Option("osd_pg_stat_report_interval_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .set_description(""),
+
+    Option("osd_mon_ack_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30.0)
+    .set_description(""),
+
+    Option("osd_stats_ack_timeout_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2.0)
+    .set_description(""),
+
+    Option("osd_stats_ack_timeout_decay", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.9)
+    .set_description(""),
+
+    Option("osd_max_snap_prune_intervals_per_epoch", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(512)
+    .set_description("Max number of snap intervals to report to mgr in pg_stat_t"),
+
+    Option("osd_default_data_pool_replay_window", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(45)
+    .set_description(""),
+
+    Option("osd_auto_mark_unfound_lost", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_recovery_delay_start", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_recovery_max_active", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_description(""),
+
+    Option("osd_recovery_max_single_start", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description(""),
+
+    Option("osd_recovery_max_chunk", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(8_M)
+    .set_description(""),
+
+    Option("osd_recovery_max_omap_entries_per_chunk", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(8096)
+    .set_description(""),
+
+    Option("osd_copyfrom_max_chunk", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(8_M)
+    .set_description(""),
+
+    Option("osd_push_per_object_cost", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description(""),
+
+    Option("osd_max_push_cost", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(8<<20)
+    .set_description(""),
+
+    Option("osd_max_push_objects", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description(""),
+
+    Option("osd_max_scrubs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Maximum concurrent scrubs on a single OSD"),
+
+    Option("osd_scrub_during_recovery", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Allow scrubbing when PGs on the OSD are undergoing recovery"),
+
+    Option("osd_repair_during_recovery", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Allow requested repairing when PGs on the OSD are undergoing recovery"),
+
+    Option("osd_scrub_begin_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Restrict scrubbing to this hour of the day or later")
+    .add_see_also("osd_scrub_end_hour"),
+
+    Option("osd_scrub_end_hour", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(24)
+    .set_description("Restrict scrubbing to hours of the day earlier than this")
+    .add_see_also("osd_scrub_begin_hour"),
+
+    Option("osd_scrub_begin_week_day", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Restrict scrubbing to this day of the week or later")
+    .set_long_description("0 or 7 = Sunday, 1 = Monday, etc.")
+    .add_see_also("osd_scrub_end_week_day"),
+
+    Option("osd_scrub_end_week_day", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(7)
+    .set_description("Restrict scrubbing to days of the week earlier than this")
+    .set_long_description("0 or 7 = Sunday, 1 = Monday, etc.")
+    .add_see_also("osd_scrub_begin_week_day"),
+
+    Option("osd_scrub_load_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.5)
+    .set_description("Allow scrubbing when system load divided by number of CPUs is below this value"),
+
+    Option("osd_scrub_min_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1_day)
+    .set_description("Scrub each PG no more often than this interval")
+    .add_see_also("osd_scrub_max_interval"),
+
+    Option("osd_scrub_max_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(7_day)
+    .set_description("Scrub each PG no less often than this interval")
+    .add_see_also("osd_scrub_min_interval"),
+
+    Option("osd_scrub_interval_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.5)
+    .set_description("Ratio of scrub interval to randomly vary")
+    .set_long_description("This prevents a scrub 'stampede' by randomly varying the scrub intervals so that they are soon uniformly distributed over the week")
+    .add_see_also("osd_scrub_min_interval"),
+
+    Option("osd_scrub_backoff_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.66)
+    .set_description("Backoff ratio for scheduling scrubs")
+    .set_long_description("This is the percentage of ticks that do NOT schedule scrubs, 66% means that 1 out of 3 ticks will schedule scrubs"),
+
+    Option("osd_scrub_chunk_min", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Minimum number of objects to scrub in a single chunk")
+    .add_see_also("osd_scrub_chunk_max"),
+
+    Option("osd_scrub_chunk_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(25)
+    .set_description("Maximum number of objects to scrub in a single chunk")
+    .add_see_also("osd_scrub_chunk_min"),
+
+    Option("osd_scrub_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Duration to inject a delay during scrubbing"),
+
+    Option("osd_scrub_auto_repair", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Automatically repair damaged objects detected during scrub"),
+
+    Option("osd_scrub_auto_repair_num_errors", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Maximum number of detected errors to automatically repair")
+    .add_see_also("osd_scrub_auto_repair"),
+
+    Option("osd_scrub_max_preemptions", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_min_max(0, 30)
+    .set_description("Set the maximum number of times we will preempt a deep scrub due to a client operation before blocking client IO to complete the scrub"),
+
+    Option("osd_deep_scrub_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(7_day)
+    .set_description("Deep scrub each PG (i.e., verify data checksums) at least this often"),
+
+    Option("osd_deep_scrub_randomize_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.15)
+    .set_description("Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)")
+    .set_long_description("This prevents a deep scrub 'stampede' by spreading deep scrubs so they are uniformly distributed over the week"),
+
+    Option("osd_deep_scrub_stride", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(512_K)
+    .set_description("Number of bytes to read from an object at a time during deep scrub"),
+
+    Option("osd_deep_scrub_keys", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("Number of keys to read from an object at a time during deep scrub"),
+
+    Option("osd_deep_scrub_update_digest_min_age", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2_hr)
+    .set_description("Update overall object digest only if object was last modified longer ago than this"),
+
+    Option("osd_deep_scrub_large_omap_object_key_threshold", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(200000)
+    .set_description("Warn when we encounter an object with more omap keys than this")
+    .add_service("osd")
+    .add_see_also("osd_deep_scrub_large_omap_object_value_sum_threshold"),
+
+    Option("osd_deep_scrub_large_omap_object_value_sum_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_G)
+    .set_description("Warn when we encounter an object with more omap key bytes than this")
+    .add_service("osd")
+    .add_see_also("osd_deep_scrub_large_omap_object_key_threshold"),
+
+    Option("osd_class_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(CEPH_LIBDIR "/rados-classes")
+    .set_description(""),
+
+    Option("osd_open_classes_on_start", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_class_load_list", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("cephfs hello journal lock log numops " "otp rbd refcount rgw timeindex user version cas")
+    .set_description(""),
+
+    Option("osd_class_default_list", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("cephfs hello journal lock log numops " "otp rbd refcount rgw timeindex user version cas")
+    .set_description(""),
+
+    Option("osd_check_for_log_corruption", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_use_stale_snap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_rollback_to_cluster_snap", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("osd_default_notify_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description(""),
+
+    Option("osd_kill_backfill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_pg_epoch_persisted_max_stale", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(40)
+    .set_description(""),
+
+    Option("osd_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3000)
+    .set_description("minimum number of entries to maintain in the PG log")
+    .add_service("osd")
+    .add_see_also("osd_max_pg_log_entries")
+    .add_see_also("osd_pg_log_dups_tracked"),
+
+    Option("osd_max_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3000)
+    .set_description("maximum number of entries to maintain in the PG log when degraded before we trim")
+    .add_service("osd")
+    .add_see_also("osd_min_pg_log_entries")
+    .add_see_also("osd_pg_log_dups_tracked"),
+
+    Option("osd_pg_log_dups_tracked", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3000)
+    .set_description("how many versions back to track in order to detect duplicate ops; this is combined with both the regular pg log entries and additional minimal dup detection entries")
+    .add_service("osd")
+    .add_see_also("osd_min_pg_log_entries")
+    .add_see_also("osd_max_pg_log_entries"),
+
+    Option("osd_force_recovery_pg_log_entries_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.3)
+    .set_description(""),
+
+    Option("osd_pg_log_trim_min", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description(""),
+
+    Option("osd_force_auth_primary_missing_objects", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description("Approximate missing objects above which to force auth_log_shard to be primary temporarily"),
+
+    Option("osd_async_recovery_min_cost", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description("A mixture measure of number of current log entries difference "
+                     "and historical missing objects,  above which we switch to use "
+                     "asynchronous recovery when appropriate"),
+
+    Option("osd_max_pg_per_osd_hard_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_min(1)
+    .set_description("Maximum number of PG per OSD, a factor of 'mon_max_pg_per_osd'")
+    .set_long_description("OSD will refuse to instantiate PG if the number of PG it serves exceeds this number.")
+    .add_see_also("mon_max_pg_per_osd"),
+
+    Option("osd_pg_log_trim_max", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("maximum number of entries to remove at once from the PG log")
+    .add_service("osd")
+    .add_see_also("osd_min_pg_log_entries")
+    .add_see_also("osd_max_pg_log_entries"),
+
+    Option("osd_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description(""),
+
+    Option("osd_command_max_records", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(256)
+    .set_description(""),
+
+    Option("osd_max_pg_blocked_by", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(16)
+    .set_description(""),
+
+    Option("osd_op_log_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description(""),
+
+    Option("osd_verify_sparse_read_holes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_backoff_on_unfound", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_backoff_on_degraded", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_backoff_on_peering", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_shutdown", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Turn up debug levels during shutdown"),
+
+    Option("osd_debug_crash_on_ignored_backoff", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_inject_dispatch_delay_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_debug_inject_dispatch_delay_duration", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.1)
+    .set_description(""),
+
+    Option("osd_debug_drop_ping_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_debug_drop_ping_duration", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_debug_op_order", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_verify_missing_on_start", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_verify_snaps", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_verify_stray_on_activate", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_skip_full_check_in_backfill_reservation", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_reject_backfill_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_debug_inject_copyfrom_error", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_misdirected_ops", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_skip_full_check_in_recovery", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_random_push_read_error", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_debug_verify_cached_snaps", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_debug_deep_scrub_sleep", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Inject an expensive sleep during deep scrub IO to make it easier to induce preemption"),
+
+    Option("osd_debug_no_acting_change", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false),
+    Option("osd_debug_no_purge_strays", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false),
+
+    Option("osd_debug_pretend_recovery_active", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_num_op_tracker_shard", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description(""),
+
+    Option("osd_op_history_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .set_description(""),
+
+    Option("osd_op_history_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .set_description(""),
+
+    Option("osd_op_history_slow_op_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .set_description(""),
+
+    Option("osd_op_history_slow_op_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10.0)
+    .set_description(""),
+
+    Option("osd_target_transaction_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description(""),
+
+    Option("osd_delete_sleep", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Time in seconds to sleep before next removal transaction (overrides values below)"),
+
+    Option("osd_delete_sleep_hdd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Time in seconds to sleep before next removal transaction for HDDs"),
+
+    Option("osd_delete_sleep_ssd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Time in seconds to sleep before next removal transaction for SSDs"),
+
+    Option("osd_delete_sleep_hybrid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Time in seconds to sleep before next removal transaction when data is on HDD and journal is on SSD"),
+
+    Option("osd_failsafe_full_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.97)
+    .set_description(""),
+
+    Option("osd_fast_shutdown", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Fast, immediate shutdown")
+    .set_long_description("Setting this to false makes the OSD do a slower teardown of all state when it receives a SIGINT or SIGTERM or when shutting down for any other reason.  That slow shutdown is primarily useful for doing memory leak checking with valgrind."),
+
+    Option("osd_fast_fail_on_connection_refused", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_pg_object_context_cache_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(64)
+    .set_description(""),
+
+    Option("osd_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_function_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_fast_info", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_debug_pg_log_writeout", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_loop_before_reset_tphandle", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(64)
+    .set_description(""),
+
+    Option("threadpool_default_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .set_description(""),
+
+    Option("threadpool_empty_queue_max_wait", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description(""),
+
+    Option("leveldb_log_to_ceph_log", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("leveldb_write_buffer_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(8_M)
+    .set_description(""),
+
+    Option("leveldb_cache_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128_M)
+    .set_description(""),
+
+    Option("leveldb_block_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("leveldb_bloom_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("leveldb_max_open_files", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("leveldb_compression", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("leveldb_paranoid", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("leveldb_log", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/dev/null")
+    .set_description(""),
+
+    Option("leveldb_compact_on_mount", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("kinetic_host", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("kinetic_port", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(8123)
+    .set_description(""),
+
+    Option("kinetic_user_id", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description(""),
+
+    Option("kinetic_hmac_key", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("asdfasdf")
+    .set_description(""),
+
+    Option("kinetic_use_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("rocksdb_log_to_ceph_log", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("rocksdb_cache_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(512_M)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description(""),
+
+    Option("rocksdb_cache_row_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("rocksdb_cache_shard_bits", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description(""),
+
+    Option("rocksdb_cache_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("binned_lru")
+    .set_description(""),
+
+    Option("rocksdb_block_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4_K)
+    .set_description(""),
+
+    Option("rocksdb_perf", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("rocksdb_collect_compaction_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("rocksdb_collect_extended_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("rocksdb_collect_memory_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("rocksdb_enable_rmrange", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Refer to github.com/facebook/rocksdb/wiki/DeleteRange-Implementation"),
+
+    Option("rocksdb_max_items_rmrange", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("Delete Range will be called if number of keys exceeded, must enable rocksdb_enable_rmrange first")
+    .add_see_also("rocksdb_enable_rmrange"),
+
+    Option("rocksdb_bloom_bits_per_key", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .set_description("Number of bits per key to use for RocksDB's bloom filters.")
+    .set_long_description("RocksDB bloom filters can be used to quickly answer the question of whether or not a key may exist or definitely does not exist in a given RocksDB SST file without having to read all keys into memory.  Using a higher bit value decreases the likelihood of false positives at the expense of additional disk space and memory consumption when the filter is loaded into RAM.  The current default value of 20 was found to provide significant performance gains when getattr calls are made (such as during new object creation in bluestore) without significant memory overhead or cache pollution when combined with rocksdb partitioned index filters.  See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters for more information."),
+
+    Option("rocksdb_cache_index_and_filter_blocks", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("Whether to cache indices and filters in block cache")
+    .set_long_description("By default RocksDB will load an SST file's index and bloom filters into memory when it is opened and remove them from memory when an SST file is closed.  Thus, memory consumption by indices and bloom filters is directly tied to the number of concurrent SST files allowed to be kept open.  This option instead stores cached indices and filters in the block cache where they directly compete with other cached data.  By default we set this option to true to better account for and bound rocksdb memory usage and keep filters in memory even when an SST file is closed."),
+
+    Option("rocksdb_cache_index_and_filter_blocks_with_high_priority", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("Whether to cache indices and filters in the block cache with high priority")
+    .set_long_description("A downside of setting rocksdb_cache_index_and_filter_blocks to true is that regular data can push indices and filters out of memory.  Setting this option to true means they are cached with higher priority than other data and should typically stay in the block cache."),
+
+    Option("rocksdb_pin_l0_filter_and_index_blocks_in_cache", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Whether to pin Level 0 indices and bloom filters in the block cache")
+    .set_long_description("A downside of setting rocksdb_cache_index_and_filter_blocks to true is that regular data can push indices and filters out of memory.  Setting this option to true means that level 0 SST files will always have their indices and filters pinned in the block cache."),
+
+    Option("rocksdb_index_type", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("binary_search")
+    .set_description("Type of index for SST files: binary_search, hash_search, two_level")
+    .set_long_description("This option controls the table index type.  binary_search is a space efficient index block that is optimized for binary-search-based index. hash_search may improve prefix lookup performance at the expense of higher disk and memory usage and potentially slower compactions.  two_level is an experimental index type that uses two binary search indexes and works in conjunction with partition filters.  See: http://rocksdb.org/blog/2017/05/12/partitioned-index-filter.html"),
+
+    Option("rocksdb_partition_filters", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("(experimental) partition SST index/filters into smaller blocks")
+    .set_long_description("This is an experimental option for rocksdb that works in conjunction with two_level indices to avoid having to keep the entire filter/index in cache when cache_index_and_filter_blocks is true.  The idea is to keep a much smaller top-level index in heap/cache and then opportunistically cache the lower level indices.  See: https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters"),
+
+    Option("rocksdb_metadata_block_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(4_K)
+    .set_description("The block size for index partitions. (0 = rocksdb default)"),
+
+    Option("mon_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("write_buffer_size=33554432,"
+		 "compression=kNoCompression,"
+		 "level_compaction_dynamic_level_bytes=true")
+    .set_description(""),
+
+    Option("osd_client_op_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(63)
+    .set_description(""),
+
+    Option("osd_recovery_op_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_description("Priority to use for recovery operations if not specified for the pool"),
+
+    Option("osd_peering_op_priority", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(255)
+    .set_description(""),
+
+    Option("osd_snap_trim_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description(""),
+
+    Option("osd_snap_trim_cost", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1<<20)
+    .set_description(""),
+
+    Option("osd_pg_delete_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description(""),
+
+    Option("osd_pg_delete_cost", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1<<20)
+    .set_description(""),
+
+    Option("osd_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Priority for scrub operations in work queue"),
+
+    Option("osd_scrub_cost", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(50<<20)
+    .set_description("Cost for scrub operations in work queue"),
+
+    Option("osd_requested_scrub_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(120)
+    .set_description(""),
+
+    Option("osd_recovery_priority", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Priority of recovery in the work queue")
+    .set_long_description("Not related to a pool's recovery_priority"),
+
+    Option("osd_recovery_cost", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(20<<20)
+    .set_description(""),
+
+    Option("osd_recovery_op_warn_multiple", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(16)
+    .set_description(""),
+
+    Option("osd_mon_shutdown_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description(""),
+
+    Option("osd_shutdown_pgref_assert", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_max_object_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128_M)
+    .set_description(""),
+
+    Option("osd_max_object_name_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(2048)
+    .set_description(""),
+
+    Option("osd_max_object_namespace_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(256)
+    .set_description(""),
+
+    Option("osd_max_attr_name_len", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description(""),
+
+    Option("osd_max_attr_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("osd_max_omap_entries_per_request", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(131072)
+    .set_description(""),
+
+    Option("osd_max_omap_bytes_per_request", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_G)
+    .set_description(""),
+
+    Option("osd_objectstore", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("bluestore")
+    .set_enum_allowed({"bluestore", "filestore", "memstore", "kstore"})
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("backend type for an OSD (like filestore or bluestore)"),
+
+    Option("osd_objectstore_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_objectstore_fuse", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_bench_small_size_max_iops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description(""),
+
+    Option("osd_bench_large_size_max_throughput", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(100_M)
+    .set_description(""),
+
+    Option("osd_bench_max_block_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_M)
+    .set_description(""),
+
+    Option("osd_bench_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description(""),
+
+    Option("osd_blkin_trace_all", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osdc_blkin_trace_all", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("osd_discard_disconnected_ops", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("osd_memory_target", Option::TYPE_SIZE, Option::LEVEL_BASIC)
+    .set_default(4_G)
+    .set_min(896_M)
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_see_also("bluestore_cache_autotune")
+    .add_see_also("osd_memory_cache_min")
+    .add_see_also("osd_memory_base")
+    .set_description("When tcmalloc and cache autotuning is enabled, try to keep this many bytes mapped in memory.")
+    .set_long_description("The minimum value must be at least equal to osd_memory_base + osd_memory_cache_min."),
+
+    Option("osd_memory_target_cgroup_limit_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.8)
+    .set_min_max(0.0, 1.0)
+    .add_see_also("osd_memory_target")
+    .set_description("Set the default value for osd_memory_target to the cgroup memory limit (if set) times this value")
+    .set_long_description("A value of 0 disables this feature."),
+
+    Option("osd_memory_base", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(768_M)
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_see_also("bluestore_cache_autotune")
+    .set_description("When tcmalloc and cache autotuning is enabled, estimate the minimum amount of memory in bytes the OSD will need."),
+
+    Option("osd_memory_expected_fragmentation", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.15)
+    .set_min_max(0.0, 1.0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_see_also("bluestore_cache_autotune")
+    .set_description("When tcmalloc and cache autotuning is enabled, estimate the percent of memory fragmentation."),
+
+    Option("osd_memory_cache_min", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(128_M)
+    .set_min(128_M)
+    .set_flag(Option::FLAG_RUNTIME)
+    .add_see_also("bluestore_cache_autotune")
+    .set_description("When tcmalloc and cache autotuning is enabled, set the minimum amount of memory used for caches."),
+
+    Option("osd_memory_cache_resize_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1)
+    .add_see_also("bluestore_cache_autotune")
+    .set_description("When tcmalloc and cache autotuning is enabled, wait this many seconds between resizing caches."),
+
+    Option("memstore_device_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_G)
+    .set_description(""),
+
+    Option("memstore_page_set", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("memstore_page_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .set_description(""),
+
+    Option("objectstore_blackhole", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    // --------------------------
+    // bluestore
+
+    Option("bdev_debug_inflight_ios", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bdev_inject_crash", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("bdev_inject_crash_flush_delay", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(2)
+    .set_description(""),
+
+    Option("bdev_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("bdev_aio_poll_ms", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(250)
+    .set_description(""),
+
+    Option("bdev_aio_max_queue_depth", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description(""),
+
+    Option("bdev_aio_reap_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(16)
+    .set_description(""),
+
+    Option("bdev_block_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4_K)
+    .set_description(""),
+
+    Option("bdev_debug_aio", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bdev_debug_aio_suicide_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(60.0)
+    .set_description(""),
+
+    Option("bdev_debug_aio_log_age", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(5.0)
+    .set_description(""),
+
+    Option("bdev_nvme_unbind_from_kernel", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bdev_nvme_retry_count", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(-1)
+    .set_description(""),
+
+    Option("bdev_enable_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bdev_async_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+    
+    Option("bdev_flock_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.1)
+    .set_description("interval to retry the flock"),
+    
+    Option("bdev_flock_retry", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_description("times to retry the flock")
+    .set_long_description(
+        "The number of times to retry on getting the block device lock. "
+        "Programs such as systemd-udevd may compete with Ceph for this lock. "
+        "0 means 'unlimited'."),
+
+    Option("bluefs_alloc_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .set_description("Allocation unit size for DB and WAL devices"),
+
+    Option("bluefs_shared_alloc_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .set_description("Allocation unit size for primary/shared device"),
+
+    Option("bluefs_max_prefetch", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .set_description(""),
+
+    Option("bluefs_min_log_runway", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .set_description(""),
+
+    Option("bluefs_max_log_runway", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4194304)
+    .set_description(""),
+
+    Option("bluefs_log_compact_min_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5.0)
+    .set_description(""),
+
+    Option("bluefs_log_compact_min_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(16_M)
+    .set_description(""),
+
+    Option("bluefs_min_flush_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(512_K)
+    .set_description(""),
+
+    Option("bluefs_compact_log_sync", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluefs_buffered_io", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Enabled buffered IO for bluefs reads.")
+    .set_long_description("When this option is enabled, bluefs will in some cases perform buffered reads.  This allows the kernel page cache to act as a secondary cache for things like RocksDB compaction.  For example, if the rocksdb block cache isn't large enough to hold blocks from the compressed SST files itself, they can be read from page cache instead of from the disk.  This option previously was enabled by default, however in some test cases it appears to cause excessive swap utilization by the linux kernel and a large negative performance impact after several hours of run time.  Please exercise caution when enabling."),
+
+    Option("bluefs_sync_write", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluefs_allocator", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("hybrid")
+    .set_enum_allowed({"bitmap", "stupid", "avl", "hybrid"})
+    .set_description(""),
+
+    Option("bluefs_preextend_wal_files", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluefs_replay_recovery", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Attempt to read bluefs log so large that it became unreadable.")
+    .set_long_description("If BlueFS log grows to extreme sizes (200GB+) it is likely that it becomes unreadable. "
+			  "This option enables heuristics that scan devices for missing data. "
+			  "DO NOT ENABLE BY DEFAULT"),
+
+    Option("bluefs_replay_recovery_disable_compact", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluefs_check_for_zeros", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Check data read for suspicious pages")
+    .set_long_description("Looks into data read to check if there is a 4K block entirely filled with zeros. "
+			  "If this happens, we re-read data. If there is difference, we print error to log.")
+    .add_see_also("bluestore_retry_disk_reads"),
+
+    Option("bluestore_bluefs", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Use BlueFS to back rocksdb")
+    .set_long_description("BlueFS allows rocksdb to share the same physical device(s) as the rest of BlueStore.  It should be used in all cases unless testing/developing an alternative metadata database for BlueStore."),
+
+    Option("bluestore_bluefs_env_mirror", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Mirror bluefs data to file system for testing/validation"),
+
+    Option("bluestore_bluefs_min", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_G)
+    .set_description("minimum disk space allocated to BlueFS (e.g., at mkfs)"),
+
+    Option("bluestore_bluefs_min_free", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_G)
+    .set_description("minimum free space allocated to BlueFS"),
+
+    Option("bluestore_bluefs_min_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.02)
+    .set_description("Minimum fraction of free space devoted to BlueFS"),
+
+    Option("bluestore_bluefs_max_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.90)
+    .set_description("Maximum fraction of free storage devoted to BlueFS"),
+
+    Option("bluestore_bluefs_gift_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.02)
+    .set_description("Maximum fraction of free space to give to BlueFS at once"),
+
+    Option("bluestore_bluefs_reclaim_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.20)
+    .set_description("Maximum fraction of free space to reclaim from BlueFS at once"),
+
+    Option("bluestore_bluefs_balance_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("How frequently (in seconds) to balance free space between BlueFS and BlueStore"),
+
+    Option("bluestore_bluefs_alloc_failure_dump_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("How frequently (in seconds) to dump allocator on "
+      "BlueFS space allocation failure"),
+
+    Option("bluestore_bluefs_db_compatibility", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("Sync db with legacy bluefs extents info")
+    .set_long_description("Enforces db sync with legacy bluefs extents information on close."
+                          " Enables downgrades to pre-nautilus releases"),
+
+    Option("bluestore_spdk_mem", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(512)
+    .set_description("Amount of dpdk memory size in MB")
+    .set_long_description("If running multiple SPDK instances per node, you must specify the amount of dpdk memory size in MB each instance will use, to make sure each instance uses its own dpdk memory"),
+
+    Option("bluestore_spdk_coremask", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("0x1")
+    .set_description("A hexadecimal bit mask of the cores to run on. Note the core numbering can change between platforms and should be determined beforehand"),
+
+    Option("bluestore_spdk_max_io_completion", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Maximal I/Os to be batched completed while checking queue pair completions, 0 means let spdk library determine it"),
+
+    Option("bluestore_spdk_io_sleep", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(5)
+    .set_description("Time period to wait if there is no completed I/O from polling"),
+
+    Option("bluestore_block_path", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Path to block device/file"),
+
+    Option("bluestore_block_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(100_G)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Size of file to create for backing bluestore"),
+
+    Option("bluestore_block_create", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Create bluestore_block_path if it doesn't exist")
+    .add_see_also("bluestore_block_path").add_see_also("bluestore_block_size"),
+
+    Option("bluestore_block_db_path", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Path for db block device"),
+
+    Option("bluestore_block_db_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Size of file to create for bluestore_block_db_path"),
+
+    Option("bluestore_block_db_create", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Create bluestore_block_db_path if it doesn't exist")
+    .add_see_also("bluestore_block_db_path")
+    .add_see_also("bluestore_block_db_size"),
+
+    Option("bluestore_block_wal_path", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Path to block device/file backing bluefs wal"),
+
+    Option("bluestore_block_wal_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(96_M)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Size of file to create for bluestore_block_wal_path"),
+
+    Option("bluestore_block_wal_create", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Create bluestore_block_wal_path if it doesn't exist")
+    .add_see_also("bluestore_block_wal_path")
+    .add_see_also("bluestore_block_wal_size"),
+
+    Option("bluestore_block_preallocate_file", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Preallocate file created via bluestore_block*_create"),
+
+    Option("bluestore_ignore_data_csum", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Ignore checksum errors on read and do not generate an EIO error"),
+
+    Option("bluestore_csum_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("crc32c")
+    .set_enum_allowed({"none", "crc32c", "crc32c_16", "crc32c_8", "xxhash32", "xxhash64"})
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default checksum algorithm to use")
+    .set_long_description("crc32c, xxhash32, and xxhash64 are available.  The _16 and _8 variants use only a subset of the bits for more compact (but less reliable) checksumming."),
+
+    Option("bluestore_retry_disk_reads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_min_max(0, 255)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Number of read retries on checksum validation error")
+    .set_long_description("Retries to read data from the disk this many times when checksum validation fails to handle spurious read errors gracefully."),
+
+    Option("bluestore_min_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Minimum allocation size to allocate for an object")
+    .set_long_description("A smaller allocation size generally means less data is read and then rewritten when a copy-on-write operation is triggered (e.g., when writing to something that was recently snapshotted).  Similarly, less data is journaled before performing an overwrite (writes smaller than min_alloc_size must first pass through the BlueStore journal).  Larger values of min_alloc_size reduce the amount of metadata required to describe the on-disk layout and reduce overall fragmentation."),
+
+    Option("bluestore_min_alloc_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Default min_alloc_size value for rotational media")
+    .add_see_also("bluestore_min_alloc_size"),
+
+    Option("bluestore_min_alloc_size_ssd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4_K)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Default min_alloc_size value for non-rotational (solid state)  media")
+    .add_see_also("bluestore_min_alloc_size"),
+
+    Option("bluestore_max_alloc_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Maximum size of a single allocation (0 for no max)"),
+
+    Option("bluestore_prefer_deferred_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Writes smaller than this size will be written to the journal and then asynchronously written to the device.  This can be beneficial when using rotational media where seeks are expensive, and is helpful both with and without solid state journal/wal devices."),
+
+    Option("bluestore_prefer_deferred_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(32768)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default bluestore_prefer_deferred_size for rotational media")
+    .add_see_also("bluestore_prefer_deferred_size"),
+
+    Option("bluestore_prefer_deferred_size_ssd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default bluestore_prefer_deferred_size for non-rotational (solid state) media")
+    .add_see_also("bluestore_prefer_deferred_size"),
+
+    Option("bluestore_compression_mode", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("none")
+    .set_enum_allowed({"none", "passive", "aggressive", "force"})
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default policy for using compression when pool does not specify")
+    .set_long_description("'none' means never use compression.  'passive' means use compression when clients hint that data is compressible.  'aggressive' means use compression unless clients hint that data is not compressible.  'force' means always try to compress data, regardless of client hints.  This option is used when the per-pool property for the compression mode is not present."),
+
+    Option("bluestore_compression_algorithm", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("snappy")
+    .set_enum_allowed({"", "snappy", "zlib", "zstd", "lz4"})
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default compression algorithm to use when writing object data")
+    .set_long_description("This controls the default compressor to use (if any) if the per-pool property is not set.  Note that zstd is *not* recommended for bluestore due to high CPU overhead when compressing small amounts of data."),
+
+    Option("bluestore_compression_min_blob_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Maximum chunk size to apply compression to when random access is expected for an object.")
+    .set_long_description("Chunks larger than this are broken into smaller chunks before being compressed"),
+
+    Option("bluestore_compression_min_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128_K)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default value of bluestore_compression_min_blob_size for rotational media")
+    .add_see_also("bluestore_compression_min_blob_size"),
+
+    Option("bluestore_compression_min_blob_size_ssd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(8_K)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default value of bluestore_compression_min_blob_size for non-rotational (solid state) media")
+    .add_see_also("bluestore_compression_min_blob_size"),
+
+    Option("bluestore_compression_max_blob_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Maximum chunk size to apply compression to when non-random access is expected for an object.")
+    .set_long_description("Chunks larger than this are broken into smaller chunks before being compressed"),
+
+    Option("bluestore_compression_max_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(512_K)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default value of bluestore_compression_max_blob_size for rotational media")
+    .add_see_also("bluestore_compression_max_blob_size"),
+
+    Option("bluestore_compression_max_blob_size_ssd", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default value of bluestore_compression_max_blob_size for non-rotational (solid state) media")
+    .add_see_also("bluestore_compression_max_blob_size"),
+
+    Option("bluestore_gc_enable_blob_threshold", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description(""),
+
+    Option("bluestore_gc_enable_total_threshold", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description(""),
+
+    Option("bluestore_max_blob_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("")
+    .set_long_description("Bluestore blobs are collections of extents (ie on-disk data) originating from one or more objects.  Blobs can be compressed, typically have checksum data, may be overwritten, may be shared (with an extent ref map), or split.  This setting controls the maximum size a blob is allowed to be."),
+
+    Option("bluestore_max_blob_size_hdd", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(512_K)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("")
+    .add_see_also("bluestore_max_blob_size"),
+
+    Option("bluestore_max_blob_size_ssd", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(64_K)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("")
+    .add_see_also("bluestore_max_blob_size"),
+
+    Option("bluestore_compression_required_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.875)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Compression ratio required to store compressed data")
+    .set_long_description("If we compress data and get less than this we discard the result and store the original uncompressed data."),
+
+    Option("bluestore_extent_map_shard_max_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(1200)
+    .set_description("Max size (bytes) for a single extent map shard before splitting"),
+
+    Option("bluestore_extent_map_shard_target_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(500)
+    .set_description("Target size (bytes) for a single extent map shard"),
+
+    Option("bluestore_extent_map_shard_min_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(150)
+    .set_description("Min size (bytes) for a single extent map shard before merging"),
+
+    Option("bluestore_extent_map_shard_target_size_slop", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.2)
+    .set_description("Ratio above/below target for a shard when trying to align to an existing extent or blob boundary"),
+
+    Option("bluestore_extent_map_inline_shard_prealloc_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(256)
+    .set_description("Preallocated buffer for inline shards"),
+
+    Option("bluestore_cache_trim_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.05)
+    .set_description("How frequently we trim the bluestore cache"),
+
+    Option("bluestore_cache_trim_max_skip_pinned", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(64)
+    .set_description("Max pinned cache entries we consider before giving up"),
+
+    Option("bluestore_cache_type", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("2q")
+    .set_enum_allowed({"2q", "lru"})
+    .set_description("Cache replacement algorithm"),
+
+    Option("bluestore_2q_cache_kin_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.5)
+    .set_description("2Q paper suggests .5"),
+
+    Option("bluestore_2q_cache_kout_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.5)
+    .set_description("2Q paper suggests .5"),
+
+    Option("bluestore_cache_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Cache size (in bytes) for BlueStore")
+    .set_long_description("This includes data and metadata cached by BlueStore as well as memory devoted to rocksdb's cache(s)."),
+
+    Option("bluestore_cache_size_hdd", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(1_G)
+    .set_description("Default bluestore_cache_size for rotational media")
+    .add_see_also("bluestore_cache_size"),
+
+    Option("bluestore_cache_size_ssd", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(3_G)
+    .set_description("Default bluestore_cache_size for non-rotational (solid state) media")
+    .add_see_also("bluestore_cache_size"),
+
+    Option("bluestore_cache_meta_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.4)
+    .add_see_also("bluestore_cache_size")
+    .set_description("Ratio of bluestore cache to devote to metadata"),
+
+    Option("bluestore_cache_kv_ratio", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.4)
+    .add_see_also("bluestore_cache_size")
+    .set_description("Ratio of bluestore cache to devote to kv database (rocksdb)"),
+
+    Option("bluestore_cache_autotune", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .add_see_also("bluestore_cache_size")
+    .add_see_also("bluestore_cache_meta_ratio")
+    .set_description("Automatically tune the ratio of caches while respecting min values."),
+
+    Option("bluestore_cache_autotune_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(5)
+    .add_see_also("bluestore_cache_autotune")
+    .set_description("The number of seconds to wait between rebalances when cache autotune is enabled."),
+
+    Option("bluestore_kvbackend", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("rocksdb")
+    .set_flag(Option::FLAG_CREATE)
+    .set_description("Key value database to use for bluestore"),
+
+    Option("bluestore_allocator", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("hybrid")
+    .set_enum_allowed({"bitmap", "stupid", "avl", "hybrid"})
+    .set_description("Allocator policy")
+    .set_long_description("Allocator to use for bluestore.  Stupid should only be used for testing."),
+
+    Option("bluestore_freelist_blocks_per_key", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(128)
+    .set_description("Block (and bits) per database key"),
+
+    Option("bluestore_bitmapallocator_blocks_per_zone", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(1024)
+    .set_description(""),
+
+    Option("bluestore_bitmapallocator_span_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(1024)
+    .set_description(""),
+
+    Option("bluestore_max_deferred_txc", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description("Max transactions with deferred writes that can accumulate before we force flush deferred writes"),
+
+    Option("bluestore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152,max_background_compactions=2")
+    .set_description("Rocksdb options"),
+
+    Option("bluestore_rocksdb_cf", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Enable use of rocksdb column families for bluestore metadata"),
+
+    Option("bluestore_rocksdb_cfs", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("M= P= L=")
+    .set_description("List of whitespace-separated key/value pairs where key is CF name and value is CF options"),
+
+    Option("bluestore_fsck_on_mount", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Run fsck at mount"),
+
+    Option("bluestore_fsck_on_mount_deep", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Run deep fsck at mount when bluestore_fsck_on_mount is set to true"),
+
+    Option("bluestore_fsck_quick_fix_on_mount", Option::TYPE_BOOL, Option::LEVEL_DEV)
+      .set_default(true)
+      .set_description("Do quick-fix for the store at mount"),
+
+    Option("bluestore_fsck_on_umount", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Run fsck at umount"),
+
+    Option("bluestore_fsck_on_umount_deep", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Run deep fsck at umount when bluestore_fsck_on_umount is set to true"),
+
+    Option("bluestore_fsck_on_mkfs", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("Run fsck after mkfs"),
+
+    Option("bluestore_fsck_on_mkfs_deep", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Run deep fsck after mkfs"),
+
+    Option("bluestore_sync_submit_transaction", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Try to submit metadata transaction to rocksdb in queuing thread context"),
+
+    Option("bluestore_fsck_read_bytes_cap", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_M)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Maximum bytes read at once by deep fsck"),
+
+    Option("bluestore_fsck_quick_fix_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+      .set_default(2)
+      .set_description("Number of additional threads to perform quick-fix (shallow fsck) command"),
+
+    Option("bluestore_throttle_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_M)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Maximum bytes in flight before we throttle IO submission"),
+
+    Option("bluestore_throttle_deferred_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128_M)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Maximum bytes for deferred writes before we throttle IO submission"),
+
+    Option("bluestore_throttle_cost_per_io", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Overhead added to transaction cost (in bytes) for each IO"),
+
+  Option("bluestore_throttle_cost_per_io_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(670000)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default bluestore_throttle_cost_per_io for rotational media")
+    .add_see_also("bluestore_throttle_cost_per_io"),
+
+    Option("bluestore_throttle_cost_per_io_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(4000)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default bluestore_throttle_cost_per_io for non-rotational (solid state) media")
+    .add_see_also("bluestore_throttle_cost_per_io"),
+
+    Option("bluestore_deferred_batch_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Max number of deferred writes before we flush the deferred write queue"),
+
+    Option("bluestore_deferred_batch_ops_hdd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(64)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default bluestore_deferred_batch_ops for rotational media")
+    .add_see_also("bluestore_deferred_batch_ops"),
+
+    Option("bluestore_deferred_batch_ops_ssd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(16)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Default bluestore_deferred_batch_ops for non-rotational (solid state) media")
+    .add_see_also("bluestore_deferred_batch_ops"),
+
+    Option("bluestore_nid_prealloc", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(1024)
+    .set_description("Number of unique object ids to preallocate at a time"),
+
+    Option("bluestore_blobid_prealloc", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(10240)
+    .set_description("Number of unique blob ids to preallocate at a time"),
+
+    Option("bluestore_clone_cow", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Use copy-on-write when cloning objects (versus reading and rewriting them at clone time)"),
+
+    Option("bluestore_default_buffered_read", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Cache read results by default (unless hinted NOCACHE or WONTNEED)"),
+
+    Option("bluestore_default_buffered_write", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Cache writes by default (unless hinted NOCACHE or WONTNEED)"),
+
+    Option("bluestore_debug_misc", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_no_reuse_blocks", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_small_allocations", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+    Option("bluestore_debug_max_cached_onodes", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("This allows to explicitly cap number of onode entries per cache shard "
+                     "effectively bypassing all the smart but indirect cache adjustment logic."
+                     " Intended for testing purposes only "),
+    Option("bluestore_debug_too_many_blobs_threshold", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(24*1024)
+    .set_description(""),
+
+    Option("bluestore_debug_freelist", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_prefill", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("simulate fragmentation"),
+
+    Option("bluestore_debug_prefragment_max", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(1_M)
+    .set_description(""),
+
+    Option("bluestore_debug_inject_read_err", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_randomize_serial_transaction", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("bluestore_debug_omit_block_device_write", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_fsck_abort", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_omit_kv_commit", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_permit_any_bdev_label", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_random_read_err", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("bluestore_debug_inject_bug21040", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("bluestore_debug_inject_csum_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.0)
+    .set_description("inject crc verification errors into bluestore device reads"),
+
+    Option("bluestore_fsck_error_on_no_per_pool_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Make fsck error (instead of warn) when bluestore lacks per-pool stats, e.g., after an upgrade"),
+
+    Option("bluestore_warn_on_bluefs_spillover", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enable health indication on bluefs slow device usage"),
+
+    Option("bluestore_warn_on_legacy_statfs", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enable health indication on lack of per-pool statfs reporting from bluestore"),
+
+    Option("bluestore_log_op_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("log operation if it's slower than this age (seconds)"),
+
+    Option("bluestore_log_omap_iterator_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("log omap iteration operation if it's slower than this age (seconds)"),
+
+    Option("bluestore_log_collection_list_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .set_description("log collection list operation if it's slower than this age (seconds)"),
+
+    Option("bluestore_volume_selection_policy", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("use_some_extra")
+    .set_enum_allowed({ "rocksdb_original", "use_some_extra" })
+    .set_description("Determines bluefs volume selection policy")
+    .set_long_description("Determines bluefs volume selection policy. 'use_some_extra' policy allows to override RocksDB level granularity and put high level's data to faster device even when the level doesn't completely fit there"),
+
+    Option("bluestore_volume_selection_reserved_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+      .set_flag(Option::FLAG_STARTUP)
+      .set_default(2.0)
+      .set_description("DB level size multiplier. Determines amount of space at DB device to bar from the usage when 'use some extra' policy is in action. Reserved size is determined as sum(L_max_size[0], L_max_size[L-1]) + L_max_size[L] * this_factor"),
+
+    Option("bluestore_volume_selection_reserved", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+      .set_flag(Option::FLAG_STARTUP)
+      .set_default(0)
+      .set_description("Space reserved at DB device and not allowed for 'use some extra' policy usage. Overrides 'bluestore_volume_selection_reserved_factor' setting and introduces straightforward limit."),
+
+    Option("bluestore_avl_alloc_bf_threshold", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(131072)
+    .set_description(""),
+
+    Option("bluestore_avl_alloc_bf_free_pct", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(4)
+    .set_description(""),
+
+    Option("bluestore_hybrid_alloc_mem_cap", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(64_M)
+    .set_description("Maximum RAM hybrid allocator should use before enabling bitmap supplement"),
+
+    Option("bluestore_kv_sync_util_logging_s", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10.0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("KV sync thread utilization logging period")
+    .set_long_description("How often (in seconds) to print KV sync thread utilization, "
+      "not logged when set to 0 or when utilization is 0%"),
+
+
+    // -----------------------------------------
+    // kstore
+
+    Option("kstore_max_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(512)
+    .set_description(""),
+
+    Option("kstore_max_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_M)
+    .set_description(""),
+
+    Option("kstore_backend", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("rocksdb")
+    .set_description(""),
+
+    Option("kstore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("compression=kNoCompression")
+    .set_description("Options to pass through when RocksDB is used as the KeyValueDB for kstore."),
+
+    Option("kstore_fsck_on_mount", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Whether or not to run fsck on mount for kstore."),
+
+    Option("kstore_fsck_on_mount_deep", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Whether or not to run deep fsck on mount for kstore"),
+
+    Option("kstore_nid_prealloc", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description(""),
+
+    Option("kstore_sync_transaction", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("kstore_sync_submit_transaction", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("kstore_onode_map_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description(""),
+
+    Option("kstore_default_stripe_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(65536)
+    .set_description(""),
+
+    // ---------------------
+    // filestore
+
+    Option("filestore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("max_background_jobs=10,compaction_readahead_size=2097152,compression=kNoCompression")
+    .set_description("Options to pass through when RocksDB is used as the KeyValueDB for filestore."),
+
+    Option("filestore_omap_backend", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("rocksdb")
+    .set_enum_allowed({"leveldb", "rocksdb"})
+    .set_description("The KeyValueDB to use for filestore metadata (ie omap)."),
+
+    Option("filestore_omap_backend_path", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description("The path where the filestore KeyValueDB should store its database(s)."),
+
+    Option("filestore_wbthrottle_enable", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enabling throttling of operations to backing file system"),
+
+    Option("filestore_wbthrottle_btrfs_bytes_start_flusher", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(41943040)
+    .set_description("Start flushing (fsyncing) when this many bytes are written(btrfs)"),
+
+    Option("filestore_wbthrottle_btrfs_bytes_hard_limit", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(419430400)
+    .set_description("Block writes when this many bytes haven't been flushed (fsynced) (btrfs)"),
+
+    Option("filestore_wbthrottle_btrfs_ios_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .set_description("Start flushing (fsyncing) when this many IOs are written (btrfs)"),
+
+    Option("filestore_wbthrottle_btrfs_ios_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5000)
+    .set_description("Block writes when this many IOs haven't been flushed (fsynced) (btrfs)"),
+
+    Option("filestore_wbthrottle_btrfs_inodes_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .set_description("Start flushing (fsyncing) when this many distinct inodes have been modified (btrfs)"),
+
+    Option("filestore_wbthrottle_xfs_bytes_start_flusher", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(41943040)
+    .set_description("Start flushing (fsyncing) when this many bytes are written(xfs)"),
+
+    Option("filestore_wbthrottle_xfs_bytes_hard_limit", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(419430400)
+    .set_description("Block writes when this many bytes haven't been flushed (fsynced) (xfs)"),
+
+    Option("filestore_wbthrottle_xfs_ios_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .set_description("Start flushing (fsyncing) when this many IOs are written (xfs)"),
+
+    Option("filestore_wbthrottle_xfs_ios_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5000)
+    .set_description("Block writes when this many IOs haven't been flushed (fsynced) (xfs)"),
+
+    Option("filestore_wbthrottle_xfs_inodes_start_flusher", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(500)
+    .set_description("Start flushing (fsyncing) when this many distinct inodes have been modified (xfs)"),
+
+    Option("filestore_wbthrottle_btrfs_inodes_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5000)
+    .set_description("Block writing when this many inodes have outstanding writes (btrfs)"),
+
+    Option("filestore_wbthrottle_xfs_inodes_hard_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5000)
+    .set_description("Block writing when this many inodes have outstanding writes (xfs)"),
+
+    Option("filestore_odsync_write", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Write with O_DSYNC"),
+
+    Option("filestore_index_retry_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_debug_inject_read_err", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_debug_random_read_err", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_debug_omap_check", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_omap_header_cache_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(1024)
+    .set_description(""),
+
+    Option("filestore_max_inline_xattr_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_max_inline_xattr_size_xfs", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(65536)
+    .set_description(""),
+
+    Option("filestore_max_inline_xattr_size_btrfs", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(2048)
+    .set_description(""),
+
+    Option("filestore_max_inline_xattr_size_other", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(512)
+    .set_description(""),
+
+    Option("filestore_max_inline_xattrs", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_max_inline_xattrs_xfs", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(10)
+    .set_description(""),
+
+    Option("filestore_max_inline_xattrs_btrfs", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(10)
+    .set_description(""),
+
+    Option("filestore_max_inline_xattrs_other", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(2)
+    .set_description(""),
+
+    Option("filestore_max_xattr_value_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_max_xattr_value_size_xfs", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(64_K)
+    .set_description(""),
+
+    Option("filestore_max_xattr_value_size_btrfs", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(64_K)
+    .set_description(""),
+
+    Option("filestore_max_xattr_value_size_other", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(1_K)
+    .set_description(""),
+
+    Option("filestore_sloppy_crc", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_sloppy_crc_block_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(65536)
+    .set_description(""),
+
+    Option("filestore_max_alloc_hint_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(1ULL << 20)
+    .set_description(""),
+
+    Option("filestore_max_sync_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Period between calls to syncfs(2) and journal trims (seconds)"),
+
+    Option("filestore_min_sync_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.01)
+    .set_description("Minimum period between calls to syncfs(2)"),
+
+    Option("filestore_btrfs_snap", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description(""),
+
+    Option("filestore_btrfs_clone_range", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Use btrfs clone_range ioctl to efficiently duplicate objects"),
+
+    Option("filestore_zfs_snap", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_fsync_flushes_journal_data", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_fiemap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Use fiemap ioctl(2) to determine which parts of objects are sparse"),
+
+    Option("filestore_punch_hole", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Use fallocate(2) FALLOC_FL_PUNCH_HOLE to efficiently zero ranges of objects"),
+
+    Option("filestore_seek_data_hole", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Use lseek(2) SEEK_HOLE and SEEK_DATA to determine which parts of objects are sparse"),
+
+    Option("filestore_splice", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Use splice(2) to more efficiently copy data between files"),
+
+    Option("filestore_fadvise", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Use posix_fadvise(2) to pass hints to file system"),
+
+    Option("filestore_collect_device_partition_information", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Collect metadata about the backing file system on OSD startup"),
+
+    Option("filestore_xfs_extsize", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Use XFS extsize ioctl(2) to hint allocator about expected write sizes"),
+
+    Option("filestore_journal_parallel", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_journal_writeahead", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_journal_trailing", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_queue_max_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(50)
+    .set_description("Max IO operations in flight"),
+
+    Option("filestore_queue_max_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(100_M)
+    .set_description("Max (written) bytes in flight"),
+
+    Option("filestore_caller_concurrency", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(10)
+    .set_description(""),
+
+    Option("filestore_expected_throughput_bytes", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(200_M)
+    .set_description("Expected throughput of backend device (aids throttling calculations)"),
+
+    Option("filestore_expected_throughput_ops", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(200)
+    .set_description("Expected throughput of backend device in IOPS (aids throttling calculations)"),
+
+    Option("filestore_queue_max_delay_multiple", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_queue_high_delay_multiple", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_queue_max_delay_multiple_bytes", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_queue_high_delay_multiple_bytes", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_queue_max_delay_multiple_ops", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_queue_high_delay_multiple_ops", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_queue_low_threshhold", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.3)
+    .set_description(""),
+
+    Option("filestore_queue_high_threshhold", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.9)
+    .set_description(""),
+
+    Option("filestore_op_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description("Threads used to apply changes to backing file system"),
+
+    Option("filestore_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .set_description("Seconds before a worker thread is considered stalled"),
+
+    Option("filestore_op_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(180)
+    .set_description("Seconds before a worker thread is considered dead"),
+
+    Option("filestore_commit_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .set_description("Seconds before backing file system is considered hung"),
+
+    Option("filestore_fiemap_threshold", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(4_K)
+    .set_description(""),
+
+    Option("filestore_merge_threshold", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(-10)
+    .set_description(""),
+
+    Option("filestore_split_multiple", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(2)
+    .set_description(""),
+
+    Option("filestore_split_rand_factor", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(20)
+    .set_description(""),
+
+    Option("filestore_update_to", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(1000)
+    .set_description(""),
+
+    Option("filestore_blackhole", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("filestore_fd_cache_size", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(128)
+    .set_description(""),
+
+    Option("filestore_fd_cache_shards", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(16)
+    .set_description(""),
+
+    Option("filestore_ondisk_finisher_threads", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(1)
+    .set_description(""),
+
+    Option("filestore_apply_finisher_threads", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(1)
+    .set_description(""),
+
+    Option("filestore_dump_file", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description(""),
+
+    Option("filestore_kill_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_inject_stall", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("filestore_fail_eio", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description(""),
+
+    Option("filestore_debug_verify_split", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("journal_dio", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description(""),
+
+    Option("journal_aio", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description(""),
+
+    Option("journal_force_aio", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("journal_block_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(4_K)
+    .set_description(""),
+
+    Option("journal_block_align", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description(""),
+
+    Option("journal_write_header_frequency", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("journal_max_write_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(10_M)
+    .set_description("Max bytes in flight to journal"),
+
+    Option("journal_max_write_entries", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description("Max IOs in flight to journal"),
+
+    Option("journal_throttle_low_threshhold", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.6)
+    .set_description(""),
+
+    Option("journal_throttle_high_threshhold", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.9)
+    .set_description(""),
+
+    Option("journal_throttle_high_multiple", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("journal_throttle_max_multiple", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("journal_align_min_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(64_K)
+    .set_description(""),
+
+    Option("journal_replay_from", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+  Option("mgr_stats_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+  .set_default((int64_t)PerfCountersBuilder::PRIO_USEFUL)
+  .set_description("Lowest perfcounter priority collected by mgr")
+  .set_long_description("Daemons only set perf counter data to the manager "
+    "daemon if the counter has a priority higher than this.")
+  .set_min_max((int64_t)PerfCountersBuilder::PRIO_DEBUGONLY,
+               (int64_t)PerfCountersBuilder::PRIO_CRITICAL + 1),
+
+    Option("journal_zero_on_create", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("journal_ignore_corruption", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("journal_discard", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("fio_dir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/tmp/fio")
+    .set_description(""),
+
+    Option("rados_mon_op_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("rados_osd_op_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description(""),
+
+    Option("rados_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("nss_db_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("mgr_module_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(CEPH_DATADIR "/mgr")
+    .add_service("mgr")
+    .set_description("Filesystem path to manager modules."),
+
+    Option("mgr_initial_modules", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("restful iostat")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_flag(Option::FLAG_CLUSTER_CREATE)
+    .add_service("mon")
+    .set_description("List of manager modules to enable when the cluster is "
+                     "first started")
+    .set_long_description("This list of module names is read by the monitor "
+        "when the cluster is first started after installation, to populate "
+        "the list of enabled manager modules.  Subsequent updates are done using "
+        "the 'mgr module [enable|disable]' commands.  List may be comma "
+        "or space separated."),
+
+    Option("mgr_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/var/lib/ceph/mgr/$cluster-$id")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .add_service("mgr")
+    .set_description("Filesystem path to the ceph-mgr data directory, used to "
+                     "contain keyring."),
+
+    Option("mgr_tick_period", Option::TYPE_SECS, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .add_service("mgr")
+    .set_description("Period in seconds of beacon messages to monitor"),
+
+    Option("mgr_stats_period", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(5)
+    .add_service("mgr")
+    .set_description("Period in seconds of OSD/MDS stats reports to manager")
+    .set_long_description("Use this setting to control the granularity of "
+                          "time series data collection from daemons.  Adjust "
+                          "upwards if the manager CPU load is too high, or "
+                          "if you simply do not require the most up to date "
+                          "performance counter data."),
+
+    Option("mgr_client_bytes", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(128_M)
+    .add_service("mgr"),
+
+    Option("mgr_client_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(512)
+    .add_service("mgr"),
+
+    Option("mgr_osd_bytes", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(512_M)
+    .add_service("mgr"),
+
+    Option("mgr_osd_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(8192)
+    .add_service("mgr"),
+
+    Option("mgr_mds_bytes", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(128_M)
+    .add_service("mgr"),
+
+    Option("mgr_mds_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(128)
+    .add_service("mgr"),
+
+    Option("mgr_mon_bytes", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(128_M)
+    .add_service("mgr"),
+
+    Option("mgr_mon_messages", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(128)
+    .add_service("mgr"),
+
+    Option("mgr_connect_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1.0)
+    .add_service("common"),
+
+    Option("mgr_service_beacon_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(60.0)
+    .add_service("mgr")
+    .set_description("Period in seconds from last beacon to manager dropping "
+                     "state about a monitored service (RGW, rbd-mirror etc)"),
+
+    Option("mgr_client_service_daemon_unregister_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1.0)
+    .set_description("Time to wait during shutdown to deregister service with mgr"),
+
+    Option("mgr_debug_aggressive_pg_num_changes", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Bypass most throttling and safety checks in pg[p]_num controller")
+    .add_service("mgr"),
+
+    Option("mon_mgr_digest_period", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(5)
+    .add_service("mon")
+    .set_description("Period in seconds between monitor-to-manager "
+                     "health/status updates"),
+
+    Option("mon_mgr_beacon_grace", Option::TYPE_SECS, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .add_service("mon")
+    .set_description("Period in seconds from last beacon to monitor marking "
+                     "a manager daemon as failed"),
+
+    Option("mon_mgr_inactive_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .add_service("mon")
+    .set_description("Period in seconds after cluster creation during which "
+                     "cluster may have no active manager")
+    .set_long_description("This grace period enables the cluster to come "
+                          "up cleanly without raising spurious health check "
+                          "failures about managers that aren't online yet"),
+
+    Option("mon_mgr_mkfs_grace", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(120)
+    .add_service("mon")
+    .set_description("Period in seconds that the cluster may have no active "
+                     "manager before this is reported as an ERR rather than "
+                     "a WARN"),
+
+    Option("throttler_perf_counter", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description(""),
+
+    Option("event_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    Option("debug_deliberately_leak_memory", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("debug_asserts_on_shutdown", Option::TYPE_BOOL,Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Enable certain asserts to check for refcounting bugs on shutdown; see http://tracker.ceph.com/issues/21738"),
+
+    Option("debug_asok_assert_abort", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("allow commands 'assert' and 'abort' via asok for testing crash dumps etc"),
+
+    Option("target_max_misplaced_ratio", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(.05)
+    .set_description("Max ratio of misplaced objects to target when throttling data rebalancing activity"),
+
+    Option("device_failure_prediction_mode", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("none")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_enum_allowed({"none", "local", "cloud"})
+    .set_description("Method used to predict device failures")
+    .set_long_description("To disable prediction, use 'none',  'local' uses a prediction model that runs inside the mgr daemon.  'cloud' will share metrics with a cloud service and query the service for device life expectancy."),
+
+    /*  KRB Authentication. */
+    Option("gss_ktab_client_file", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/var/lib/ceph/$name/gss_client_$name.ktab")
+    .set_description("GSS/KRB5 Keytab file for client authentication")
+    .add_service({"mon", "osd"})
+    .set_long_description("This sets the full path for the GSS/Kerberos client keytab file location."),
+
+    Option("gss_target_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("ceph")
+    .set_description("")
+    .add_service({"mon", "osd"})
+    .set_long_description("This sets the gss target service name."),
+
+    Option("debug_disable_randomized_ping", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Disable heartbeat ping randomization for testing purposes"),
+
+    Option("debug_heartbeat_testing_span", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Override 60 second periods for testing only"),
+  });
+}
+
+std::vector<Option> get_rgw_options() {
+  return std::vector<Option>({
+    Option("rgw_acl_grants_max_num", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description("Max number of ACL grants in a single request"),
+
+    Option("rgw_cors_rules_max_num", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description("Max number of cors rules in a single request"),
+
+    Option("rgw_delete_multi_obj_max_num", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("Max number of objects in a single multi-object delete request"),
+
+    Option("rgw_website_routing_rules_max_num", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(50)
+    .set_description("Max number of website routing rules in a single request"),
+
+    Option("rgw_rados_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("true if LTTng-UST tracepoints should be enabled"),
+
+    Option("rgw_op_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("true if LTTng-UST tracepoints should be enabled"),
+
+    Option("rgw_max_chunk_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4_M)
+    .set_description("Set RGW max chunk size")
+    .set_long_description(
+        "The chunk size is the size of RADOS I/O requests that RGW sends when accessing "
+        "data objects. RGW read and write operation will never request more than this amount "
+        "in a single request. This also defines the rgw object head size, as head operations "
+        "need to be atomic, and anything larger than this would require more than a single "
+        "operation."),
+
+    Option("rgw_put_obj_min_window_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(16_M)
+    .set_description("The minimum RADOS write window size (in bytes).")
+    .set_long_description(
+        "The window size determines the total concurrent RADOS writes of a single rgw object. "
+        "When writing an object RGW will send multiple chunks to RADOS. The total size of the "
+        "writes does not exceed the window size. The window size can be automatically adjusted "
+        "in order to better utilize the pipe.")
+    .add_see_also({"rgw_put_obj_max_window_size", "rgw_max_chunk_size"}),
+
+    Option("rgw_put_obj_max_window_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_M)
+    .set_description("The maximum RADOS write window size (in bytes).")
+    .set_long_description("The window size may be dynamically adjusted, but will not surpass this value.")
+    .add_see_also({"rgw_put_obj_min_window_size", "rgw_max_chunk_size"}),
+
+    Option("rgw_max_put_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(5_G)
+    .set_description("Max size (in bytes) of regular (non multi-part) object upload.")
+    .set_long_description(
+        "Plain object upload is capped at this amount of data. In order to upload larger "
+        "objects, a special upload mechanism is required. The S3 API provides the "
+        "multi-part upload, and Swift provides DLO and SLO."),
+
+    Option("rgw_max_put_param_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .set_description("The maximum size (in bytes) of data input of certain RESTful requests."),
+
+    Option("rgw_max_attr_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("The maximum length of metadata value. 0 skips the check"),
+
+    Option("rgw_max_attr_name_len", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("The maximum length of metadata name. 0 skips the check"),
+
+    Option("rgw_max_attrs_num_in_req", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("The maximum number of metadata items that can be put via single request"),
+
+    Option("rgw_override_bucket_index_max_shards", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("rgw_bucket_index_max_aio", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(8)
+    .set_description("Max number of concurrent RADOS requests when handling bucket shards."),
+
+    Option("rgw_enable_quota_threads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enables the quota maintenance thread.")
+    .set_long_description(
+        "The quota maintenance thread is responsible for quota related maintenance work. "
+        "The thread itself can be disabled, but in order for quota to work correctly, at "
+        "least one RGW in each zone needs to have this thread running. Having the thread "
+        "enabled on multiple RGW processes within the same zone can spread "
+        "some of the maintenance work between them.")
+    .add_see_also({"rgw_enable_gc_threads", "rgw_enable_lc_threads"}),
+
+    Option("rgw_enable_gc_threads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enables the garbage collection maintenance thread.")
+    .set_long_description(
+        "The garbage collection maintenance thread is responsible for garbage collector "
+        "maintenance work. The thread itself can be disabled, but in order for garbage "
+        "collection to work correctly, at least one RGW in each zone needs to have this "
+        "thread running.  Having the thread enabled on multiple RGW processes within the "
+        "same zone can spread some of the maintenance work between them.")
+    .add_see_also({"rgw_enable_quota_threads", "rgw_enable_lc_threads"}),
+
+    Option("rgw_enable_lc_threads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enables the lifecycle maintenance thread. This is required on at least one rgw for each zone.")
+    .set_long_description(
+        "The lifecycle maintenance thread is responsible for lifecycle related maintenance "
+        "work. The thread itself can be disabled, but in order for lifecycle to work "
+        "correctly, at least one RGW in each zone needs to have this thread running. Having "
+        "the thread enabled on multiple RGW processes within the same zone can spread "
+        "some of the maintenance work between them.")
+    .add_see_also({"rgw_enable_gc_threads", "rgw_enable_quota_threads"}),
+
+    Option("rgw_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/var/lib/ceph/radosgw/$cluster-$id")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_description("Alternative location for RGW configuration.")
+    .set_long_description(
+        "If this is set, the different Ceph system configurables (such as the keyring file) "
+        "will be located in the path that is specified here. "),
+
+    Option("rgw_enable_apis", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("s3, s3website, swift, swift_auth, admin, sts, iam, pubsub")
+    .set_description("A list of set of RESTful APIs that rgw handles."),
+
+    Option("rgw_cache_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enable RGW metadata cache.")
+    .set_long_description(
+        "The metadata cache holds metadata entries that RGW requires for processing "
+        "requests. Metadata entries can be user info, bucket info, and bucket instance "
+        "info. If not found in the cache, entries will be fetched from the backing "
+        "RADOS store.")
+    .add_see_also("rgw_cache_lru_size"),
+
+    Option("rgw_cache_lru_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("Max number of items in RGW metadata cache.")
+    .set_long_description(
+        "When full, the RGW metadata cache evicts least recently used entries.")
+    .add_see_also("rgw_cache_enabled"),
+
+    Option("rgw_socket_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("RGW FastCGI socket path (for FastCGI over Unix domain sockets).")
+    .add_see_also("rgw_fcgi_socket_backlog"),
+
+    Option("rgw_host", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("RGW FastCGI host name (for FastCGI over TCP)")
+    .add_see_also({"rgw_port", "rgw_fcgi_socket_backlog"}),
+
+    Option("rgw_port", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("")
+    .set_description("RGW FastCGI port number (for FastCGI over TCP)")
+    .add_see_also({"rgw_host", "rgw_fcgi_socket_backlog"}),
+
+    Option("rgw_dns_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("The host name that RGW uses.")
+    .set_long_description(
+        "This is needed for virtual hosting of buckets to work properly, unless configured "
+        "via zonegroup configuration."),
+
+    Option("rgw_dns_s3website_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("The host name that RGW uses for static websites (S3)")
+    .set_long_description(
+        "This is needed for virtual hosting of buckets, unless configured via zonegroup "
+        "configuration."),
+
+    Option("rgw_service_provider_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Service provider name which is contained in http response headers")
+    .set_long_description(
+        "As S3 or other cloud storage providers do, http response headers should contain the name of the provider. "
+        "This name will be placed in http header 'Server'."),
+
+    Option("rgw_content_length_compat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Multiple content length headers compatibility")
+    .set_long_description(
+        "Try to handle requests with ambiguous multiple content length headers "
+        "(Content-Length, Http-Content-Length)."),
+
+    Option("rgw_relaxed_region_enforcement", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Disable region constraint enforcement")
+    .set_long_description(
+        "Enable requests such as bucket creation to succeed irrespective of region restrictions (Jewel compat)."),
+
+    Option("rgw_lifecycle_work_time", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("00:00-06:00")
+    .set_description("Lifecycle allowed work time")
+    .set_long_description("Local time window in which the lifecycle maintenance thread can work."),
+
+    Option("rgw_lc_lock_max_time", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(60)
+    .set_description(""),
+
+    Option("rgw_lc_thread_delay", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("Delay after processing of bucket listing chunks (i.e., per 1000 entries) in milliseconds"),
+
+    Option("rgw_lc_max_objs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description("Number of lifecycle data shards")
+    .set_long_description(
+          "Number of RADOS objects to use for storing lifecycle index. This can affect "
+          "concurrency of lifecycle maintenance, but requires multiple RGW processes "
+          "running on the zone to be utilized."),
+
+    Option("rgw_lc_max_rules", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("Max number of lifecycle rules set on one bucket")
+    .set_long_description("Number of lifecycle rules set on one bucket should be limited."),
+
+    Option("rgw_lc_debug_interval", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(-1)
+    .set_description(""),
+
+    Option("rgw_mp_lock_max_time", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .set_description("Multipart upload max completion time")
+    .set_long_description(
+        "Time length to allow completion of a multipart upload operation. This is done "
+        "to prevent concurrent completions on the same object with the same upload id."),
+
+    Option("rgw_script_uri", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description(""),
+
+    Option("rgw_request_uri", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description(""),
+
+    Option("rgw_ignore_get_invalid_range", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Treat invalid (e.g., negative) range request as full")
+    .set_long_description("Treat invalid (e.g., negative) range request "
+			  "as request for the full object (AWS compatibility)"),
+
+    Option("rgw_swift_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Swift-auth storage URL")
+    .set_long_description(
+        "Used in conjunction with rgw internal swift authentication. This affects the "
+        "X-Storage-Url response header value.")
+    .add_see_also("rgw_swift_auth_entry"),
+
+    Option("rgw_swift_url_prefix", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("swift")
+    .set_description("Swift URL prefix")
+    .set_long_description("The URL path prefix for swift requests."),
+
+    Option("rgw_swift_auth_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Swift auth URL")
+    .set_long_description(
+        "Default url to which RGW connects and verifies tokens for v1 auth (if not using "
+        "internal swift auth)."),
+
+    Option("rgw_swift_auth_entry", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("auth")
+    .set_description("Swift auth URL prefix")
+    .set_long_description("URL path prefix for internal swift auth requests.")
+    .add_see_also("rgw_swift_url"),
+
+    Option("rgw_swift_tenant_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Swift tenant name")
+    .set_long_description("Tenant name that is used when constructing the swift path.")
+    .add_see_also("rgw_swift_account_in_url"),
+
+    Option("rgw_swift_account_in_url", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Swift account encoded in URL")
+    .set_long_description("Whether the swift account is encoded in the uri path (AUTH_<account>).")
+    .add_see_also("rgw_swift_tenant_name"),
+
+    Option("rgw_swift_enforce_content_length", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Send content length when listing containers (Swift)")
+    .set_long_description(
+        "Whether content length header is needed when listing containers. When this is "
+        "set to false, RGW will send extra info for each entry in the response."),
+
+    Option("rgw_keystone_url", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("")
+    .set_description("The URL to the Keystone server."),
+
+    Option("rgw_keystone_admin_token", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("DEPRECATED: The admin token (shared secret) that is used for the Keystone requests."),
+
+    Option("rgw_keystone_admin_token_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Path to a file containing the admin token (shared secret) that is used for the Keystone requests."),
+
+    Option("rgw_keystone_admin_user", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone admin user."),
+
+    Option("rgw_keystone_admin_password", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("DEPRECATED: Keystone admin password."),
+
+    Option("rgw_keystone_admin_password_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Path to a file containing the Keystone admin password."),
+
+    Option("rgw_keystone_admin_tenant", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone admin user tenant."),
+
+    Option("rgw_keystone_admin_project", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone admin user project (for Keystone v3)."),
+
+    Option("rgw_keystone_admin_domain", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone admin user domain (for Keystone v3)."),
+
+    Option("rgw_keystone_barbican_user", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone user to access barbican secrets."),
+
+    Option("rgw_keystone_barbican_password", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone password for barbican user."),
+
+    Option("rgw_keystone_barbican_tenant", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone barbican user tenant (Keystone v2.0)."),
+
+    Option("rgw_keystone_barbican_project", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone barbican user project (Keystone v3)."),
+
+    Option("rgw_keystone_barbican_domain", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Keystone barbican user domain."),
+
+    Option("rgw_keystone_api_version", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description("Version of Keystone API to use (2 or 3)."),
+
+    Option("rgw_keystone_accepted_roles", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("Member, admin")
+    .set_description("Only users with one of these roles will be served when doing Keystone authentication."),
+
+    Option("rgw_keystone_accepted_admin_roles", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("List of roles allowing user to gain admin privileges (Keystone)."),
+
+    Option("rgw_keystone_token_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("Keystone token cache size")
+    .set_long_description(
+        "Max number of Keystone tokens that will be cached. Token that is not cached "
+        "requires RGW to access the Keystone server when authenticating."),
+
+    Option("rgw_keystone_revocation_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(15_min)
+    .set_description("Keystone cache revocation interval")
+    .set_long_description(
+        "Time (in seconds) that RGW waits between requests to Keystone for getting a list "
+        "of revoked tokens. A revoked token might still be considered valid by RGW for "
+        "this amount of time."),
+
+    Option("rgw_keystone_verify_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Should RGW verify the Keystone server SSL certificate."),
+
+    Option("rgw_keystone_implicit_tenants", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("false")
+    .set_enum_allowed( { "false", "true", "swift", "s3", "both", "0", "1", "none" } )
+    .set_description("RGW Keystone implicit tenants creation")
+    .set_long_description(
+        "Implicitly create new users in their own tenant with the same name when "
+        "authenticating via Keystone.  Can be limited to s3 or swift only."),
+
+    Option("rgw_cross_domain_policy", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("<allow-access-from domain=\"*\" secure=\"false\" />")
+    .set_description("RGW handle cross domain policy")
+    .set_long_description("Returned cross domain policy when accessing the crossdomain.xml "
+                          "resource (Swift compatibility)."),
+
+    Option("rgw_healthcheck_disabling_path", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description("Swift health check api can be disabled if a file can be accessed in this path."),
+
+    Option("rgw_s3_auth_use_rados", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Should S3 authentication use credentials stored in RADOS backend."),
+
+    Option("rgw_s3_auth_use_keystone", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Should S3 authentication use Keystone."),
+
+    Option("rgw_s3_auth_order", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+     .set_default("sts, external, local")
+     .set_description("Authentication strategy order to use for s3 authentication")
+     .set_long_description(
+	  "Order of authentication strategies to try for s3 authentication, the allowed "
+	   "options are a comma separated list of engines external, local. The "
+	   "default order is to try all the externally configured engines before "
+	   "attempting local rados based authentication"),
+
+    Option("rgw_barbican_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("URL to barbican server."),
+
+    Option("rgw_ldap_uri", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("ldaps://<ldap.your.domain>")
+    .set_description("Space-separated list of LDAP servers in URI format."),
+
+    Option("rgw_ldap_binddn", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("uid=admin,cn=users,dc=example,dc=com")
+    .set_description("LDAP entry RGW will bind with (user match)."),
+
+    Option("rgw_ldap_searchdn", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("cn=users,cn=accounts,dc=example,dc=com")
+    .set_description("LDAP search base (basedn)."),
+
+    Option("rgw_ldap_dnattr", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("uid")
+    .set_description("LDAP attribute containing RGW user names (to form binddns)."),
+
+    Option("rgw_ldap_secret", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/etc/openldap/secret")
+    .set_description("Path to file containing credentials for rgw_ldap_binddn."),
+
+    Option("rgw_s3_auth_use_ldap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Should S3 authentication use LDAP."),
+
+    Option("rgw_ldap_searchfilter", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("LDAP search filter."),
+
+    Option("rgw_opa_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("URL to OPA server."),
+
+    Option("rgw_opa_token", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("The Bearer token OPA uses to authenticate client requests."),
+
+    Option("rgw_opa_verify_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Should RGW verify the OPA server SSL certificate."),
+
+    Option("rgw_use_opa_authz", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Should OPA be used to authorize client requests."),
+
+    Option("rgw_admin_entry", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("admin")
+    .set_description("Path prefix to be used for accessing RGW RESTful admin API."),
+
+    Option("rgw_enforce_swift_acls", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("RGW enforce swift acls")
+    .set_long_description(
+        "Should RGW enforce special Swift-only ACLs. Swift has a special ACL that gives "
+        "permission to access all objects in a container."),
+
+    Option("rgw_swift_token_expiration", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1_day)
+    .set_description("Expiration time (in seconds) for token generated through RGW Swift auth."),
+
+    Option("rgw_print_continue", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("RGW support of 100-continue")
+    .set_long_description(
+        "Should RGW explicitly send 100 (continue) responses. This is mainly relevant when "
+        "using FastCGI, as some FastCGI modules do not fully support this feature."),
+
+    Option("rgw_print_prohibited_content_length", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("RGW RFC-7230 compatibility")
+    .set_long_description(
+        "Specifies whether RGW violates RFC 7230 and sends Content-Length with 204 or 304 "
+        "statuses."),
+
+    Option("rgw_remote_addr_param", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("REMOTE_ADDR")
+    .set_description("HTTP header that holds the remote address in incoming requests.")
+    .set_long_description(
+        "RGW will use this header to extract requests origin. When RGW runs behind "
+        "a reverse proxy, the remote address header will point at the proxy's address "
+        "and not at the originator's address. Therefore it is sometimes possible to "
+        "have the proxy add the originator's address in a separate HTTP header, which "
+        "will allow RGW to log it correctly."
+        )
+    .add_see_also("rgw_enable_ops_log"),
+
+    Option("rgw_op_thread_timeout", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(10*60)
+    .set_description("Timeout for async rados coroutine operations."),
+
+    Option("rgw_op_thread_suicide_timeout", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("rgw_thread_pool_size", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(512)
+    .set_description("RGW requests handling thread pool size.")
+    .set_long_description(
+        "This parameter determines the number of concurrent requests RGW can process "
+        "when using either the civetweb, or the fastcgi frontends. The higher this "
+        "number is, RGW will be able to deal with more concurrent requests at the "
+        "cost of more resource utilization."),
+
+    Option("rgw_num_control_oids", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(8)
+    .set_description("Number of control objects used for cross-RGW communication.")
+    .set_long_description(
+        "RGW uses certain control objects to send messages between different RGW "
+        "processes running on the same zone. These messages include metadata cache "
+        "invalidation info that is being sent when metadata is modified (such as "
+        "user or bucket information). A higher number of control objects allows "
+        "better concurrency of these messages, at the cost of more resource "
+        "utilization."),
+
+    Option("rgw_num_rados_handles", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("Number of librados handles that RGW uses.")
+    .set_long_description(
+        "This param affects the number of separate librados handles it uses to "
+        "connect to the RADOS backend, which directly affects the number of connections "
+        "RGW will have to each OSD. A higher number affects resource utilization."),
+
+    Option("rgw_verify_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Should RGW verify SSL when connecting to a remote HTTP server")
+    .set_long_description(
+        "RGW can send requests to other RGW servers (e.g., in multi-site sync work). "
+        "This configurable selects whether RGW should verify the certificate for "
+        "the remote peer and host.")
+    .add_see_also("rgw_keystone_verify_ssl"),
+
+    Option("rgw_nfs_lru_lanes", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description(""),
+
+    Option("rgw_nfs_lru_lane_hiwat", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(911)
+    .set_description(""),
+
+    Option("rgw_nfs_fhcache_partitions", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_description(""),
+
+    Option("rgw_nfs_fhcache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2017)
+    .set_description(""),
+
+    Option("rgw_nfs_namespace_expire_secs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(300)
+    .set_min(1)
+    .set_description(""),
+
+    Option("rgw_nfs_max_gc", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(300)
+    .set_min(1)
+    .set_description(""),
+
+    Option("rgw_nfs_write_completion_interval_s", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description(""),
+
+    Option("rgw_nfs_s3_fast_attrs", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("use fast S3 attrs from bucket index (immutable only)")
+    .set_long_description("use fast S3 attrs from bucket index (assumes NFS "
+			  "mounts are immutable)"),
+
+    Option("rgw_rados_pool_autoscale_bias", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(4.0)
+    .set_min_max(0.01, 100000.0)
+    .set_description("pg_autoscale_bias value for RGW metadata (omap-heavy) pools"),
+
+    Option("rgw_rados_pool_pg_num_min", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(8)
+    .set_min_max(1, 1024)
+    .set_description("pg_num_min value for RGW metadata (omap-heavy) pools"),
+
+    Option("rgw_zone", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Zone name")
+    .add_see_also({"rgw_zonegroup", "rgw_realm"}),
+
+    Option("rgw_zone_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(".rgw.root")
+    .set_description("Zone root pool name")
+    .set_long_description(
+        "The zone root pool is the pool where the RGW zone configuration is located."
+    )
+    .add_see_also({"rgw_zonegroup_root_pool", "rgw_realm_root_pool", "rgw_period_root_pool"}),
+
+    Option("rgw_default_zone_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("default.zone")
+    .set_description("Default zone info object id")
+    .set_long_description(
+        "Name of the RADOS object that holds the default zone information."
+    ),
+
+    Option("rgw_region", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Region name")
+    .set_long_description(
+        "Obsolete config option. The rgw_zonegroup option should be used instead.")
+    .add_see_also("rgw_zonegroup"),
+
+    Option("rgw_region_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(".rgw.root")
+    .set_description("Region root pool")
+    .set_long_description(
+        "Obsolete config option. The rgw_zonegroup_root_pool should be used instead.")
+    .add_see_also("rgw_zonegroup_root_pool"),
+
+    Option("rgw_default_region_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("default.region")
+    .set_description("Default region info object id")
+    .set_long_description(
+        "Obsolete config option. The rgw_default_zonegroup_info_oid should be used instead.")
+    .add_see_also("rgw_default_zonegroup_info_oid"),
+
+    Option("rgw_zonegroup", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Zonegroup name")
+    .add_see_also({"rgw_zone", "rgw_realm"}),
+
+    Option("rgw_zonegroup_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(".rgw.root")
+    .set_description("Zonegroup root pool")
+    .set_long_description(
+        "The zonegroup root pool is the pool where the RGW zonegroup configuration is located."
+    )
+    .add_see_also({"rgw_zone_root_pool", "rgw_realm_root_pool", "rgw_period_root_pool"}),
+
+    Option("rgw_default_zonegroup_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("default.zonegroup")
+    .set_description(""),
+
+    Option("rgw_realm", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description(""),
+
+    Option("rgw_realm_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(".rgw.root")
+    .set_description("Realm root pool")
+    .set_long_description(
+        "The realm root pool is the pool where the RGW realm configuration is located."
+    )
+    .add_see_also({"rgw_zonegroup_root_pool", "rgw_zone_root_pool", "rgw_period_root_pool"}),
+
+    Option("rgw_default_realm_info_oid", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("default.realm")
+    .set_description(""),
+
+    Option("rgw_period_root_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(".rgw.root")
+    .set_description("Period root pool")
+    .set_long_description(
+        "The period root pool is the pool where the RGW period configuration is located."
+    )
+    .add_see_also({"rgw_zonegroup_root_pool", "rgw_zone_root_pool", "rgw_realm_root_pool"}),
+
+    Option("rgw_period_latest_epoch_info_oid", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default(".latest_epoch")
+    .set_description(""),
+
+    Option("rgw_log_nonexistent_bucket", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Should RGW log operations on bucket that does not exist")
+    .set_long_description(
+        "This config option applies to the ops log. When this option is set, the ops log "
+        "will log operations that are sent to non existing buckets. These operations "
+        "inherently fail, and do not correspond to a specific user.")
+    .add_see_also("rgw_enable_ops_log"),
+
+    Option("rgw_log_object_name", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("%Y-%m-%d-%H-%i-%n")
+    .set_description("Ops log object name format")
+    .set_long_description(
+        "Defines the format of the RADOS objects names that ops log uses to store ops "
+        "log data")
+    .add_see_also("rgw_enable_ops_log"),
+
+    Option("rgw_log_object_name_utc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Should ops log object name based on UTC")
+    .set_long_description(
+        "If set, the names of the RADOS objects that hold the ops log data will be based "
+        "on UTC time zone. If not set, it will use the local time zone.")
+    .add_see_also({"rgw_enable_ops_log", "rgw_log_object_name"}),
+
+    Option("rgw_usage_max_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description("Number of shards for usage log.")
+    .set_long_description(
+        "The number of RADOS objects that RGW will use in order to store the usage log "
+        "data.")
+    .add_see_also("rgw_enable_usage_log"),
+
+    Option("rgw_usage_max_user_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_min(1)
+    .set_description("Number of shards for single user in usage log")
+    .set_long_description(
+        "The number of shards that a single user will span over in the usage log.")
+    .add_see_also("rgw_enable_usage_log"),
+
+    Option("rgw_enable_ops_log", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Enable ops log")
+    .add_see_also({"rgw_log_nonexistent_bucket", "rgw_log_object_name", "rgw_ops_log_rados",
+               "rgw_ops_log_socket_path"}),
+
+    Option("rgw_enable_usage_log", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Enable usage log")
+    .add_see_also("rgw_usage_max_shards"),
+
+    Option("rgw_ops_log_rados", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Use RADOS for ops log")
+    .set_long_description(
+       "If set, RGW will store ops log information in RADOS.")
+    .add_see_also({"rgw_enable_ops_log"}),
+
+    Option("rgw_ops_log_socket_path", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Unix domain socket path for ops log.")
+    .set_long_description(
+        "Path to unix domain socket that RGW will listen for connection on. When connected, "
+        "RGW will send ops log data through it.")
+    .add_see_also({"rgw_enable_ops_log", "rgw_ops_log_data_backlog"}),
+
+    Option("rgw_ops_log_data_backlog", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(5 << 20)
+    .set_description("Ops log socket backlog")
+    .set_long_description(
+        "Maximum amount of data backlog that RGW can keep when ops log is configured to "
+        "send info through unix domain socket. When data backlog is higher than this, "
+        "ops log entries will be lost. In order to avoid ops log information loss, the "
+        "listener needs to clear data (by reading it) quickly enough.")
+    .add_see_also({"rgw_enable_ops_log", "rgw_ops_log_socket_path"}),
+
+    Option("rgw_fcgi_socket_backlog", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("FastCGI socket connection backlog")
+    .set_long_description(
+        "Size of FastCGI connection backlog. This reflects the maximum number of new "
+        "connection requests that RGW can handle concurrently without dropping any. ")
+    .add_see_also({"rgw_host", "rgw_socket_path"}),
+
+    Option("rgw_usage_log_flush_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("Number of entries in usage log before flushing")
+    .set_long_description(
+        "This is the max number of entries that will be held in the usage log, before it "
+        "will be flushed to the backend. Note that the usage log is periodically flushed, "
+        "even if number of entries does not reach this threshold. A usage log entry "
+        "corresponds to one or more operations on a single bucket.")
+    .add_see_also({"rgw_enable_usage_log", "rgw_usage_log_tick_interval"}),
+
+    Option("rgw_usage_log_tick_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("Number of seconds between usage log flush cycles")
+    .set_long_description(
+        "The number of seconds between consecutive usage log flushes. The usage log will "
+        "also flush itself to the backend if the number of pending entries reaches a "
+        "certain threshold.")
+    .add_see_also({"rgw_enable_usage_log", "rgw_usage_log_flush_threshold"}),
+
+    Option("rgw_init_timeout", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(300)
+    .set_description("Initialization timeout")
+    .set_long_description(
+        "The time length (in seconds) that RGW will allow for its initialization. RGW "
+        "process will give up and quit if initialization is not complete after this amount "
+        "of time."),
+
+    Option("rgw_mime_types_file", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("/etc/mime.types")
+    .set_description("Path to local mime types file")
+    .set_long_description(
+        "The mime types file is needed in Swift when uploading an object. If object's "
+        "content type is not specified, RGW will use data from this file to assign "
+        "a content type to the object."),
+
+    Option("rgw_gc_max_objs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description("Number of shards for garbage collector data")
+    .set_long_description(
+        "The number of garbage collector data shards, is the number of RADOS objects that "
+        "RGW will use to store the garbage collection information on.")
+    .add_see_also({"rgw_gc_obj_min_wait", "rgw_gc_processor_max_time", "rgw_gc_processor_period", "rgw_gc_max_concurrent_io"}),
+
+    Option("rgw_gc_obj_min_wait", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(2_hr)
+    .set_description("Garbage collection object expiration time")
+    .set_long_description(
+       "The length of time (in seconds) that the RGW collector will wait before purging "
+       "a deleted object's data. RGW will not remove object immediately, as object could "
+       "still have readers. A mechanism exists to increase the object's expiration time "
+       "when it's being read.")
+    .add_see_also({"rgw_gc_max_objs", "rgw_gc_processor_max_time", "rgw_gc_processor_period", "rgw_gc_max_concurrent_io"}),
+
+    Option("rgw_gc_processor_max_time", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1_hr)
+    .set_description("Length of time GC processor can lease shard")
+    .set_long_description(
+        "Garbage collection thread in RGW process holds a lease on its data shards. These "
+        "objects contain the information about the objects that need to be removed. RGW "
+        "takes a lease in order to prevent multiple RGW processes from handling the same "
+        "objects concurrently. This time signifies the maximum amount of time (in seconds) that RGW "
+        "is allowed to hold that lease. In the case where RGW goes down uncleanly, this "
+        "is the amount of time where processing of that data shard will be blocked.")
+    .add_see_also({"rgw_gc_max_objs", "rgw_gc_obj_min_wait", "rgw_gc_processor_period", "rgw_gc_max_concurrent_io"}),
+
+    Option("rgw_gc_processor_period", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1_hr)
+    .set_description("Garbage collector cycle run time")
+    .set_long_description(
+        "The amount of time between the start of consecutive runs of the garbage collector "
+        "threads. If garbage collector runs takes more than this period, it will not wait "
+        "before running again.")
+    .add_see_also({"rgw_gc_max_objs", "rgw_gc_obj_min_wait", "rgw_gc_processor_max_time", "rgw_gc_max_concurrent_io", "rgw_gc_max_trim_chunk"}),
+
+    Option("rgw_gc_max_concurrent_io", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("Max concurrent RADOS IO operations for garbage collection")
+    .set_long_description(
+        "The maximum number of concurrent IO operations that the RGW garbage collection "
+        "thread will use when purging old data.")
+    .add_see_also({"rgw_gc_max_objs", "rgw_gc_obj_min_wait", "rgw_gc_processor_max_time", "rgw_gc_max_trim_chunk"}),
+
+    Option("rgw_gc_max_trim_chunk", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(16)
+    .set_description("Max number of keys to remove from garbage collector log in a single operation")
+    .add_see_also({"rgw_gc_max_objs", "rgw_gc_obj_min_wait", "rgw_gc_processor_max_time", "rgw_gc_max_concurrent_io"}),
+
+    Option("rgw_s3_success_create_obj_status", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("HTTP return code override for object creation")
+    .set_long_description(
+        "If not zero, this is the HTTP return code that will be returned on a successful S3 "
+        "object creation."),
+
+    Option("rgw_resolve_cname", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Support vanity domain names via CNAME")
+    .set_long_description(
+        "If true, RGW will query DNS when detecting that it's serving a request that was "
+        "sent to a host in another domain. If a CNAME record is configured for that domain "
+        "it will use it instead. This gives users the ability to create a unique "
+        "domain of their own to point at data in their bucket."),
+
+    Option("rgw_obj_stripe_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4_M)
+    .set_description("RGW object stripe size")
+    .set_long_description(
+        "The size of an object stripe for RGW objects. This is the maximum size a backing "
+        "RADOS object will have. RGW objects that are larger than this will span over "
+        "multiple objects."),
+
+    Option("rgw_extended_http_attrs", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("RGW support extended HTTP attrs")
+    .set_long_description(
+        "Add new set of attributes that could be set on an object. These extra attributes "
+        "can be set through HTTP header fields when putting the objects. If set, these "
+        "attributes will return as HTTP fields when doing GET/HEAD on the object."),
+
+    Option("rgw_exit_timeout_secs", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(120)
+    .set_description("RGW shutdown timeout")
+    .set_long_description("Number of seconds to wait for a process before exiting unconditionally."),
+
+    Option("rgw_get_obj_window_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(16_M)
+    .set_description("RGW object read window size")
+    .set_long_description("The window size in bytes for a single object read request"),
+
+    Option("rgw_get_obj_max_req_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4_M)
+    .set_description("RGW object read chunk size")
+    .set_long_description(
+        "The maximum request size of a single object read operation sent to RADOS"),
+
+    Option("rgw_relaxed_s3_bucket_names", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("RGW enable relaxed S3 bucket names")
+    .set_long_description("RGW enable relaxed S3 bucket name rules for US region buckets."),
+
+    Option("rgw_defer_to_bucket_acls", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Bucket ACLs override object ACLs")
+    .set_long_description(
+        "If not empty, a string that selects the mode of operation. 'recurse' will use "
+        "bucket's ACL for the authorization. 'full-control' will allow users "
+        "that have full control permission on the bucket to have access to the object."),
+
+    Option("rgw_list_buckets_max_chunk", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("Max number of buckets to retrieve in a single listing operation")
+    .set_long_description(
+        "When RGW fetches lists of user's buckets from the backend, this is the max number "
+        "of entries it will try to retrieve in a single operation. Note that the backend "
+        "may choose to return a smaller number of entries."),
+
+    Option("rgw_md_log_max_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(64)
+    .set_description("RGW number of metadata log shards")
+    .set_long_description(
+        "The number of shards the RGW metadata log entries will reside in. This affects "
+        "the metadata sync parallelism as a shard can only be processed by a single "
+        "RGW at a time"),
+
+    Option("rgw_curl_wait_timeout_ms", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(1000)
+    .set_description(""),
+
+    Option("rgw_curl_low_speed_limit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_long_description(
+        "It contains the average transfer speed in bytes per second that the "
+        "transfer should be below during rgw_curl_low_speed_time seconds for libcurl "
+        "to consider it to be too slow and abort. Set it to zero to disable this."),
+
+    Option("rgw_curl_low_speed_time", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(300)
+    .set_long_description(
+        "It contains the time in number of seconds that the transfer speed should be below "
+        "the rgw_curl_low_speed_limit for the library to consider it too slow and abort. "
+        "Set it to zero to disable this."),
+
+    Option("rgw_copy_obj_progress", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Send progress report through copy operation")
+    .set_long_description(
+        "If true, RGW will send progress information when copy operation is executed. "),
+
+    Option("rgw_copy_obj_progress_every_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .set_description("Send copy-object progress info after these many bytes"),
+
+    Option("rgw_sync_obj_etag_verify", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Verify if the object copied from remote is identical to its source")
+    .set_long_description(
+        "If true, this option computes the MD5 checksum of the data which is written at the "
+	"destination and checks if it is identical to the ETAG stored in the source. "
+        "It ensures integrity of the objects fetched from a remote server over HTTP including "
+        "multisite sync."),
+
+    Option("rgw_obj_tombstone_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("Max number of entries to keep in tombstone cache")
+    .set_long_description(
+        "The tombstone cache is used when doing a multi-zone data sync. RGW keeps "
+        "there information about removed objects which is needed in order to prevent "
+        "re-syncing of objects that were already removed."),
+
+    Option("rgw_data_log_window", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("Data log time window")
+    .set_long_description(
+        "The data log keeps information about buckets that have objects that were "
+        "modified within a specific timeframe. The sync process then knows which buckets "
+        "are needed to be scanned for data sync."),
+
+    Option("rgw_data_log_changes_size", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(1000)
+    .set_description("Max size of pending changes in data log")
+    .set_long_description(
+        "RGW will trigger update to the data log if the number of pending entries reached "
+        "this number."),
+
+    Option("rgw_data_log_num_shards", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(128)
+    .set_description("Number of data log shards")
+    .set_long_description(
+        "The number of shards the RGW data log entries will reside in. This affects the "
+        "data sync parallelism as a shard can only be processed by a single RGW at a time."),
+
+    Option("rgw_data_log_obj_prefix", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("data_log")
+    .set_description(""),
+
+    Option("rgw_bucket_quota_ttl", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .set_description("Bucket quota stats cache TTL")
+    .set_long_description(
+        "Length of time for bucket stats to be cached within RGW instance."),
+
+    Option("rgw_bucket_quota_soft_threshold", Option::TYPE_FLOAT, Option::LEVEL_BASIC)
+    .set_default(0.95)
+    .set_description("RGW quota soft threshold")
+    .set_long_description(
+        "Threshold from which RGW doesn't rely on cached info for quota "
+        "decisions. This is done for higher accuracy of the quota mechanism at "
+        "cost of performance, when getting close to the quota limit. The value "
+        "configured here is the ratio between the data usage to the max usage "
+        "as specified by the quota."),
+
+    Option("rgw_bucket_quota_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("RGW quota stats cache size")
+    .set_long_description(
+        "Maximum number of entries in the quota stats cache."),
+
+    Option("rgw_bucket_default_quota_max_objects", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(-1)
+    .set_description("Default quota for max objects in a bucket")
+    .set_long_description(
+        "The default quota configuration for max number of objects in a bucket. A "
+        "negative number means 'unlimited'."),
+
+    Option("rgw_bucket_default_quota_max_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(-1)
+    .set_description("Default quota for total size in a bucket")
+    .set_long_description(
+        "The default quota configuration for total size of objects in a bucket. A "
+        "negative number means 'unlimited'."),
+
+    Option("rgw_expose_bucket", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Send Bucket HTTP header with the response")
+    .set_long_description(
+        "If true, RGW will send a Bucket HTTP header with the responses. The header will "
+        "contain the name of the bucket the operation happened on."),
+
+    Option("rgw_frontends", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("beast port=7480")
+    .set_description("RGW frontends configuration")
+    .set_long_description(
+        "A comma delimited list of frontends configuration. Each configuration contains "
+        "the type of the frontend followed by an optional space delimited set of "
+        "key=value config parameters."),
+
+    Option("rgw_user_quota_bucket_sync_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(180)
+    .set_description("User quota bucket sync interval")
+    .set_long_description(
+        "Time period for accumulating modified buckets before syncing these stats."),
+
+    Option("rgw_user_quota_sync_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1_day)
+    .set_description("User quota sync interval")
+    .set_long_description(
+        "Time period for accumulating modified buckets before syncing entire user stats."),
+
+    Option("rgw_user_quota_sync_idle_users", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Should sync idle users quota")
+    .set_long_description(
+        "Whether stats for idle users be fully synced."),
+
+    Option("rgw_user_quota_sync_wait_time", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1_day)
+    .set_description("User quota full-sync wait time")
+    .set_long_description(
+        "Minimum time between two full stats sync for non-idle users."),
+
+    Option("rgw_user_default_quota_max_objects", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(-1)
+    .set_description("User quota max objects")
+    .set_long_description(
+        "The default quota configuration for total number of objects for a single user. A "
+        "negative number means 'unlimited'."),
+
+    Option("rgw_user_default_quota_max_size", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(-1)
+    .set_description("User quota max size")
+    .set_long_description(
+        "The default quota configuration for total size of objects for a single user. A "
+        "negative number means 'unlimited'."),
+
+    Option("rgw_multipart_min_part_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(5_M)
+    .set_description("Minimum S3 multipart-upload part size")
+    .set_long_description(
+        "When doing a multipart upload, each part (other than the last part) should be "
+        "at least this size."),
+
+    Option("rgw_multipart_part_upload_limit", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("Max number of parts in multipart upload"),
+
+    Option("rgw_max_slo_entries", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("Max number of entries in Swift Static Large Object manifest"),
+
+    Option("rgw_olh_pending_timeout_sec", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(1_hr)
+    .set_description("Max time for pending OLH change to complete")
+    .set_long_description(
+        "OLH is a versioned object's logical head. Operations on it are journaled and "
+        "marked as pending before completion. If an operation doesn't complete within "
+        "this number of seconds, we remove the operation from the journal."),
+
+    Option("rgw_user_max_buckets", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(1000)
+    .set_description("Max number of buckets per user")
+    .set_long_description(
+      "A user can create at most this number of buckets. Zero means "
+      "no limit; a negative value means users cannot create any new "
+      "buckets, although users will retain buckets already created."),
+
+    Option("rgw_objexp_gc_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10_min)
+    .set_description("Swift objects expirer garbage collector interval"),
+
+    Option("rgw_objexp_hints_num_shards", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(127)
+    .set_description("Number of object expirer data shards")
+    .set_long_description(
+        "The number of shards the (Swift) object expirer will store its data on."),
+
+    Option("rgw_objexp_chunk_size", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(100)
+    .set_description(""),
+
+    Option("rgw_enable_static_website", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(false)
+    .set_description("Enable static website APIs")
+    .set_long_description(
+        "This configurable controls whether RGW handles the website control APIs. RGW can "
+        "serve static websites if s3website hostnames are configured, and that is unrelated "
+        "to this configurable."),
+
+     Option("rgw_user_unique_email", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(true)
+    .set_description("Require local RGW users to have unique email addresses")
+    .set_long_description(
+        "Enforce builtin user accounts to have unique email addresses.  This "
+	"setting is historical.  In future, non-enforcement of email address "
+        "uniqueness is likely to become the default."),
+
+    Option("rgw_log_http_headers", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_default("")
+    .set_description("List of HTTP headers to log")
+    .set_long_description(
+        "A comma delimited list of HTTP headers to log when seen, ignores case (e.g., "
+        "http_x_forwarded_for)."),
+
+    Option("rgw_num_async_rados_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description("Number of concurrent RADOS operations in multisite sync")
+    .set_long_description(
+        "The number of concurrent RADOS IO operations that will be triggered for handling "
+        "multisite sync operations. This includes control related work, and not the actual "
+        "sync operations."),
+
+    Option("rgw_md_notify_interval_msec", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(200)
+    .set_description("Length of time to aggregate metadata changes")
+    .set_long_description(
+        "Length of time (in milliseconds) in which the master zone aggregates all the "
+        "metadata changes that occurred, before sending notifications to all the other "
+        "zones."),
+
+    Option("rgw_run_sync_thread", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Should run sync thread"),
+
+    Option("rgw_sync_lease_period", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(120)
+    .set_description(""),
+
+    Option("rgw_sync_log_trim_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1200)
+    .set_description("Sync log trim interval")
+    .set_long_description(
+        "Time in seconds between attempts to trim sync logs."),
+
+    Option("rgw_sync_log_trim_max_buckets", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(16)
+    .set_description("Maximum number of buckets to trim per interval")
+    .set_long_description("The maximum number of buckets to consider for bucket index log trimming each trim interval, regardless of the number of bucket index shards. Priority is given to buckets with the most sync activity over the last trim interval.")
+    .add_see_also("rgw_sync_log_trim_interval")
+    .add_see_also("rgw_sync_log_trim_min_cold_buckets")
+    .add_see_also("rgw_sync_log_trim_concurrent_buckets"),
+
+    Option("rgw_sync_log_trim_min_cold_buckets", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("Minimum number of cold buckets to trim per interval")
+    .set_long_description("Of the `rgw_sync_log_trim_max_buckets` selected for bucket index log trimming each trim interval, at least this many of them must be 'cold' buckets. These buckets are selected in order from the list of all bucket instances, to guarantee that all buckets will be visited eventually.")
+    .add_see_also("rgw_sync_log_trim_interval")
+    .add_see_also("rgw_sync_log_trim_max_buckets")
+    .add_see_also("rgw_sync_log_trim_concurrent_buckets"),
+
+    Option("rgw_sync_log_trim_concurrent_buckets", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("Maximum number of buckets to trim in parallel")
+    .add_see_also("rgw_sync_log_trim_interval")
+    .add_see_also("rgw_sync_log_trim_max_buckets")
+    .add_see_also("rgw_sync_log_trim_min_cold_buckets"),
+
+    Option("rgw_sync_data_inject_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("rgw_sync_meta_inject_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("rgw_sync_trace_history_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(4096)
+    .set_description("Sync trace history size")
+    .set_long_description(
+      "Maximum number of complete sync trace entries to keep."),
+
+    Option("rgw_sync_trace_per_node_log_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description("Sync trace per-node log size")
+    .set_long_description(
+        "The number of log entries to keep per sync-trace node."),
+
+    Option("rgw_sync_trace_servicemap_update_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("Sync-trace service-map update interval")
+    .set_long_description(
+        "Number of seconds between service-map updates of sync-trace events."),
+
+    Option("rgw_period_push_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description("Period push interval")
+    .set_long_description(
+        "Number of seconds to wait before retrying 'period push' operation."),
+
+    Option("rgw_period_push_interval_max", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("Period push maximum interval")
+    .set_long_description(
+        "The max number of seconds to wait before retrying 'period push' after exponential "
+        "backoff."),
+
+    Option("rgw_safe_max_objects_per_shard", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(100*1024)
+    .set_description("Safe number of objects per shard")
+    .set_long_description(
+        "This is the max number of objects per bucket index shard that RGW considers "
+        "safe. RGW will warn if it identifies a bucket where its per-shard count is "
+        "higher than a percentage of this number.")
+    .add_see_also("rgw_shard_warning_threshold"),
+
+    Option("rgw_shard_warning_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(90)
+    .set_description("Warn about max objects per shard")
+    .set_long_description(
+        "Warn if number of objects per shard in a specific bucket passed this percentage "
+        "of the safe number.")
+    .add_see_also("rgw_safe_max_objects_per_shard"),
+
+    Option("rgw_swift_versioning_enabled", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Enable Swift versioning"),
+
+    Option("rgw_swift_custom_header", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Enable swift custom header")
+    .set_long_description(
+        "If not empty, specifies a name of HTTP header that can include custom data. When "
+        "uploading an object, if this header is passed RGW will store this header info "
+        "and it will be available when listing the bucket."),
+
+    Option("rgw_swift_need_stats", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Enable stats on bucket listing in Swift"),
+
+    Option("rgw_reshard_num_logs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(16)
+    .set_min(1)
+    .set_description("")
+    .add_service("rgw"),
+
+    Option("rgw_reshard_bucket_lock_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(360)
+    .set_min(30)
+    .set_description("Number of seconds the timeout on the reshard locks (bucket reshard lock and reshard log lock) are set to. As a reshard proceeds these locks can be renewed/extended. If too short, reshards cannot complete and will fail, causing a future reshard attempt. If too long a hung or crashed reshard attempt will keep the bucket locked for an extended period, not allowing RGW to detect the failed reshard attempt and recover.")
+    .add_tag("performance")
+    .add_service("rgw"),
+    
+    Option("rgw_reshard_batch_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(64)
+    .set_min(8)
+    .set_description("Number of reshard entries to batch together before sending the operations to the CLS back-end")
+    .add_tag("performance")
+    .add_service("rgw"),
+
+    Option("rgw_reshard_max_aio", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(128)
+    .set_min(16)
+    .set_description("Maximum number of outstanding asynchronous I/O operations to allow at a time during resharding")
+    .add_tag("performance")
+    .add_service("rgw"),
+
+    Option("rgw_trust_forwarded_https", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Trust Forwarded and X-Forwarded-Proto headers")
+    .set_long_description(
+        "When a proxy in front of radosgw is used for ssl termination, radosgw "
+        "does not know whether incoming http connections are secure. Enable "
+        "this option to trust the Forwarded and X-Forwarded-Proto headers sent "
+        "by the proxy when determining whether the connection is secure. This "
+        "is required for some features, such as server side encryption.")
+    .add_see_also("rgw_crypt_require_ssl"),
+
+    Option("rgw_crypt_require_ssl", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Requests including encryption key headers must be sent over ssl"),
+
+    Option("rgw_crypt_default_encryption_key", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description(""),
+
+    Option("rgw_crypt_s3_kms_encryption_keys", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description(""),
+
+    Option("rgw_crypt_suppress_logs", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("Suppress logs that might print client key"),
+
+    Option("rgw_list_bucket_min_readahead", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("Minimum number of entries to request from rados for bucket listing"),
+
+    Option("rgw_rest_getusage_op_compat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("REST GetUsage request backward compatibility"),
+
+    Option("rgw_torrent_flag", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("When true, uploaded objects will calculate and store "
+                     "a SHA256 hash of object data so the object can be "
+                     "retrieved as a torrent file"),
+
+    Option("rgw_torrent_tracker", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Torrent field announce and announce list"),
+
+    Option("rgw_torrent_createby", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("torrent field created by"),
+
+    Option("rgw_torrent_comment", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Torrent field comment"),
+
+    Option("rgw_torrent_encoding", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("torrent field encoding"),
+
+    Option("rgw_data_notify_interval_msec", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(200)
+    .set_description("data changes notification interval to followers"),
+
+    Option("rgw_torrent_origin", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Torrent origin"),
+
+    Option("rgw_torrent_sha_unit", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(512*1024)
+    .set_description(""),
+
+    Option("rgw_dynamic_resharding", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(true)
+    .set_description("Enable dynamic resharding")
+    .set_long_description(
+        "If true, RGW will dynamically increase the number of shards in buckets that have "
+        "a high number of objects per shard.")
+    .add_see_also("rgw_max_objs_per_shard"),
+
+    Option("rgw_max_objs_per_shard", Option::TYPE_UINT, Option::LEVEL_BASIC)
+    .set_default(100000)
+    .set_description("Max objects per shard for dynamic resharding")
+    .set_long_description(
+        "This is the max number of objects per bucket index shard that RGW will "
+        "allow with dynamic resharding. RGW will trigger an automatic reshard operation "
+        "on the bucket if it exceeds this number.")
+    .add_see_also("rgw_dynamic_resharding"),
+
+    Option("rgw_reshard_thread_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10_min)
+    .set_min(10_min)
+    .set_description("Number of seconds between processing of reshard log entries"),
+
+    Option("rgw_cache_expiry_interval", Option::TYPE_UINT,
+	   Option::LEVEL_ADVANCED)
+    .set_default(15_min)
+    .set_description("Number of seconds before entries in the cache are "
+		     "assumed stale and re-fetched. Zero is never.")
+    .add_tag("performance")
+    .add_service("rgw")
+    .set_long_description("The Rados Gateway stores metadata and objects in "
+			  "an internal cache. This should be kept consistent "
+			  "by the OSD's relaying notify events between "
+			  "multiple watching RGW processes. In the event "
+			  "that this notification protocol fails, bounding "
+			  "the length of time that any data in the cache will "
+			  "be assumed valid will ensure that any RGW instance "
+			  "that falls out of sync will eventually recover. "
+			  "This seems to be an issue mostly for large numbers "
+			  "of RGW instances under heavy use. If you would like "
+			  "to turn off cache expiry, set this value to zero."),
+
+    Option("rgw_inject_notify_timeout_probability", Option::TYPE_FLOAT,
+	   Option::LEVEL_DEV)
+    .set_default(0)
+    .add_tag("fault injection")
+    .add_tag("testing")
+    .add_service("rgw")
+    .set_min_max(0.0, 1.0)
+    .set_description("Likelihood of ignoring a notify")
+    .set_long_description("This is the probability that the RGW cache will "
+			  "ignore a cache notify message. It exists to help "
+			  "with the development and testing of cache "
+			  "consistency and recovery improvements. Please "
+			  "do not set it in a production cluster, as it "
+			  "actively causes failures. Set this to a floating "
+			  "point value between 0 and 1."),
+    Option("rgw_max_notify_retries", Option::TYPE_UINT,
+	   Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .add_tag("error recovery")
+    .add_service("rgw")
+    .set_description("Number of attempts to notify peers before giving up.")
+    .set_long_description("The number of times we will attempt to update "
+			  "a peer's cache in the event of error before giving "
+			  "up. This is unlikely to be an issue unless your "
+			  "cluster is very heavily loaded. Beware that "
+			  "increasing this value may cause some operations to "
+			  "take longer in exceptional cases and thus may, "
+			  "rarely, cause clients to time out."),
+    Option("rgw_sts_entry", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("sts")
+    .set_description("STS URL prefix")
+    .set_long_description("URL path prefix for internal STS requests."),
+
+    Option("rgw_sts_key", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("sts")
+    .set_description("STS Key")
+    .set_long_description("Key used for encrypting/ decrypting session token."),
+
+    Option("rgw_s3_auth_use_sts", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("Should S3 authentication use STS."),
+
+    Option("rgw_sts_max_session_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(43200)
+    .set_description("Session token max duration")
+    .set_long_description("Max duration in seconds for which the session token is valid."),
+
+    Option("rgw_max_listing_results", Option::TYPE_UINT,
+	   Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_min_max(1, 100000)
+    .add_service("rgw")
+    .set_description("Upper bound on results in listing operations, ListBucket max-keys")
+    .set_long_description("This caps the maximum permitted value for listing-like operations in RGW S3. "
+			  "Affects ListBucket(max-keys), "
+			  "ListBucketVersions(max-keys), "
+			  "ListBucketMultiPartUploads(max-uploads), "
+			  "ListMultipartUploadParts(max-parts)"),
+
+    Option("rgw_sts_token_introspection_url", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("STS Web Token introspection URL")
+    .set_long_description("URL for introspecting an STS Web Token."),
+
+    Option("rgw_sts_client_id", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Client Id")
+    .set_long_description("Client Id needed for introspecting a Web Token."),
+
+    Option("rgw_sts_client_secret", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("Client Secret")
+    .set_long_description("Client Secret needed for introspecting a Web Token."),
+
+    Option("rgw_max_concurrent_requests", Option::TYPE_INT, Option::LEVEL_BASIC)
+    .set_default(1024)
+    .set_description("Maximum number of concurrent HTTP requests.")
+    .set_long_description(
+        "Maximum number of concurrent HTTP requests that the beast frontend "
+        "will process. Tuning this can help to limit memory usage under heavy "
+        "load.")
+    .add_tag("performance")
+    .add_see_also("rgw_frontends"),
+
+    Option("rgw_scheduler_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("throttler")
+    .set_description("Set the type of dmclock scheduler, defaults to throttler. "
+		     "Other valid values are dmclock which is experimental"),
+
+    Option("rgw_dmclock_admin_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(100.0)
+    .set_description("mclock reservation for admin requests")
+    .add_see_also("rgw_dmclock_admin_wgt")
+    .add_see_also("rgw_dmclock_admin_lim"),
+
+    Option("rgw_dmclock_admin_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(100.0)
+    .set_description("mclock weight for admin requests")
+    .add_see_also("rgw_dmclock_admin_res")
+    .add_see_also("rgw_dmclock_admin_lim"),
+
+    Option("rgw_dmclock_admin_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock limit for admin requests")
+    .add_see_also("rgw_dmclock_admin_res")
+    .add_see_also("rgw_dmclock_admin_wgt"),
+
+    Option("rgw_dmclock_auth_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(200.0)
+    .set_description("mclock reservation for auth requests")
+    .add_see_also("rgw_dmclock_auth_wgt")
+    .add_see_also("rgw_dmclock_auth_lim"),
+
+    Option("rgw_dmclock_auth_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(100.0)
+    .set_description("mclock weight for auth requests")
+    .add_see_also("rgw_dmclock_auth_res")
+    .add_see_also("rgw_dmclock_auth_lim"),
+
+    Option("rgw_dmclock_auth_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock limit for auth requests")
+    .add_see_also("rgw_dmclock_auth_res")
+    .add_see_also("rgw_dmclock_auth_wgt"),
+
+    Option("rgw_dmclock_data_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(500.0)
+    .set_description("mclock reservation for object data requests")
+    .add_see_also("rgw_dmclock_data_wgt")
+    .add_see_also("rgw_dmclock_data_lim"),
+
+    Option("rgw_dmclock_data_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(500.0)
+    .set_description("mclock weight for object data requests")
+    .add_see_also("rgw_dmclock_data_res")
+    .add_see_also("rgw_dmclock_data_lim"),
+
+    Option("rgw_dmclock_data_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock limit for object data requests")
+    .add_see_also("rgw_dmclock_data_res")
+    .add_see_also("rgw_dmclock_data_wgt"),
+
+    Option("rgw_dmclock_metadata_res", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(500.0)
+    .set_description("mclock reservation for metadata requests")
+    .add_see_also("rgw_dmclock_metadata_wgt")
+    .add_see_also("rgw_dmclock_metadata_lim"),
+
+    Option("rgw_dmclock_metadata_wgt", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(500.0)
+    .set_description("mclock weight for metadata requests")
+    .add_see_also("rgw_dmclock_metadata_res")
+    .add_see_also("rgw_dmclock_metadata_lim"),
+
+    Option("rgw_dmclock_metadata_lim", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.0)
+    .set_description("mclock limit for metadata requests")
+    .add_see_also("rgw_dmclock_metadata_res")
+    .add_see_also("rgw_dmclock_metadata_wgt"),
+
+  });
+}
+
+static std::vector<Option> get_rbd_options() {
+  return std::vector<Option>({
+    Option("rbd_default_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("rbd")
+    .set_description("default pool for storing new images")
+    .set_validator([](std::string *value, std::string *error_message){
+      // valid names are non-empty and contain neither '@' nor '/'
+      if (value->empty() || value->find_first_of("@/") != std::string::npos) {
+        *value = "rbd";
+        *error_message = "invalid RBD default pool, resetting to 'rbd'";
+      }
+      return 0;
+    }),
+
+    Option("rbd_default_data_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("default pool for storing data blocks for new images")
+    .set_validator([](std::string *value, std::string *error_message){
+      // empty is allowed; any '@' or '/' makes the name invalid
+      if (value->find_first_of("@/") != std::string::npos) {
+        *value = "";
+        *error_message = "ignoring invalid RBD data pool";
+      }
+      return 0;
+    }),
+
+    Option("rbd_default_features", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("layering,exclusive-lock,object-map,fast-diff,deep-flatten")
+    .set_description("default v2 image features for new images")
+    .set_long_description(
+        "RBD features are only applicable for v2 images. This setting accepts "
+        "either an integer bitmask value or comma-delimited string of RBD "
+        "feature names. This setting is always internally stored as an integer "
+        "bitmask value. The mapping between feature bitmask value and feature "
+        "name is as follows: +1 -> layering, +2 -> striping, "
+        "+4 -> exclusive-lock, +8 -> object-map, +16 -> fast-diff, "
+        "+32 -> deep-flatten, +64 -> journaling, +128 -> data-pool")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_validator([](std::string *value, std::string *error_message) {
+	ostringstream ss;
+	uint64_t features = librbd::rbd_features_from_string(*value, &ss);
+	// Leave this in integer form to avoid breaking Cinder.  Someday
+	// we would like to present this in string form instead...
+	*value = stringify(features);
+	if (ss.str().size()) {
+	  return -EINVAL;
+	}
+	return 0;
+      }),
+
+    Option("rbd_op_threads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("number of threads to utilize for internal processing"),
+
+    Option("rbd_op_thread_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .set_description("time in seconds for detecting a hung thread"),
+
+    Option("rbd_non_blocking_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("process AIO ops from a dispatch thread to prevent blocking"),
+
+    Option("rbd_cache", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("whether to enable caching (writeback unless rbd_cache_max_dirty is 0)"),
+
+    Option("rbd_cache_writethrough_until_flush", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("whether to make writeback caching writethrough until "
+                     "flush is called, to be sure the user of librbd will send "
+                     "flushes so that writeback is safe"),
+
+    Option("rbd_cache_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(32_M)
+    .set_description("cache size in bytes"),
+
+    Option("rbd_cache_max_dirty", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(24_M)
+    .set_description("dirty limit in bytes - set to 0 for write-through caching"),
+
+    Option("rbd_cache_target_dirty", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(16_M)
+    .set_description("target dirty limit in bytes"),
+
+    Option("rbd_cache_max_dirty_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description("seconds in cache before writeback starts"),
+
+    Option("rbd_cache_max_dirty_object", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("dirty limit for objects - set to 0 for auto calculate from rbd_cache_size"),
+
+    Option("rbd_cache_block_writes_upfront", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("whether to block writes to the cache before the aio_write call completes"),
+
+    Option("rbd_concurrent_management_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_min(1)
+    .set_description("how many operations can be in flight for a management operation like deleting or resizing an image"),
+
+    Option("rbd_balance_snap_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("distribute snap read requests to random OSD"),
+
+    Option("rbd_localize_snap_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("localize snap read requests to closest OSD"),
+
+    Option("rbd_balance_parent_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("distribute parent read requests to random OSD"),
+
+    Option("rbd_localize_parent_reads", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("localize parent requests to closest OSD"),
+
+    Option("rbd_sparse_read_threshold_bytes", Option::TYPE_SIZE,
+           Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .set_description("threshold for issuing a sparse-read")
+    .set_long_description("minimum number of sequential bytes to read against "
+                          "an object before issuing a sparse-read request to "
+                          "the cluster. 0 implies it must be a full object read "
+                          "to issue a sparse-read, 1 implies always use "
+                          "sparse-read, and any value larger than the maximum "
+                          "object size will disable sparse-read for all "
+                          "requests"),
+
+    Option("rbd_readahead_trigger_requests", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("number of sequential requests necessary to trigger readahead"),
+
+    Option("rbd_readahead_max_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(512_K)
+    .set_description("set to 0 to disable readahead"),
+
+    Option("rbd_readahead_disable_after_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(50_M)
+    .set_description("how many bytes are read in total before readahead is disabled"),
+
+    Option("rbd_clone_copy_on_read", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("copy-up parent image blocks to clone upon read request"),
+
+    Option("rbd_blacklist_on_break_lock", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("whether to blacklist clients whose lock was broken"),
+
+    Option("rbd_blacklist_expire_seconds", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("number of seconds to blacklist - set to 0 for OSD default"),
+
+    Option("rbd_request_timed_out_seconds", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("number of seconds before maintenance request times out"),
+
+    Option("rbd_skip_partial_discard", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("skip discard (zero) of unaligned extents within an object"),
+
+    Option("rbd_discard_granularity_bytes", Option::TYPE_UINT,
+           Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .set_min_max(4_K, 32_M)
+    .set_validator([](std::string *value, std::string *error_message){
+        uint64_t f = strict_si_cast<uint64_t>(value->c_str(), error_message);
+        if (!error_message->empty()) {
+          return -EINVAL;
+        } else if (!isp2(f)) {
+          *error_message = "value must be a power of two";
+          return -EINVAL;
+        }
+        return 0;
+      })
+    .set_description("minimum aligned size of discard operations"),
+
+    Option("rbd_enable_alloc_hint", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("when writing a object, it will issue a hint to osd backend to indicate the expected size object need"),
+
+    Option("rbd_compression_hint", Option::TYPE_STR, Option::LEVEL_BASIC)
+    .set_enum_allowed({"none", "compressible", "incompressible"})
+    .set_default("none")
+    .set_description("Compression hint to send to the OSDs during writes")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("rbd_tracing", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("true if LTTng-UST tracepoints should be enabled"),
+
+    Option("rbd_blkin_trace_all", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("create a blkin trace for all RBD requests"),
+
+    Option("rbd_validate_pool", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("validate empty pools for RBD compatibility"),
+
+    Option("rbd_validate_names", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("validate new image names for RBD compatibility"),
+
+    Option("rbd_auto_exclusive_lock_until_manual_request", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("automatically acquire/release exclusive lock until it is explicitly requested"),
+
+    Option("rbd_move_to_trash_on_remove", Option::TYPE_BOOL, Option::LEVEL_BASIC)
+    .set_default(false)
+    .set_description("automatically move images to the trash when deleted"),
+
+    Option("rbd_move_to_trash_on_remove_expire_seconds", Option::TYPE_UINT, Option::LEVEL_BASIC)
+    .set_default(0)
+    .set_description("default number of seconds to protect deleted images in the trash"),
+
+    Option("rbd_mirroring_resync_after_disconnect", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("automatically start image resync after mirroring is disconnected due to being laggy"),
+
+    Option("rbd_mirroring_delete_delay", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("time-delay in seconds for rbd-mirror delete propagation"),
+
+    Option("rbd_mirroring_replay_delay", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("time-delay in seconds for rbd-mirror asynchronous replication"),
+
+    Option("rbd_default_format", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description("default image format for new images"),
+
+    Option("rbd_default_order", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(22)
+    .set_description("default order (data block object size) for new images"),
+
+    Option("rbd_default_stripe_count", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("default stripe count for new images"),
+
+    Option("rbd_default_stripe_unit", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("default stripe width for new images"),
+
+    Option("rbd_default_map_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("default krbd map options"),
+
+    Option("rbd_default_clone_format", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_enum_allowed({"1", "2", "auto"})
+    .set_default("auto")
+    .set_description("default internal format for handling clones")
+    .set_long_description("This sets the internal format for tracking cloned "
+                          "images. The setting of '1' requires attaching to "
+                          "protected snapshots that cannot be removed until "
+                          "the clone is removed/flattened. The setting of '2' "
+                          "will allow clones to be attached to any snapshot "
+                          "and permits removing in-use parent snapshots but "
+                          "requires Mimic or later clients. The default "
+                          "setting of 'auto' will use the v2 format if the "
+                          "cluster is configured to require mimic or later "
+                          "clients.")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("rbd_journal_order", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_min_max(12, 26)
+    .set_default(24)
+    .set_description("default order (object size) for journal data objects"),
+
+    Option("rbd_journal_splay_width", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("number of active journal objects"),
+
+    Option("rbd_journal_commit_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("commit time interval, seconds"),
+
+    Option("rbd_journal_object_writethrough_until_flush", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("when enabled, the rbd_journal_object_flush* configuration "
+                     "options are ignored until the first flush so that batched "
+                     "journal IO is known to be safe for consistency"),
+
+    Option("rbd_journal_object_flush_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("maximum number of pending commits per journal object"),
+
+    Option("rbd_journal_object_flush_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .set_description("maximum number of pending bytes per journal object"),
+
+    Option("rbd_journal_object_flush_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("maximum age (in seconds) for pending commits"),
+
+    Option("rbd_journal_object_max_in_flight_appends", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("maximum number of in-flight appends per journal object"),
+
+    Option("rbd_journal_pool", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("pool for journal objects"),
+
+    Option("rbd_journal_max_payload_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(16384)
+    .set_description("maximum journal payload size before splitting"),
+
+    Option("rbd_journal_max_concurrent_object_sets", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("maximum number of object sets a journal client can be behind before it is automatically unregistered"),
+
+    Option("rbd_qos_iops_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired limit of IO operations per second"),
+
+    Option("rbd_qos_bps_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired limit of IO bytes per second"),
+
+    Option("rbd_qos_read_iops_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired limit of read operations per second"),
+
+    Option("rbd_qos_write_iops_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired limit of write operations per second"),
+
+    Option("rbd_qos_read_bps_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired limit of read bytes per second"),
+
+    Option("rbd_qos_write_bps_limit", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired limit of write bytes per second"),
+
+    Option("rbd_qos_iops_burst", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired burst limit of IO operations"),
+
+    Option("rbd_qos_bps_burst", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired burst limit of IO bytes"),
+
+    Option("rbd_qos_read_iops_burst", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired burst limit of read operations"),
+
+    Option("rbd_qos_write_iops_burst", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired burst limit of write operations"),
+
+    Option("rbd_qos_read_bps_burst", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired burst limit of read bytes"),
+
+    Option("rbd_qos_write_bps_burst", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("the desired burst limit of write bytes"),
+
+    Option("rbd_qos_schedule_tick_min", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(50)
+    .set_min(1)
+    .set_description("minimum schedule tick (in milliseconds) for QoS"),
+
+    Option("rbd_discard_on_zeroed_write_same", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("discard data on zeroed write same instead of writing zero"),
+
+    Option("rbd_mtime_update_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .set_min(0)
+    .set_description("RBD Image modify timestamp refresh interval. Set to 0 to disable modify timestamp update."),
+
+    Option("rbd_atime_update_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .set_min(0)
+    .set_description("RBD Image access timestamp refresh interval. Set to 0 to disable access timestamp update."),
+
+    Option("rbd_config_pool_override_update_timestamp", Option::TYPE_UINT,
+           Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("timestamp of last update to pool-level config overrides"),
+
+  });
+}
+
+// Build the table of rbd_mirror_* configuration options.
+//
+// Every entry names the option, its value type and its visibility level,
+// then chains setters for the default value, optional bounds or allowed
+// values, and the help text.
+//
+// Returns a vector holding one Option per rbd_mirror_* setting, in the
+// order they are listed here.
+static std::vector<Option> get_rbd_mirror_options() {
+  return std::vector<Option>({
+    Option("rbd_mirror_journal_commit_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("commit time interval, seconds"),
+
+    Option("rbd_mirror_journal_poll_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("maximum age (in seconds) between successive journal polls"),
+
+    Option("rbd_mirror_journal_max_fetch_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(32768)
+    .set_description("maximum bytes to read from each journal data object per fetch"),
+
+    Option("rbd_mirror_sync_point_update_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("number of seconds between each update of the image sync point object number"),
+
+    Option("rbd_mirror_concurrent_image_syncs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("maximum number of image syncs in parallel"),
+
+    Option("rbd_mirror_pool_replayers_refresh_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("interval to refresh peers in rbd-mirror daemon"),
+
+    Option("rbd_mirror_concurrent_image_deletions", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_min(1)
+    .set_description("maximum number of image deletions in parallel"),
+
+    Option("rbd_mirror_delete_retry_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("interval to check and retry the failed deletion requests"),
+
+    Option("rbd_mirror_image_state_check_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_min(1)
+    .set_description("interval to get images from pool watcher and set sources in replayer"),
+
+    Option("rbd_mirror_leader_heartbeat_interval", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_min(1)
+    .set_description("interval (in seconds) between mirror leader heartbeats"),
+
+    Option("rbd_mirror_leader_max_missed_heartbeats", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(2)
+    .set_description("number of missed heartbeats for non-lock owner to attempt to acquire lock"),
+
+    Option("rbd_mirror_leader_max_acquire_attempts_before_break", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_description("number of failed attempts to acquire lock after missing heartbeats before breaking lock"),
+
+    // string option restricted to the values listed in set_enum_allowed();
+    // the default is one of them
+    Option("rbd_mirror_image_policy_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("simple")
+    .set_enum_allowed({"none", "simple"})
+    .set_description("active/active policy type for mapping images to instances"),
+
+    Option("rbd_mirror_image_policy_migration_throttle", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(300)
+    .set_description("number of seconds after which an image can be reshuffled (migrated) again"),
+
+    Option("rbd_mirror_image_policy_update_throttle_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_min(1)
+    .set_description("interval (in seconds) to throttle images for mirror daemon peer updates"),
+
+    Option("rbd_mirror_image_policy_rebalance_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("number of seconds policy should be idle before triggering reshuffle (rebalance) of images"),
+
+    // The priority enumerators are converted to int64_t explicitly before
+    // being handed to the setters. The accepted range runs from
+    // PRIO_DEBUGONLY up to one above PRIO_CRITICAL.
+    // NOTE(review): the "+ 1" presumably permits a value above every
+    // counter priority -- confirm against the consumer of this option.
+    Option("rbd_mirror_perf_stats_prio", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(static_cast<int64_t>(PerfCountersBuilder::PRIO_USEFUL))
+    .set_description("Priority level for mirror daemon replication perf counters")
+    .set_long_description("The daemon will send perf counter data to the "
+                          "manager daemon if the priority is not lower than "
+                          "mgr_stats_threshold.")
+    .set_min_max(static_cast<int64_t>(PerfCountersBuilder::PRIO_DEBUGONLY),
+                 static_cast<int64_t>(PerfCountersBuilder::PRIO_CRITICAL) + 1),
+  });
+}
+
+std::vector<Option> get_mds_options() {
+  return std::vector<Option>({
+    Option("mds_data", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/var/lib/ceph/mds/$cluster-$id")
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_description("path to MDS data and keyring"),
+
+    Option("mds_max_xattr_pairs_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(64_K)
+    .set_description("maximum aggregate size of extended attributes on a file"),
+
+    Option("mds_cache_trim_interval", Option::TYPE_SECS, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("interval in seconds between cache trimming")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("mds_cache_release_free_interval", Option::TYPE_SECS, Option::LEVEL_DEV)
+    .set_default(10)
+    .set_description("interval in seconds between heap releases")
+    .set_flag(Option::FLAG_RUNTIME),
+
+    Option("mds_cache_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("maximum number of inodes in MDS cache (<=0 is unlimited)")
+    .set_long_description("This tunable is no longer recommended. Use mds_cache_memory_limit."),
+
+    Option("mds_cache_memory_limit", Option::TYPE_SIZE, Option::LEVEL_BASIC)
+    .set_default(1*(1LL<<30))
+    .set_description("target maximum memory usage of MDS cache")
+    .set_long_description("This sets a target maximum memory usage of the MDS cache and is the primary tunable to limit the MDS memory usage. The MDS will try to stay under a reservation of this limit (by default 95%; 1 - mds_cache_reservation) by trimming unused metadata in its cache and recalling cached items in the client caches. It is possible for the MDS to exceed this limit due to slow recall from clients. The mds_health_cache_threshold (150%) sets a cache full threshold for when the MDS signals a cluster health warning."),
+
+    Option("mds_cache_reservation", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.05)
+    .set_description("amount of memory to reserve for future cached objects"),
+
+    Option("mds_health_cache_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.5)
+    .set_description("threshold for cache size to generate health warning"),
+
+    Option("mds_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.7)
+    .set_description("midpoint for MDS cache LRU"),
+
+    Option("mds_cache_trim_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("decay rate for trimming MDS cache throttle"),
+
+    Option("mds_cache_trim_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(256_K)
+    .set_description("threshold for number of dentries that can be trimmed"),
+
+    Option("mds_max_file_recover", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(32)
+    .set_description("maximum number of files to recover file sizes in parallel"),
+
+    Option("mds_dir_max_commit_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("maximum size in megabytes for a RADOS write to a directory"),
+
+    Option("mds_dir_keys_per_op", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(16384)
+    .set_description("number of directory entries to read in one RADOS operation"),
+
+    Option("mds_decay_halflife", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("rate of decay for temperature counters on each directory for balancing"),
+
+    Option("mds_beacon_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("interval in seconds between MDS beacons to monitors"),
+
+    Option("mds_beacon_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(15)
+    .set_description("tolerance in seconds for missed MDS beacons to monitors"),
+
+    Option("mds_heartbeat_grace", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(15)
+    .set_description("tolerance in seconds for MDS internal heartbeat"),
+
+    Option("mds_enforce_unique_name", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("require MDS name is unique in the cluster"),
+
+    Option("mds_session_blacklist_on_timeout", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("blacklist clients whose sessions have become stale"),
+
+    Option("mds_session_blacklist_on_evict", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("blacklist clients that have been evicted"),
+
+    Option("mds_sessionmap_keys_per_op", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("number of omap keys to read from the SessionMap in one operation"),
+
+    Option("mds_recall_max_caps", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(30000)
+    .set_description("maximum number of caps to recall from client session in single recall"),
+
+    Option("mds_recall_max_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.5)
+    .set_description("decay rate for throttle on recalled caps on a session"),
+
+    Option("mds_recall_max_decay_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128_K)
+    .set_description("decay threshold for throttle on recalled caps on a session"),
+
+    Option("mds_recall_global_max_decay_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128_K)
+    .set_description("decay threshold for throttle on recalled caps globally"),
+
+    Option("mds_recall_warning_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(256_K)
+    .set_description("decay threshold for warning on slow session cap recall"),
+
+    Option("mds_recall_warning_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(60.0)
+    .set_description("decay rate for warning on slow session cap recall"),
+
+    Option("mds_session_cache_liveness_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .add_see_also("mds_session_cache_liveness_magnitude")
+    .set_default(5_min)
+    .set_description("decay rate for session liveness leading to preemptive cap recall")
+    .set_long_description("This determines how long a session needs to be quiescent before the MDS begins preemptively recalling capabilities. The default of 5 minutes will cause 10 halvings of the decay counter after 1 hour, or 1/1024. The default magnitude of 10 (1^10 or 1024) is chosen so that the MDS considers a previously chatty session (approximately) to be quiescent after 1 hour."),
+
+    Option("mds_session_cache_liveness_magnitude", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .add_see_also("mds_session_cache_liveness_decay_rate")
+    .set_default(10)
+    .set_description("decay magnitude for preemptively recalling caps on quiet client")
+    .set_long_description("This is the order of magnitude difference (in base 2) of the internal liveness decay counter and the number of capabilities the session holds. When this difference occurs, the MDS treats the session as quiescent and begins recalling capabilities."),
+
+    Option("mds_session_cap_acquisition_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("decay rate for session readdir caps leading to readdir throttle")
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_long_description("The half-life for the session cap acquisition counter of caps acquired by readdir. This is used for throttling readdir requests from clients slow to release caps."),
+
+    Option("mds_session_cap_acquisition_throttle", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(500000)
+    .set_description("throttle point for cap acquisition decay counter"),
+
+    Option("mds_session_max_caps_throttle_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.1)
+    .set_description("ratio of mds_max_maps_per_client that client must exceed before readdir may be throttled by cap acquisition throttle"),
+
+    Option("mds_cap_acquisition_throttle_retry_request_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.5)
+    .set_description("timeout in seconds after which a client request is retried due to cap acquisition throttling"),
+
+    Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(30)
+    .set_description(""),
+
+    Option("mds_health_summarize_threshold", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("threshold of number of clients to summarize late client recall"),
+
+    Option("mds_reconnect_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(45)
+    .set_description("timeout in seconds to wait for clients to reconnect during MDS reconnect recovery state"),
+
+    Option("mds_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("time in seconds between upkeep tasks"),
+
+    Option("mds_dirstat_min_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1)
+    .set_description(""),
+
+    Option("mds_scatter_nudge_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("minimum interval between scatter lock updates"),
+
+    Option("mds_client_prealloc_inos", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("number of unused inodes to pre-allocate to clients for file creation"),
+
+    Option("mds_early_reply", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("additional reply to clients that metadata requests are complete but not yet durable"),
+
+    Option("mds_replay_unsafe_with_closed_session", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("complete all the replay request when mds is restarted, no matter the session is closed or not"),
+
+    Option("mds_default_dir_hash", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(CEPH_STR_HASH_RJENKINS)
+    .set_description("hash function to select directory fragment for dentry name"),
+
+    Option("mds_log_pause", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_log_skip_corrupt_events", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_log_max_events", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(-1)
+    .set_description("maximum number of events in the MDS journal (-1 is unlimited)"),
+
+    Option("mds_log_events_per_segment", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("maximum number of events in an MDS journal segment"),
+
+    Option("mds_log_segment_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("size in bytes of each MDS log segment"),
+
+    Option("mds_log_max_segments", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(128)
+    .set_description("maximum number of segments which may be untrimmed"),
+
+    Option("mds_log_warn_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2.0)
+    .set_min(1.0)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("trigger MDS_HEALTH_TRIM warning when the mds log is longer than mds_log_max_segments * mds_log_warn_factor"),
+
+    Option("mds_bal_export_pin", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("allow setting directory export pins to particular ranks"),
+
+    Option("mds_bal_sample_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(3.0)
+    .set_description("interval in seconds between balancer ticks"),
+
+    Option("mds_bal_replicate_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(8000)
+    .set_description("hot popularity threshold to replicate a subtree"),
+
+    Option("mds_bal_unreplicate_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("cold popularity threshold to merge subtrees"),
+
+    Option("mds_bal_split_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("minimum size of directory fragment before splitting"),
+
+    Option("mds_bal_split_rd", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(25000)
+    .set_description("hot read popularity threshold for splitting a directory fragment"),
+
+    Option("mds_bal_split_wr", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("hot write popularity threshold for splitting a directory fragment"),
+
+    Option("mds_bal_split_bits", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(3)
+    .set_min_max(1, 24)
+    .set_description("power of two child fragments for a fragment on split"),
+
+    Option("mds_bal_merge_size", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(50)
+    .set_description("size of fragments where merging should occur"),
+
+    Option("mds_bal_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10)
+    .set_description("interval between MDS balancer cycles"),
+
+    Option("mds_bal_fragment_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("delay in seconds before interrupting client IO to perform splits"),
+
+    Option("mds_bal_fragment_size_max", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000*10)
+    .set_description("maximum size of a directory fragment before new creat/links fail"),
+
+    Option("mds_bal_fragment_fast_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.5)
+    .set_description("ratio of mds_bal_split_size at which fast fragment splitting occurs"),
+
+    Option("mds_bal_fragment_dirs", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("enable directory fragmentation")
+    .set_long_description("Directory fragmentation is a standard feature of CephFS that allows sharding directories across multiple objects for performance and stability. Additionally, this allows fragments to be distributed across multiple active MDSs to increase throughput. Disabling (new) fragmentation should only be done in exceptional circumstances and may lead to performance issues."),
+
+    Option("mds_bal_idle_threshold", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("idle metadata popularity threshold before rebalancing"),
+
+    Option("mds_bal_max", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(-1)
+    .set_description(""),
+
+    Option("mds_bal_max_until", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(-1)
+    .set_description(""),
+
+    Option("mds_bal_mode", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_bal_min_rebalance", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.1)
+    .set_description("amount overloaded over internal target before balancer begins offloading"),
+
+    Option("mds_bal_min_start", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.2)
+    .set_description(""),
+
+    Option("mds_bal_need_min", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.8)
+    .set_description(""),
+
+    Option("mds_bal_need_max", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1.2)
+    .set_description(""),
+
+    Option("mds_bal_midchunk", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.3)
+    .set_description(""),
+
+    Option("mds_bal_minchunk", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(.001)
+    .set_description(""),
+
+    Option("mds_bal_target_decay", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(10.0)
+    .set_description("rate of decay for export targets communicated to clients"),
+
+    Option("mds_replay_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(1.0)
+    .set_description("time in seconds between replay of updates to journal by standby replay MDS"),
+
+    Option("mds_shutdown_check", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_thrash_exports", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_thrash_fragments", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_dump_cache_on_map", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_dump_cache_after_rejoin", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_verify_scatter", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_debug_scatterstat", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_debug_frag", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_debug_auth_pins", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_debug_subtrees", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("mds_kill_mdstable_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_max_export_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(20_M)
+    .set_description(""),
+
+    Option("mds_kill_export_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_kill_import_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_kill_link_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_kill_rename_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_kill_openc_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_kill_journal_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_kill_journal_expire_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_kill_journal_replay_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_journal_format", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(1)
+    .set_description(""),
+
+    Option("mds_kill_create_at", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_inject_traceless_reply_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_wipe_sessions", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_wipe_ino_prealloc", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_skip_ino", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("mds_enable_op_tracker", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("track remote operation progression and statistics"),
+
+    Option("mds_op_history_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(20)
+    .set_description("maximum size for list of historical operations"),
+
+    Option("mds_op_history_duration", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(600)
+    .set_description("expiration time in seconds of historical operations"),
+
+    Option("mds_op_complaint_time", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(30)
+    .set_description("time in seconds to consider an operation blocked after no updates"),
+
+    Option("mds_op_log_threshold", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(5)
+    .set_description(""),
+
+    Option("mds_snap_min_uid", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("minimum uid of client to perform snapshots"),
+
+    Option("mds_snap_max_uid", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(4294967294)
+    .set_description("maximum uid of client to perform snapshots"),
+
+    Option("mds_snap_rstat", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("enabled nested rstat for snapshots"),
+
+    Option("mds_verify_backtrace", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(1)
+    .set_description(""),
+
+    Option("mds_max_completed_flushes", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(100000)
+    .set_description(""),
+
+    Option("mds_max_completed_requests", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(100000)
+    .set_description(""),
+
+    Option("mds_action_on_write_error", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1)
+    .set_description("action to take when MDS cannot write to RADOS (0:ignore, 1:read-only, 2:suicide)"),
+
+    Option("mds_mon_shutdown_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("time to wait for mon to receive damaged MDS rank notification"),
+
+    Option("mds_max_purge_files", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(64)
+    .set_description("maximum number of deleted files to purge in parallel"),
+
+    Option("mds_max_purge_ops", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(8192)
+    .set_description("maximum number of purge operations performed in parallel"),
+
+    Option("mds_max_purge_ops_per_pg", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(0.5)
+    .set_description("number of parallel purge operations performed per PG"),
+
+    Option("mds_purge_queue_busy_flush_period", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1.0)
+    .set_description(""),
+
+    Option("mds_root_ino_uid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("default uid for new root directory"),
+
+    Option("mds_root_ino_gid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("default gid for new root directory"),
+
+    Option("mds_max_scrub_ops_in_progress", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("maximum number of scrub operations performed in parallel"),
+
+    Option("mds_forward_all_requests_to_auth", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("always process op on auth mds"),
+    
+    Option("mds_damage_table_max_entries", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_description("maximum number of damage table entries"),
+
+    Option("mds_client_writeable_range_max_inc_objs", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1024)
+    .set_description("maximum number of objects in writeable range of a file for a client"),
+
+    Option("mds_min_caps_per_client", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(100)
+    .set_description("minimum number of capabilities a client may hold"),
+
+    Option("mds_min_caps_working_set", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(10000)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("number of capabilities a client may hold without cache pressure warnings generated"),
+
+    Option("mds_max_caps_per_client", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(1_M)
+    .set_description("maximum number of capabilities a client may hold"),
+
+    Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+     .set_default(0)
+     .set_description("INTENTIONALLY CAUSE DATA LOSS by bypasing checks for invalid metadata on disk. Allows testing repair tools."),
+
+    Option("mds_defer_session_stale", Option::TYPE_BOOL, Option::LEVEL_DEV)
+     .set_default(true),
+
+    Option("mds_inject_migrator_session_race", Option::TYPE_BOOL, Option::LEVEL_DEV)
+     .set_default(false),
+
+    Option("mds_request_load_average_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(60)
+    .set_description("rate of decay in seconds for calculating request load average"),
+
+    Option("mds_cap_revoke_eviction_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+     .set_default(0)
+     .set_description("number of seconds after which clients which have not responded to cap revoke messages by the MDS are evicted."),
+
+    Option("mds_max_retries_on_remount_failure", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+     .set_default(5)
+     .set_description("number of consecutive failed remount attempts for invalidating kernel dcache after which client would abort."),
+
+    Option("mds_dump_cache_threshold_formatter", Option::TYPE_SIZE, Option::LEVEL_DEV)
+     .set_default(1_G)
+     .set_description("threshold for cache usage to disallow \"dump cache\" operation to formatter")
+     .set_long_description("Disallow MDS from dumping caches to formatter via \"dump cache\" command if cache usage exceeds this threshold."),
+
+    Option("mds_dump_cache_threshold_file", Option::TYPE_SIZE, Option::LEVEL_DEV)
+     .set_default(0)
+     .set_description("threshold for cache usage to disallow \"dump cache\" operation to file")
+     .set_long_description("Disallow MDS from dumping caches to file via \"dump cache\" command if cache usage exceeds this threshold."),
+
+    Option("mds_max_snaps_per_dir", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+     .set_default(100)
+     .set_min_max(0, 4096)
+     .set_flag(Option::FLAG_RUNTIME)
+     .set_description("max snapshots per directory")
+     .set_long_description("maximum number of snapshots that can be created per directory"),
+
+    Option("mds_task_status_update_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+     .set_default(2.0)
+     .set_description("task status update interval to manager")
+     .set_long_description("interval (in seconds) for sending mds task status to ceph manager"),
+  });
+}
+
+std::vector<Option> get_mds_client_options() {  // returns the client-side option table (client_*, fuse_* and a few testing knobs) by value
+  return std::vector<Option>({
+    Option("client_cache_size", Option::TYPE_SIZE, Option::LEVEL_BASIC)  // NOTE(review): TYPE_SIZE, although the description counts directory entries
+    .set_default(16384)
+    .set_description("soft maximum number of directory entries in client cache"),
+
+    Option("client_cache_mid", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(.75)
+    .set_description("mid-point of client cache LRU"),
+
+    Option("client_use_random_mds", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("issue new requests to a random active MDS"),
+
+    Option("client_mount_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(300.0)
+    .set_description("timeout for mounting CephFS (seconds)"),
+
+    Option("client_tick_interval", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(1.0)
+    .set_description("seconds between client upkeep ticks"),
+
+    Option("client_trace", Option::TYPE_STR, Option::LEVEL_DEV)
+    .set_default("")
+    .set_description("file containing trace of client operations"),
+
+    Option("client_readahead_min", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(128*1024)  // 131072 bytes
+    .set_description("minimum bytes to readahead in a file"),
+
+    Option("client_readahead_max_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("maximum bytes to readahead in a file (zero is unlimited)"),
+
+    Option("client_readahead_max_periods", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(4)
+    .set_description("maximum stripe periods to readahead in a file"),
+
+    Option("client_reconnect_stale", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("reconnect when the session becomes stale"),
+
+    Option("client_snapdir", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default(".snap")
+    .set_description("pseudo directory for snapshot access to a directory"),
+
+    Option("client_mountpoint", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("/")
+    .set_description("default mount-point"),
+
+    Option("client_mount_uid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(-1)
+    .set_description("uid to mount as"),
+
+    Option("client_mount_gid", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(-1)
+    .set_description("gid to mount as"),
+
+    /* RADOS client option */
+    Option("client_notify_timeout", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(10)
+    .set_description(""),
+
+    /* RADOS client option */
+    Option("osd_client_watch_timeout", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(30)
+    .set_description(""),
+
+    Option("client_caps_release_delay", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(5)
+    .set_description(""),
+
+    Option("client_quota_df", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("show quota usage for statfs (df)"),
+
+    Option("client_oc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("enable object caching"),
+
+    Option("client_oc_size", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(200_M)
+    .set_description("maximum size of object cache"),
+
+    Option("client_oc_max_dirty", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(100_M)
+    .set_description("maximum size of dirty pages in object cache"),
+
+    Option("client_oc_target_dirty", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(8_M)
+    .set_description("target size of dirty pages object cache"),
+
+    Option("client_oc_max_dirty_age", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(5.0)
+    .set_description("maximum age of dirty pages in object cache (seconds)"),
+
+    Option("client_oc_max_objects", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+    .set_default(1000)
+    .set_description("maximum number of objects in cache"),
+
+    Option("client_debug_getattr_caps", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("client_debug_force_sync_read", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("client_debug_inject_tick_delay", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description(""),
+
+    Option("client_max_inline_size", Option::TYPE_SIZE, Option::LEVEL_DEV)
+    .set_default(4_K)
+    .set_description(""),
+
+    Option("client_inject_release_failure", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("client_inject_fixed_oldest_tid", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("client_metadata", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("metadata key=value comma-delimited pairs appended to session metadata"),
+
+    Option("client_acl_type", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+    .set_description("ACL type to enforce (none or \"posix_acl\")"),
+
+    Option("client_permissions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("client-enforced permission checking"),
+
+    Option("client_dirsize_rbytes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("set the directory size as the number of file bytes recursively used")
+    .set_long_description("This option enables a CephFS feature that stores the recursive directory size (the bytes used by files in the directory and its descendents) in the st_size field of the stat structure."),
+
+    Option("client_force_lazyio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description(""),
+
+    // NOTE(review): misplaced note -- "the max amount of "in flight" dirty data is roughly (max - target)" presumably refers to client_oc_max_dirty / client_oc_target_dirty above, not to the fuse option below; confirm
+    Option("fuse_use_invalidate_cb", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("use fuse 2.8+ invalidate callback to keep page cache consistent"),
+
+    Option("fuse_disable_pagecache", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("disable page caching in the kernel for this FUSE mount"),
+
+    Option("fuse_allow_other", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("pass allow_other to FUSE on mount"),
+
+    Option("fuse_default_permissions", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_description("pass default_permisions to FUSE on mount")  // NOTE(review): "default_permisions" is misspelled in this runtime string (the option name spells it default_permissions)
+    .set_flag(Option::FLAG_STARTUP),  // FLAG_STARTUP: option can only take effect at startup (see Option::flag_t)
+
+    Option("fuse_big_writes", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("big_writes is deprecated in libfuse 3.0.0"),
+
+    Option("fuse_max_write", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
+    .set_default(0)
+    .set_description("set the maximum number of bytes in a single write operation")
+    .set_long_description("Set the maximum number of bytes in a single write operation that may pass atomically through FUSE. The FUSE default is 128kB and may be indicated by setting this option to 0."),
+
+    Option("fuse_atomic_o_trunc", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("pass atomic_o_trunc flag to FUSE on mount"),
+
+    Option("fuse_debug", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(false)
+    .set_flag(Option::FLAG_STARTUP)  // flags accumulate: set_flag() ORs each one into Option::flags
+    .set_flag(Option::FLAG_NO_MON_UPDATE)
+    .set_description("enable debugging for the libfuse"),
+
+    Option("fuse_multithreaded", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("allow parallel processing through FUSE library"),
+
+    Option("fuse_require_active_mds", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("require active MDSs in the file system when mounting"),
+
+    Option("fuse_syncfs_on_mksnap", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("synchronize all local metadata/file changes after snapshot"),
+
+    Option("fuse_set_user_groups", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("check for ceph-fuse to consider supplementary groups for permissions"),
+
+    Option("client_try_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("client_die_on_failed_remount", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("client_die_on_failed_dentry_invalidate", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("kill the client when no dentry invalidation options are available")
+    .set_long_description("The CephFS client requires a mechanism to invalidate dentries in the caller (e.g. the kernel for ceph-fuse) when capabilities must be recalled. If the client cannot do this then the MDS cache cannot shrink which can cause the MDS to fail."),
+
+    Option("client_check_pool_perm", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+    .set_default(true)
+    .set_description("confirm access to inode's data pool/namespace described in file layout"),
+
+    Option("client_use_faked_inos", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description(""),
+
+    Option("client_mds_namespace", Option::TYPE_STR, Option::LEVEL_ADVANCED)
+    .set_default("")
+
+    .set_description("CephFS file system name to mount")
+    .set_long_description("Use this with ceph-fuse, or with any process "
+        "that uses libcephfs.  Programs using libcephfs may also pass "
+        "the filesystem name into mount(), which will override this setting. "
+        "If no filesystem name is given in mount() or this setting, the default "
+        "filesystem will be mounted (usually the first created)."),
+
+    Option("fake_statfs_for_testing", Option::TYPE_INT, Option::LEVEL_DEV)
+    .set_default(0)
+    .set_description("Set a value for kb and compute kb_used from total of num_bytes"),
+
+    Option("debug_allow_any_pool_priority", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(false)
+    .set_description("Allow any pool priority to be set to test conversion to new range"),
+
+    Option("client_shutdown_timeout", Option::TYPE_SECS, Option::LEVEL_ADVANCED)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_default(30)  // TYPE_SECS: the integer is stored as std::chrono::seconds by Option::set_value()
+    .set_min(0)  // lower bound only; no maximum is set for this option
+    .set_description("timeout for shutting down CephFS")
+    .set_long_description("Timeout for shutting down CephFS via unmount or shutdown.")
+    .add_tag("client")
+  });
+}
+
+
+static std::vector<Option> build_options()  // assembles the complete option list: get_global_options() plus each per-service table
+{
+  std::vector<Option> result = get_global_options();  // these entries get no service tag added here
+
+  auto ingest = [&result](std::vector<Option>&& options, const char* svc) {  // tags every option in `options` with service `svc`, then appends it to result
+    for (auto &o : options) {
+      o.add_service(svc);
+      result.push_back(std::move(o));  // o is not touched again after this
+    }
+  };
+
+  ingest(get_rgw_options(), "rgw");  // order of these calls fixes the order of entries in the returned vector
+  ingest(get_rbd_options(), "rbd");
+  ingest(get_rbd_mirror_options(), "rbd-mirror");
+  ingest(get_mds_options(), "mds");
+  ingest(get_mds_client_options(), "mds_client");
+
+  return result;
+}
+
+const std::vector<Option> ceph_options = build_options();  // namespace-scope table of all options; filled by a function call, so it is initialized dynamically at program start-up
diff --git a/src/common/options.h b/src/common/options.h
new file mode 100644
index 00000000..3f1cc4c9
--- /dev/null
+++ b/src/common/options.h
@@ -0,0 +1,391 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <chrono>
+#include <string>
+#include <vector>
+#include <boost/variant.hpp>
+#include "include/str_list.h"
+#include "msg/msg_types.h"
+#include "include/uuid.h"
+
+struct Option {  // describes one config option: identity (name/type/level), default values, bounds, flags and presentation metadata
+  enum type_t {
+    TYPE_UINT = 0,
+    TYPE_INT = 1,
+    TYPE_STR = 2,
+    TYPE_FLOAT = 3,
+    TYPE_BOOL = 4,
+    TYPE_ADDR = 5,
+    TYPE_ADDRVEC = 6,
+    TYPE_UUID = 7,
+    TYPE_SIZE = 8,
+    TYPE_SECS = 9,
+  };
+
+  static const char *type_to_c_type_str(type_t t) {  // C/C++ type name for t as a string ("secs" for TYPE_SECS, "unknown" if unmatched)
+    switch (t) {
+    case TYPE_UINT: return "uint64_t";
+    case TYPE_INT: return "int64_t";
+    case TYPE_STR: return "std::string";
+    case TYPE_FLOAT: return "double";
+    case TYPE_BOOL: return "bool";
+    case TYPE_ADDR: return "entity_addr_t";
+    case TYPE_ADDRVEC: return "entity_addrvec_t";
+    case TYPE_UUID: return "uuid_d";
+    case TYPE_SIZE: return "size_t";
+    case TYPE_SECS: return "secs";
+    default: return "unknown";
+    }
+  }
+  static const char *type_to_str(type_t t) {  // short type name for t; these are the same strings str_to_type() accepts
+    switch (t) {
+    case TYPE_UINT: return "uint";
+    case TYPE_INT: return "int";
+    case TYPE_STR: return "str";
+    case TYPE_FLOAT: return "float";
+    case TYPE_BOOL: return "bool";
+    case TYPE_ADDR: return "addr";
+    case TYPE_ADDRVEC: return "addrvec";
+    case TYPE_UUID: return "uuid";
+    case TYPE_SIZE: return "size";
+    case TYPE_SECS: return "secs";
+    default: return "unknown";
+    }
+  }
+  static int str_to_type(const std::string& s) {  // maps a short type name to its type_t value; returns -1 if s is not recognized
+    if (s == "uint") {
+      return TYPE_UINT;
+    }
+    if (s == "int") {
+      return TYPE_INT;
+    }
+    if (s == "str") {
+      return TYPE_STR;
+    }
+    if (s == "float") {
+      return TYPE_FLOAT;
+    }
+    if (s == "bool") {
+      return TYPE_BOOL;
+    }
+    if (s == "addr") {
+      return TYPE_ADDR;
+    }
+    if (s == "addrvec") {
+      return TYPE_ADDRVEC;
+    }
+    if (s == "uuid") {
+      return TYPE_UUID;
+    }
+    if (s == "size") {
+      return TYPE_SIZE;
+    }
+    if (s == "secs") {
+      return TYPE_SECS;
+    }
+    return -1;
+  }
+
+  /**
+   * Basic: for users, configures some externally visible functional aspect
+   * Advanced: for users, configures some internal behaviour
+   * Development: not for users.  May be dangerous, may not be documented.
+   */
+  enum level_t {
+    LEVEL_BASIC = 0,
+    LEVEL_ADVANCED = 1,
+    LEVEL_DEV = 2,
+    LEVEL_UNKNOWN = 3,
+  };
+
+  static const char *level_to_str(level_t l) {  // lower-case level name; LEVEL_UNKNOWN and any other value yield "unknown"
+    switch (l) {
+      case LEVEL_BASIC: return "basic";
+      case LEVEL_ADVANCED: return "advanced";
+      case LEVEL_DEV: return "dev";
+      default: return "unknown";
+    }
+  }
+
+  enum flag_t {
+    FLAG_RUNTIME = 0x1,         ///< option can be changed at runtime
+    FLAG_NO_MON_UPDATE = 0x2,   ///< option cannot be changed via mon config
+    FLAG_STARTUP = 0x4,         ///< option can only take effect at startup
+    FLAG_CLUSTER_CREATE = 0x8,  ///< option only has effect at cluster creation
+    FLAG_CREATE = 0x10,         ///< option only has effect at daemon creation
+    FLAG_MGR = 0x20,            ///< option is a mgr module option
+    FLAG_MINIMAL_CONF = 0x40,   ///< option should go in a minimal ceph.conf
+  };
+
+  struct size_t {  // wraps std::size_t as its own alternative in value_t; note this name hides ::size_t inside Option
+    std::size_t value;
+    operator uint64_t() const {
+      return static_cast<uint64_t>(value);
+    }
+    bool operator==(const size_t& rhs) const {
+      return value == rhs.value;
+    }
+  };
+
+  using value_t = boost::variant<
+    boost::blank,  // "no value" state; the constructor always replaces it for `value`
+    std::string,
+    uint64_t,
+    int64_t,
+    double,
+    bool,
+    entity_addr_t,
+    entity_addrvec_t,
+    std::chrono::seconds,
+    size_t,  // Option::size_t (the wrapper above), not std::size_t
+    uuid_d>;
+  const std::string name;
+  const type_t type;
+  const level_t level;
+
+  std::string desc;  // set by set_description()
+  std::string long_desc;  // set by set_long_description()
+
+  unsigned flags = 0;  // bitwise OR of flag_t values
+
+  int subsys = -1; // if >= 0, we are a subsys debug level
+
+  value_t value;  // default value; assigned by the constructor and by set_default()
+  value_t daemon_value;  // assigned only by set_daemon_default()
+
+  static std::string to_str(const value_t& v);
+
+  // Items like mon, osd, rgw, rbd, ceph-fuse.  This is advisory metadata
+  // for presentation layers (like web dashboards, or generated docs), so that
+  // they know which options to display where.
+  // Additionally: "common" for settings that exist in any Ceph code.  Do
+  // not use common for settings that are just shared some places: for those
+  // places, list them.
+  std::vector<const char*> services;  // pointers are stored, not copies of the strings
+
+  // Topics like:
+  // "service": a catchall for the boring stuff like log/asok paths.
+  // "network"
+  // "performance": a setting that may need adjustment depending on
+  //                environment/workload to get best performance.
+  std::vector<const char*> tags;
+
+  std::vector<const char*> see_also;
+
+  value_t min, max;  // assigned only by set_min() / set_min_max()
+  std::vector<const char*> enum_allowed;
+
+  /**
+   * Return nonzero and set second argument to error string if the
+   * value is invalid.
+   *
+   * These callbacks are more than just validators, as they can also
+   * modify the value as it passes through.
+   */
+  typedef std::function<int(std::string *, std::string *)> validator_fn_t;
+  validator_fn_t validator;
+
+  Option(std::string const &name, type_t t, level_t l)  // sets `value` to a zero/empty value of the declared type; aborts on an unknown type
+    : name(name), type(t), level(l)
+  {
+    // While value_t is nullable (via boost::blank), we don't ever
+    // want it set that way in an Option instance: within an instance,
+    // the type of ::value should always match the declared type.
+    switch (type) {
+    case TYPE_INT:
+      value = int64_t(0); break;
+    case TYPE_UINT:
+      value = uint64_t(0); break;
+    case TYPE_STR:
+      value = std::string(""); break;
+    case TYPE_FLOAT:
+      value = 0.0; break;
+    case TYPE_BOOL:
+      value = false; break;
+    case TYPE_ADDR:
+      value = entity_addr_t(); break;
+    case TYPE_ADDRVEC:
+      value = entity_addrvec_t(); break;
+    case TYPE_UUID:
+      value = uuid_d(); break;
+    case TYPE_SIZE:
+      value = size_t{0}; break;
+    case TYPE_SECS:
+      value = std::chrono::seconds{0}; break;
+    default:
+      ceph_abort();
+    }
+  }
+
+  void dump_value(const char *field_name, const value_t &v, Formatter *f) const;
+
+  // Validate and potentially modify incoming string value
+  int pre_validate(std::string *new_value, std::string *err) const;
+
+  // Validate properly typed value against bounds
+  int validate(const Option::value_t &new_value, std::string *err) const;
+
+  // const char * must be explicit to avoid it being treated as an int
+  Option& set_value(value_t& v, const char *new_value) {
+    v = std::string(new_value);  // stored as std::string regardless of Option::type
+    return *this;
+  }
+
+  // bool satisfies std::is_integral, but must not take the integer overload below; these aliases route it to the generic overload.
+  template<typename T>
+  using is_not_integer = std::enable_if<!std::is_integral<T>::value ||
+					std::is_same<T, bool>::value, int>;
+  template<typename T>
+  using is_integer = std::enable_if<std::is_integral<T>::value &&
+				    !std::is_same<T, bool>::value, int>;
+  template<typename T, typename is_not_integer<T>::type = 0>
+  Option& set_value(value_t& v, const T& new_value) {  // non-integer types and bool: assigned to the variant as-is
+    v = new_value;
+    return *this;
+  }
+
+  // For potentially ambiguous types, inspect Option::type and
+  // do some casting.  This is necessary to make sure that setting
+  // a float option to "0" actually sets the double part of variant.
+  template<typename T, typename is_integer<T>::type = 0>
+  Option& set_value(value_t& v, T new_value) {
+    switch (type) {
+    case TYPE_INT:
+      v = int64_t(new_value); break;
+    case TYPE_UINT:
+      v = uint64_t(new_value); break;
+    case TYPE_FLOAT:
+      v = double(new_value); break;
+    case TYPE_BOOL:
+      v = bool(new_value); break;
+    case TYPE_SIZE:
+      v = size_t{static_cast<std::size_t>(new_value)}; break;
+    case TYPE_SECS:
+      v = std::chrono::seconds{new_value}; break;
+    default:  // an integer was supplied for a non-numeric option type: report it and abort
+      std::cerr << "Bad type in set_value: " << name << ": "
+                << typeid(T).name() << std::endl;
+      ceph_abort();
+    }
+    return *this;
+  }
+
+  /// parse and validate a string input
+  int parse_value(
+    const std::string& raw_val,
+    value_t *out,
+    std::string *error_message,
+    std::string *normalized_value=nullptr) const;
+
+  template<typename T>
+  Option& set_default(const T& v) {  // assigns `value` through the set_value() overloads
+    return set_value(value, v);
+  }
+
+  template<typename T>
+  Option& set_daemon_default(const T& v) {  // assigns `daemon_value` through the set_value() overloads
+    return set_value(daemon_value, v);
+  }
+  Option& add_tag(const char* tag) {  // the add_* setters append and return *this so calls can be chained
+    tags.push_back(tag);
+    return *this;
+  }
+  Option& add_tag(const std::initializer_list<const char*>& ts) {
+    tags.insert(tags.end(), ts);
+    return *this;
+  }
+  Option& add_service(const char* service) {
+    services.push_back(service);
+    return *this;
+  }
+  Option& add_service(const std::initializer_list<const char*>& ss) {
+    services.insert(services.end(), ss);
+    return *this;
+  }
+  Option& add_see_also(const char* t) {
+    see_also.push_back(t);
+    return *this;
+  }
+  Option& add_see_also(const std::initializer_list<const char*>& ts) {
+    see_also.insert(see_also.end(), ts);
+    return *this;
+  }
+  Option& set_description(const char* new_desc) {  // replaces any previous description
+    desc = new_desc;
+    return *this;
+  }
+  Option& set_long_description(const char* new_desc) {
+    long_desc = new_desc;
+    return *this;
+  }
+
+  template<typename T>
+  Option& set_min(const T& mi) {  // assigns only `min`; `max` is left as it was
+    set_value(min, mi);
+    return *this;
+  }
+
+  template<typename T>
+  Option& set_min_max(const T& mi, const T& ma) {
+    set_value(min, mi);
+    set_value(max, ma);
+    return *this;
+  }
+
+  Option& set_enum_allowed(const std::vector<const char*>& allowed)  // replaces the whole list (copy of `allowed`)
+  {
+    enum_allowed = allowed;
+    return *this;
+  }
+
+  Option &set_flag(flag_t f) {  // ORs f into flags; existing flags are kept
+    flags |= f;
+    return *this;
+  }
+  Option &set_flags(flag_t f) {  // same body as set_flag(): ORs f in, does not replace flags
+    flags |= f;
+    return *this;
+  }
+
+  Option &set_validator(const validator_fn_t  &validator_)
+  {
+    validator = validator_;
+    return *this;
+  }
+
+  Option &set_subsys(int s) {
+    subsys = s;
+    return *this;
+  }
+
+  void dump(Formatter *f) const;
+  void print(ostream *out) const;
+
+  bool has_flag(flag_t f) const {
+    return flags & f;  // non-zero result converts to true
+  }
+
+  /**
+   * A crude indicator of whether the value may be
+   * modified safely at runtime -- should be replaced
+   * with proper locking!
+   */
+  bool can_update_at_runtime() const
+  {
+    return
+      (has_flag(FLAG_RUNTIME)  // either explicitly marked runtime-changeable ...
+       || (!has_flag(FLAG_MGR)  // ... or a non-mgr option of one of the scalar types below
+	   && (type == TYPE_BOOL || type == TYPE_INT
+	       || type == TYPE_UINT || type == TYPE_FLOAT
+	       || type == TYPE_SIZE || type == TYPE_SECS)))
+      && !has_flag(FLAG_STARTUP)  // and, in both cases, not restricted to startup or creation time
+      && !has_flag(FLAG_CLUSTER_CREATE)
+      && !has_flag(FLAG_CREATE);
+  }
+};
+
+extern const std::vector<Option> ceph_options;  // declaration only; the table is defined elsewhere as the result of build_options()
+
diff --git a/src/common/page.cc b/src/common/page.cc
new file mode 100644
index 00000000..7a252a60
--- /dev/null
+++ b/src/common/page.cc
@@ -0,0 +1,19 @@
+#include <unistd.h>
+
+namespace ceph {
+
+  // page size helpers, see page.h
+  int _get_bits_of(int v) {  // counts right-shifts until v becomes 0, i.e. the index of the highest set bit plus one (0 for v == 0)
+    int n = 0;
+    while (v) {  // NOTE(review): presumably only called with v >= 0 (here: page size - 1); a negative v may never reach 0 -- confirm
+      n++;
+      v = v >> 1;
+    }
+    return n;
+  }
+
+  unsigned _page_size = sysconf(_SC_PAGESIZE);  // queried once when this variable is initialized
+  unsigned long _page_mask = ~(unsigned long)(_page_size - 1);  // all bits set except the low (page size - 1) bits; assumes a power-of-two page size
+  unsigned _page_shift = _get_bits_of(_page_size - 1);  // equals log2(page size) when the page size is a power of two
+
+}
diff --git a/src/common/perf_counters.cc b/src/common/perf_counters.cc
new file mode 100644
index 00000000..36105b0e
--- /dev/null
+++ b/src/common/perf_counters.cc
@@ -0,0 +1,585 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/perf_counters.h"
+#include "common/dout.h"
+#include "common/valgrind.h"
+
+using std::ostringstream;
+
+// m_loggers and by_path start out empty; there is nothing else to set up.
+PerfCountersCollectionImpl::PerfCountersCollectionImpl() {}
+
+// Tear down through clear(), which deletes every logger still registered.
+PerfCountersCollectionImpl::~PerfCountersCollectionImpl()
+{
+  this->clear();
+}
+
+// Register l with the collection: make its name unique among the loggers
+// already present, insert it, and index each of its counters by path.
+void PerfCountersCollectionImpl::add(PerfCounters *l)
+{
+  // m_loggers is ordered by name, so find(l) matches any logger that
+  // already carries l's name.  Keep appending l's address to the name
+  // until nothing matches.
+  while (m_loggers.find(l) != m_loggers.end()) {
+    ostringstream unique_name;
+    unique_name << l->get_name() << "-" << (void*)l;
+    l->set_name(unique_name.str());
+  }
+
+  m_loggers.insert(l);
+
+  // Every counter becomes reachable as "<logger name>.<counter name>".
+  const std::string prefix = l->get_name() + ".";
+  for (PerfCounters::perf_counter_data_any_d &data : l->m_data) {
+    std::string path = prefix;
+    path += data.name;
+
+    by_path[path] = {&data, l};
+  }
+}
+
+// Drop l's counters from the path index and then l itself from the set.
+// l must be registered (asserted); the object is not deleted here.
+void PerfCountersCollectionImpl::remove(PerfCounters *l)
+{
+  const std::string prefix = l->get_name() + ".";
+  for (const PerfCounters::perf_counter_data_any_d &data : l->m_data) {
+    std::string path = prefix;
+    path += data.name;
+    by_path.erase(path);
+  }
+
+  // Look the logger up by name and insist that it was actually registered.
+  perf_counters_set_t::iterator i = m_loggers.find(l);
+  ceph_assert(i != m_loggers.end());
+  m_loggers.erase(i);
+}
+
+// Delete every registered PerfCounters object and empty both containers.
+void PerfCountersCollectionImpl::clear()
+{
+  perf_counters_set_t::iterator it = m_loggers.begin();
+  while (it != m_loggers.end()) {
+    delete *it;
+    it = m_loggers.erase(it);  // erase() returns the following element
+  }
+
+  by_path.clear();
+}
+
+// Reset counters by logger name.
+//
+// "all" resets every registered logger and always reports success; any
+// other name resets the first logger carrying it and reports whether one
+// was found.
+bool PerfCountersCollectionImpl::reset(const std::string &name)
+{
+  if (strcmp(name.c_str(), "all") == 0) {
+    for (PerfCounters *logger : m_loggers) {
+      logger->reset();
+    }
+    return true;
+  }
+
+  for (PerfCounters *logger : m_loggers) {
+    if (name.compare(logger->get_name()) != 0) {
+      continue;
+    }
+    logger->reset();
+    return true;
+  }
+
+  // No logger with that name is registered.
+  return false;
+}
+
+
+/**
+ * Emit the performance counters (or their schema) through a Formatter,
+ * optionally restricted to one PerfCounters object and/or one counter
+ * name.  Everything is wrapped in a "perfcounter_collection" section.
+ *
+ * @param f formatter receiving the output
+ * @param schema if true, output schema instead of current data.
+ * @param histograms if true, dump histogram values,
+ *                   if false dump all non-histogram counters
+ * @param logger name of subsystem logger, e.g. "mds_cache", may be empty
+ * @param counter name of counter within subsystem, e.g. "num_strays",
+ *                may be empty.
+ */
+void PerfCountersCollectionImpl::dump_formatted_generic(
+    Formatter *f,
+    bool schema,
+    bool histograms,
+    const std::string &logger,
+    const std::string &counter) const
+{
+  f->open_object_section("perfcounter_collection");
+  for (const PerfCounters *pc : m_loggers) {
+    // An empty logger filter selects everything; the counter filter is
+    // applied further down by the PerfCounters object itself.
+    if (!logger.empty() && pc->get_name() != logger)
+      continue;
+    pc->dump_formatted_generic(f, schema, histograms, counter);
+  }
+  f->close_section();
+}
+
+void PerfCountersCollectionImpl::with_counters(
+    std::function<void(const CounterMap &)> fn) const
+{
+  fn(this->by_path);  // fn gets a read-only view of the path index
+}
+
+// ---------------------------
+
+PerfCounters::~PerfCounters()
+{  // nothing to release by hand; the members clean up after themselves
+}
+
+// Add amt to the U64 counter at idx; averaged counters also count a sample.
+void PerfCounters::inc(int idx, uint64_t amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& slot = m_data[idx - m_lower_bound - 1];
+  if ((slot.type & PERFCOUNTER_U64) == 0)
+    return;
+  // avgcount is bumped before the sum and avgcount2 after it, which is
+  // what read_avg() relies on to notice an update in progress.
+  const bool averaged = (slot.type & PERFCOUNTER_LONGRUNAVG) != 0;
+  if (averaged) slot.avgcount++;
+  slot.u64 += amt;
+  if (averaged) slot.avgcount2++;
+}
+
+// Subtract amt from the U64 counter at idx; averaged counters are rejected.
+void PerfCounters::dec(int idx, uint64_t amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& data = m_data[idx - m_lower_bound - 1];
+  ceph_assert(!(data.type & PERFCOUNTER_LONGRUNAVG));
+  if (data.type & PERFCOUNTER_U64)
+    data.u64 -= amt;
+}
+
+// Overwrite the U64 value at idx with amt; for an averaged counter the
+// call also counts as one sample.
+void PerfCounters::set(int idx, uint64_t amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& slot = m_data[idx - m_lower_bound - 1];
+  if ((slot.type & PERFCOUNTER_U64) == 0)
+    return;
+
+  ANNOTATE_BENIGN_RACE_SIZED(&slot.u64, sizeof(slot.u64),
+                             "perf counter atomic");
+  // avgcount goes up before the store and avgcount2 after it.
+  const bool averaged = (slot.type & PERFCOUNTER_LONGRUNAVG) != 0;
+  if (averaged) slot.avgcount++;
+  slot.u64 = amt;
+  if (averaged) slot.avgcount2++;
+}
+
+// Read the U64 value at idx.  Yields 0 for non-U64 counters and, in
+// non-Seastar builds, whenever m_cct->_conf->perf is false.
+uint64_t PerfCounters::get(int idx) const
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return 0;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  const perf_counter_data_any_d& slot = m_data[idx - m_lower_bound - 1];
+  return (slot.type & PERFCOUNTER_U64) ? slot.u64.load() : 0;
+}
+
+// Add amt, converted with to_nsec(), to the TIME counter at idx.
+void PerfCounters::tinc(int idx, utime_t amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& slot = m_data[idx - m_lower_bound - 1];
+  if ((slot.type & PERFCOUNTER_TIME) == 0)
+    return;
+
+  // Averaged counters bracket the sum update with the two sample counts.
+  const bool averaged = (slot.type & PERFCOUNTER_LONGRUNAVG) != 0;
+  if (averaged) slot.avgcount++;
+  slot.u64 += amt.to_nsec();
+  if (averaged) slot.avgcount2++;
+}
+
+// Add amt.count() ticks to the TIME counter at idx.
+void PerfCounters::tinc(int idx, ceph::timespan amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& slot = m_data[idx - m_lower_bound - 1];
+  if ((slot.type & PERFCOUNTER_TIME) == 0)
+    return;
+
+  // Averaged counters bracket the sum update with the two sample counts.
+  const bool averaged = (slot.type & PERFCOUNTER_LONGRUNAVG) != 0;
+  if (averaged) slot.avgcount++;
+  slot.u64 += amt.count();
+  if (averaged) slot.avgcount2++;
+}
+
+// Store amt as nanoseconds in the TIME counter at idx; aborts for averages.
+void PerfCounters::tset(int idx, utime_t amt)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  perf_counter_data_any_d& slot = m_data[idx - m_lower_bound - 1];
+  if ((slot.type & PERFCOUNTER_TIME) == 0)
+    return;
+  slot.u64 = amt.to_nsec();
+  if ((slot.type & PERFCOUNTER_LONGRUNAVG) != 0)
+    ceph_abort();
+}
+
+// Read the TIME counter at idx as a utime_t (default utime_t if not TIME).
+utime_t PerfCounters::tget(int idx) const
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return utime_t();
+#endif
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  const perf_counter_data_any_d& slot = m_data[idx - m_lower_bound - 1];
+  if ((slot.type & PERFCOUNTER_TIME) == 0)
+    return utime_t();
+  const uint64_t ns = slot.u64;
+  return utime_t(ns / 1000000000ull, ns % 1000000000ull);
+}
+
+// Record one (x, y) sample in the histogram counter at idx.
+void PerfCounters::hinc(int idx, int64_t x, int64_t y)
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return;
+#endif
+
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  // Only U64 counter histograms are accepted, and one must be attached.
+  perf_counter_data_any_d& data = m_data[idx - m_lower_bound - 1];
+  ceph_assert(data.type == (PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER | PERFCOUNTER_U64));
+  ceph_assert(data.histogram);
+  (*data.histogram).inc(x, y);
+}
+
+// Return <sample count, sum> for an averaged TIME counter, and <0, 0> for
+// any other kind of counter.
+pair<uint64_t, uint64_t> PerfCounters::get_tavg_ns(int idx) const
+{
+#ifndef WITH_SEASTAR
+  if (!m_cct->_conf->perf)
+    return make_pair(0, 0);
+#endif
+  ceph_assert(idx > m_lower_bound);
+  ceph_assert(idx < m_upper_bound);
+  const perf_counter_data_any_d& slot = m_data[idx - m_lower_bound - 1];
+  const int wanted = PERFCOUNTER_TIME | PERFCOUNTER_LONGRUNAVG;
+  if ((slot.type & wanted) != wanted)
+    return make_pair(0, 0);
+  const pair<uint64_t,uint64_t> sum_count = slot.read_avg();
+  return make_pair(sum_count.second, sum_count.first);
+}
+
+// Reset every counter slot owned by this PerfCounters object.
+//
+// Each slot decides for itself what to clear; see
+// perf_counter_data_any_d::reset().
+void PerfCounters::reset()
+{
+  for (perf_counter_data_any_d &slot : m_data) {
+    slot.reset();
+  }
+}
+
+// Dump this object's counters -- or, with schema set, their descriptions --
+// as a single section named after the object.  `histograms` chooses
+// between histogram counters and all the others; a non-empty `counter`
+// limits the output to the counter with exactly that name.
+void PerfCounters::dump_formatted_generic(Formatter *f, bool schema,
+    bool histograms, const std::string &counter) const
+{
+  f->open_object_section(m_name.c_str());
+
+  for (perf_counter_data_vec_t::const_iterator d = m_data.begin();
+       d != m_data.end(); ++d) {
+    if (!counter.empty() && counter != d->name) {
+      // Optionally filter on counter name
+      continue;
+    }
+
+    // Switch between normal and histogram view
+    bool is_histogram = (d->type & PERFCOUNTER_HISTOGRAM) != 0;
+    if (is_histogram != histograms) {
+      continue;
+    }
+
+    if (schema) {
+      f->open_object_section(d->name);
+      // we probably should not have exposed this raw field (with bit
+      // values), but existing plugins rely on it so we're stuck with
+      // it.
+      f->dump_int("type", d->type);
+
+      if (d->type & PERFCOUNTER_COUNTER) {
+	f->dump_string("metric_type", "counter");
+      } else {
+	f->dump_string("metric_type", "gauge");
+      }
+
+      if (d->type & PERFCOUNTER_LONGRUNAVG) {
+	if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_string("value_type", "real-integer-pair");
+	} else {
+	  f->dump_string("value_type", "integer-integer-pair");
+	}
+      } else if (d->type & PERFCOUNTER_HISTOGRAM) {
+	if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_string("value_type", "real-2d-histogram");
+	} else {
+	  f->dump_string("value_type", "integer-2d-histogram");
+	}
+      } else {
+	if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_string("value_type", "real");
+	} else {
+	  f->dump_string("value_type", "integer");
+	}
+      }
+
+      f->dump_string("description", d->description ? d->description : "");
+      f->dump_string("nick", d->nick != NULL ? d->nick : "");
+      f->dump_int("priority", get_adjusted_priority(d->prio));
+
+      if (d->unit == UNIT_NONE) {
+	f->dump_string("units", "none");
+      } else if (d->unit == UNIT_BYTES) {
+	f->dump_string("units", "bytes");
+      }
+      f->close_section();
+    } else {
+      // Averages become a section holding the sample count and the sum;
+      // TIME averages additionally carry the mean time per sample.
+      if (d->type & PERFCOUNTER_LONGRUNAVG) {
+	f->open_object_section(d->name);
+	pair<uint64_t,uint64_t> a = d->read_avg();
+	if (d->type & PERFCOUNTER_U64) {
+	  f->dump_unsigned("avgcount", a.second);
+	  f->dump_unsigned("sum", a.first);
+	} else if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_unsigned("avgcount", a.second);
+	  f->dump_format_unquoted("sum", "%" PRId64 ".%09" PRId64,
+				  a.first / 1000000000ull,
+				  a.first % 1000000000ull);
+          uint64_t count = a.second;
+          uint64_t sum_ns = a.first;
+          // Mean time per sample, reported as zero while there are no samples.
+          const uint64_t avg_ns = count ? sum_ns / count : 0;
+          f->dump_format_unquoted("avgtime", "%" PRId64 ".%09" PRId64,
+                                  avg_ns / 1000000000ull,
+                                  avg_ns % 1000000000ull);
+	} else {
+	  ceph_abort();
+	}
+	f->close_section();
+      } else if (d->type & PERFCOUNTER_HISTOGRAM) {
+        ceph_assert(d->type == (PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER | PERFCOUNTER_U64));
+        ceph_assert(d->histogram);
+        f->open_object_section(d->name);
+        d->histogram->dump_formatted(f);
+        f->close_section();
+      } else {
+	// Plain value: integers as they are, times as seconds.nanoseconds.
+	uint64_t v = d->u64;
+	if (d->type & PERFCOUNTER_U64) {
+	  f->dump_unsigned(d->name, v);
+	} else if (d->type & PERFCOUNTER_TIME) {
+	  f->dump_format_unquoted(d->name, "%" PRId64 ".%09" PRId64,
+				  v / 1000000000ull,
+				  v % 1000000000ull);
+	} else {
+	  ceph_abort();
+	}
+      }
+    }
+  }
+  f->close_section();
+}
+
+const std::string &PerfCounters::get_name() const
+{
+  return this->m_name;  // may have been changed through set_name()
+}
+
+// The valid counter indices are the integers strictly between lower_bound
+// and upper_bound, so upper_bound - lower_bound - 1 slots are allocated.
+PerfCounters::PerfCounters(CephContext *cct, const std::string &name,
+	   int lower_bound, int upper_bound)
+  : m_cct(cct), m_lower_bound(lower_bound), m_upper_bound(upper_bound),
+    m_name(name)
+#ifndef WITH_SEASTAR
+    ,
+    m_lock_name(std::string("PerfCounters::") + name.c_str()),
+    m_lock(ceph::make_mutex(m_lock_name))
+#endif
+{
+  m_data.resize(m_upper_bound - m_lower_bound - 1);
+}
+
+// Allocate the PerfCounters being built; first/last become its bounds.
+PerfCountersBuilder::PerfCountersBuilder(CephContext *cct, const std::string &name,
+                  int first, int last)
+  : m_perf_counters(new PerfCounters(cct, name, first, last))
+{}
+
+// Free the PerfCounters unless create_perf_counters() already handed it
+// out (in which case the pointer is null by now).
+PerfCountersBuilder::~PerfCountersBuilder()
+{
+  delete m_perf_counters;  // deleting a null pointer is a no-op
+  m_perf_counters = NULL;
+}
+
+// Declare an integer counter: type bits U64 | COUNTER.
+void PerfCountersBuilder::add_u64_counter(int idx, const char *name,
+  const char *description, const char *nick, int prio, int unit)
+{
+  const int ty = PERFCOUNTER_U64 | PERFCOUNTER_COUNTER;
+  add_impl(idx, name, description, nick, prio, ty, unit);
+}
+
+void PerfCountersBuilder::add_u64(int idx, const char *name,
+  const char *description, const char *nick, int prio, int unit)
+{
+  const int ty = PERFCOUNTER_U64;  // plain integer value, no other bits
+  add_impl(idx, name, description, nick, prio, ty, unit);
+}
+
+// Declare an averaged integer: type bits U64 | LONGRUNAVG.
+void PerfCountersBuilder::add_u64_avg(int idx, const char *name,
+  const char *description, const char *nick, int prio, int unit)
+{
+  const int ty = PERFCOUNTER_U64 | PERFCOUNTER_LONGRUNAVG;
+  add_impl(idx, name, description, nick, prio, ty, unit);
+}
+
+void PerfCountersBuilder::add_time(int idx, const char *name,
+  const char *description, const char *nick, int prio)
+{
+  const int ty = PERFCOUNTER_TIME;  // plain time value; unit stays default
+  add_impl(idx, name, description, nick, prio, ty);
+}
+
+// Declare an averaged time: type bits TIME | LONGRUNAVG, default unit.
+void PerfCountersBuilder::add_time_avg(int idx, const char *name,
+  const char *description, const char *nick, int prio)
+{
+  const int ty = PERFCOUNTER_TIME | PERFCOUNTER_LONGRUNAVG;
+  add_impl(idx, name, description, nick, prio, ty);
+}
+
+void PerfCountersBuilder::add_u64_counter_histogram(int idx, const char *name,
+  PerfHistogramCommon::axis_config_d x_axis_config,
+  PerfHistogramCommon::axis_config_d y_axis_config,
+  const char *description, const char *nick, int prio, int unit)
+{
+  // The histogram is built here from the two axis configs and then moved
+  // into the counter slot by add_impl().
+  unique_ptr<PerfHistogram<>> hist{new PerfHistogram<>{x_axis_config, y_axis_config}};
+  const int ty = PERFCOUNTER_U64 | PERFCOUNTER_HISTOGRAM | PERFCOUNTER_COUNTER;
+  add_impl(idx, name, description, nick, prio, ty, unit, std::move(hist));
+}
+
+// Fill the still-unused slot for idx; prio 0 falls back to prio_default.
+void PerfCountersBuilder::add_impl(
+  int idx, const char *name,
+  const char *description, const char *nick, int prio, int ty, int unit,
+  unique_ptr<PerfHistogram<>> histogram)
+{
+  ceph_assert(idx > m_perf_counters->m_lower_bound);
+  ceph_assert(idx < m_perf_counters->m_upper_bound);
+  PerfCounters::perf_counter_data_any_d &data =
+    m_perf_counters->m_data[idx - m_perf_counters->m_lower_bound - 1];
+  ceph_assert(data.type == PERFCOUNTER_NONE);
+  // nick must be <= 4 chars
+  if (nick) {
+    ceph_assert(strlen(nick) <= 4);
+  }
+  data.name = name;
+  data.description = description;
+  data.nick = nick;
+  data.prio = (prio != 0) ? prio : prio_default;
+  data.type = static_cast<perfcounter_type_d>(ty);
+  data.unit = static_cast<unit_t>(unit);
+  data.histogram = std::move(histogram);
+}
+
+// Check that every slot was declared, then hand the PerfCounters to the
+// caller; the builder forgets the pointer so its destructor won't free it.
+PerfCounters *PerfCountersBuilder::create_perf_counters()
+{
+  for (auto d = m_perf_counters->m_data.cbegin();
+       d != m_perf_counters->m_data.cend(); ++d) {
+    ceph_assert(d->type != PERFCOUNTER_NONE);
+    ceph_assert(d->type & (PERFCOUNTER_U64 | PERFCOUNTER_TIME));
+  }
+  PerfCounters *finished = m_perf_counters;
+  m_perf_counters = NULL;
+  return finished;
+}
+
diff --git a/src/common/perf_counters.h b/src/common/perf_counters.h
new file mode 100644
index 00000000..7087040f
--- /dev/null
+++ b/src/common/perf_counters.h
@@ -0,0 +1,378 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+
+#ifndef CEPH_COMMON_PERF_COUNTERS_H
+#define CEPH_COMMON_PERF_COUNTERS_H
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <atomic>
+#include <cstdint>
+
+#include "common/perf_histogram.h"
+#include "include/utime.h"
+#include "common/ceph_mutex.h"
+#include "common/ceph_time.h"
+
+class CephContext;
+class PerfCountersBuilder;
+class PerfCounters;
+
+enum perfcounter_type_d : uint8_t
+{
+  PERFCOUNTER_NONE = 0,         // slot has not been declared yet
+  PERFCOUNTER_TIME = 0x1,       // time: kept as integer nanoseconds, dumped as seconds
+  PERFCOUNTER_U64 = 0x2,        // integer (note: either TIME or U64 *must* be set)
+  PERFCOUNTER_LONGRUNAVG = 0x4, // running average: a sum paired with a sample count
+  PERFCOUNTER_COUNTER = 0x8,    // counter (vs gauge)
+  PERFCOUNTER_HISTOGRAM = 0x10, // histogram (vector) of values
+};
+
+enum unit_t : uint8_t
+{
+  UNIT_BYTES,  // reported in the schema dump as "bytes"
+  UNIT_NONE    // reported in the schema dump as "none"; the default unit
+};
+
+/* Class for constructing a PerfCounters object.
+ *
+ * Declare every counter with the add_*() methods, then call
+ * create_perf_counters(), which asserts that no slot was left undeclared.
+ *
+ * In the future, we will probably get rid of the first/last arguments, since
+ * PerfCountersBuilder can deduce them itself.  The builder is not copyable.
+ */
+class PerfCountersBuilder
+{
+public:
+  PerfCountersBuilder(CephContext *cct, const std::string &name,
+		    int first, int last);
+  ~PerfCountersBuilder();
+
+  // prio values: higher is better, and higher values get included in
+  // 'ceph daemonperf' (and similar) results.
+  // Use of priorities enables us to add large numbers of counters
+  // internally without necessarily overwhelming consumers.
+  enum {
+    PRIO_CRITICAL = 10,
+    // 'interesting' is the default threshold for `daemonperf` output
+    PRIO_INTERESTING = 8,
+    // `useful` is the default threshold for transmission to ceph-mgr
+    // and inclusion in prometheus/influxdb plugin output
+    PRIO_USEFUL = 5,
+    PRIO_UNINTERESTING = 2,
+    PRIO_DEBUGONLY = 0,
+  };
+  void add_u64(int key, const char *name,
+	       const char *description=NULL, const char *nick = NULL,
+	       int prio=0, int unit=UNIT_NONE);
+  void add_u64_counter(int key, const char *name,
+		       const char *description=NULL,
+		       const char *nick = NULL,
+		       int prio=0, int unit=UNIT_NONE);
+  void add_u64_avg(int key, const char *name,
+		   const char *description=NULL,
+		   const char *nick = NULL,
+		   int prio=0, int unit=UNIT_NONE);
+  void add_time(int key, const char *name,
+		const char *description=NULL,
+		const char *nick = NULL,
+		int prio=0);
+  void add_time_avg(int key, const char *name,
+		    const char *description=NULL,
+		    const char *nick = NULL,
+		    int prio=0);
+  void add_u64_counter_histogram(
+    int key, const char* name,
+    PerfHistogramCommon::axis_config_d x_axis_config,
+    PerfHistogramCommon::axis_config_d y_axis_config,
+    const char *description=NULL,
+    const char* nick = NULL,
+    int prio=0, int unit=UNIT_NONE);
+
+  void set_prio_default(int prio_)  // used by add_*() whenever prio is 0
+  {
+    prio_default = prio_;
+  }
+
+  PerfCounters* create_perf_counters();
+private:
+  PerfCountersBuilder(const PerfCountersBuilder &rhs) = delete;
+  PerfCountersBuilder& operator=(const PerfCountersBuilder &rhs) = delete;
+  void add_impl(int idx, const char *name,
+                const char *description, const char *nick, int prio, int ty, int unit=UNIT_NONE,
+                std::unique_ptr<PerfHistogram<>> histogram = nullptr);
+
+  PerfCounters *m_perf_counters;  // owned until create_perf_counters() returns it
+
+  int prio_default = 0;
+};
+
+/*
+ * A PerfCounters object is usually associated with a single subsystem.
+ * It contains counters which we modify to track performance and throughput
+ * over time. 
+ *
+ * PerfCounters can track several different types of values:
+ * 1) integer values & counters
+ * 2) floating-point values & counters
+ * 3) floating-point averages
+ * 4) 2D histograms of quantized value pairs
+ *
+ * The difference between values, counters and histograms is in how they are initialized
+ * and accessed. For a counter, use the inc(counter, amount) function (note
+ * that amount defaults to 1 if you don't set it). For a value, use the
+ * set(index, value) function. For histogram use the hinc(index, value1, value2) function.
+ * (For time, use the tinc and tset variants.)
+ *
+ * If for some reason you would like to reset your counters, you can do so using
+ * the set functions even if they are counters, and you can also
+ * increment your values if for some reason you wish to.
+ *
+ * For the time average, it returns the current value and
+ * the "avgcount" member when read off. avgcount is incremented when you call
+ * tinc. Calling tset on an average is an error and will assert out.
+ */
+class PerfCounters
+{
+public:
+  /** Represents a PerfCounters data element. */
+  struct perf_counter_data_any_d {
+    perf_counter_data_any_d()
+      : name(NULL),
+        description(NULL),
+        nick(NULL),
+	 type(PERFCOUNTER_NONE),
+	 unit(UNIT_NONE)
+    {}
+    // Copy every field, including prio.  The sum and sample count are taken
+    // from one consistent read_avg() snapshot of the source.
+    perf_counter_data_any_d(const perf_counter_data_any_d& other)
+      : name(other.name), description(other.description), nick(other.nick),
+	prio(other.prio), type(other.type), unit(other.unit),
+	u64(other.u64.load()) {
+      pair<uint64_t,uint64_t> a = other.read_avg();
+      u64 = a.first;
+      avgcount = a.second;
+      avgcount2 = a.second;
+      // The histogram is deep-copied so the two objects do not share one.
+      if (other.histogram) {
+        histogram.reset(new PerfHistogram<>(*other.histogram));
+      }
+    }
+
+    const char *name;
+    const char *description;
+    const char *nick;
+    uint8_t prio = 0;
+    enum perfcounter_type_d type;
+    enum unit_t unit;
+    std::atomic<uint64_t> u64 = { 0 };
+    std::atomic<uint64_t> avgcount = { 0 };
+    std::atomic<uint64_t> avgcount2 = { 0 };
+    std::unique_ptr<PerfHistogram<>> histogram;
+
+    // Plain U64 values (type exactly PERFCOUNTER_U64) keep their value;
+    // every other type has its sum and both sample counts zeroed.
+    void reset()
+    {
+      if (type != PERFCOUNTER_U64) {
+	u64.store(0);
+	avgcount.store(0);
+	avgcount2.store(0);
+      }
+      if (histogram) histogram->reset();
+    }
+
+    // Read <sum, count> as a consistent pair without taking a lock.
+    // Writers bump avgcount, then the sum, then avgcount2, so the loop
+    // retries until avgcount2 (read first) equals avgcount (read last).
+    pair<uint64_t,uint64_t> read_avg() const {
+      uint64_t sum, count;
+      do {
+	count = avgcount2;
+	sum = u64;
+      } while (avgcount != count);
+      return make_pair(sum, count);
+    }
+  };
+
+  // Keeps the two latest pairs fed to consume_next() and averages between them.
+  template <typename T>
+  struct avg_tracker {
+    pair<uint64_t, T> last, cur;
+    avg_tracker() : last(0, 0), cur(0, 0) {}
+    T current_avg() const {
+      if (cur.first != last.first)
+        return (cur.second - last.second) / (cur.first - last.first);
+      return 0;
+    }
+    void consume_next(const pair<uint64_t, T> &next) {
+      last = cur;
+      cur = next;
+    }
+  };
+
+  ~PerfCounters();
+
+  void inc(int idx, uint64_t v = 1);
+  void dec(int idx, uint64_t v = 1);
+  void set(int idx, uint64_t v);
+  uint64_t get(int idx) const;
+
+  void tset(int idx, utime_t v);
+  void tinc(int idx, utime_t v);
+  void tinc(int idx, ceph::timespan v);
+  utime_t tget(int idx) const;
+
+  void hinc(int idx, int64_t x, int64_t y);
+
+  void reset();
+  void dump_formatted(ceph::Formatter *f, bool schema,
+                      const std::string &counter = "") const {
+    dump_formatted_generic(f, schema, false, counter);
+  }
+  void dump_formatted_histograms(ceph::Formatter *f, bool schema,
+                                 const std::string &counter = "") const {
+    dump_formatted_generic(f, schema, true, counter);
+  }
+  pair<uint64_t, uint64_t> get_tavg_ns(int idx) const;
+
+  const std::string& get_name() const;
+  void set_name(std::string s) {
+    m_name = s;
+  }
+
+  /// adjust priority values by some value
+  void set_prio_adjust(int p) {
+    prio_adjust = p;
+  }
+
+  // Apply prio_adjust to p and clamp the result to [0, PRIO_CRITICAL].
+  int get_adjusted_priority(int p) const {
+    const int capped = std::min(p + prio_adjust, (int)PerfCountersBuilder::PRIO_CRITICAL);
+    return std::max(capped, 0);
+  }
+
+private:
+  PerfCounters(CephContext *cct, const std::string &name,
+	     int lower_bound, int upper_bound);
+  PerfCounters(const PerfCounters &rhs);
+  PerfCounters& operator=(const PerfCounters &rhs);
+  void dump_formatted_generic(ceph::Formatter *f, bool schema, bool histograms,
+                              const std::string &counter = "") const;
+
+  typedef std::vector<perf_counter_data_any_d> perf_counter_data_vec_t;
+
+  CephContext *m_cct;
+  int m_lower_bound;
+  int m_upper_bound;
+  std::string m_name;
+
+  int prio_adjust = 0;
+
+#ifndef WITH_SEASTAR
+  const std::string m_lock_name;
+  /** Protects m_data */
+  ceph::mutex m_lock;
+#endif
+
+  perf_counter_data_vec_t m_data;
+
+  friend class PerfCountersBuilder;
+  friend class PerfCountersCollectionImpl;
+};
+
+class SortPerfCountersByName {  // orders PerfCounters pointers by name
+public:
+  bool operator()(const PerfCounters* lhs, const PerfCounters* rhs) const {
+    return lhs->get_name().compare(rhs->get_name()) < 0;
+  }
+};
+
+typedef std::set <PerfCounters*, SortPerfCountersByName> perf_counters_set_t;
+
+/*
+ * PerfCountersCollectionImpl manages PerfCounters objects for a Ceph process.
+ */
+class PerfCountersCollectionImpl
+{
+public:
+  PerfCountersCollectionImpl();
+  ~PerfCountersCollectionImpl();
+  void add(PerfCounters *l);
+  void remove(PerfCounters *l);
+  void clear();
+  bool reset(const std::string &name);
+
+  void dump_formatted(ceph::Formatter *f, bool schema,
+                      const std::string &logger = "",
+                      const std::string &counter = "") const {
+    dump_formatted_generic(f, schema, false, logger, counter);
+  }
+
+  void dump_formatted_histograms(ceph::Formatter *f, bool schema,
+                                 const std::string &logger = "",
+                                 const std::string &counter = "") const {
+    dump_formatted_generic(f, schema, true, logger, counter);
+  }
+
+  // A reference to a perf_counter_data_any_d, with an accompanying
+  // pointer to the enclosing PerfCounters, in order that the consumer
+  // can see the prio_adjust
+  class PerfCounterRef
+  {
+    public:
+    PerfCounters::perf_counter_data_any_d *data;
+    PerfCounters *perf_counters;
+  };
+  typedef std::map<std::string,
+          PerfCounterRef> CounterMap;
+
+  void with_counters(std::function<void(const CounterMap &)>) const;
+
+private:
+  void dump_formatted_generic(ceph::Formatter *f, bool schema, bool histograms,
+                              const std::string &logger = "",
+                              const std::string &counter = "") const;
+
+  perf_counters_set_t m_loggers;
+
+  CounterMap by_path; 
+};
+
+
+// Scope timer.  The constructor records ceph::real_clock::now(); the
+// destructor adds the time elapsed since then to the `event` counter of
+// `counters` through tinc().
+class PerfGuard {
+  const ceph::real_clock::time_point start;
+  PerfCounters* const counters;
+  const int event;
+
+public:
+  PerfGuard(PerfCounters* const counters, const int event)
+  : start(ceph::real_clock::now()), counters(counters), event(event) {}
+
+  ~PerfGuard() {
+    const auto elapsed = ceph::real_clock::now() - start;
+    counters->tinc(event, elapsed);
+  }
+};
+
+
+#endif
diff --git a/src/common/perf_counters_collection.cc b/src/common/perf_counters_collection.cc
new file mode 100644
index 00000000..c8a7ec01
--- /dev/null
+++ b/src/common/perf_counters_collection.cc
@@ -0,0 +1,60 @@
+#include "common/perf_counters_collection.h"
+#include "common/ceph_mutex.h"
+#include "common/ceph_context.h"
+
+/* PerfCountersCollection holds the lock that guards PerfCountersCollectionImpl */
+PerfCountersCollection::PerfCountersCollection(CephContext *cct)
+  : m_cct(cct),
+    m_lock(ceph::make_mutex("PerfCountersCollection"))
+{
+}
+PerfCountersCollection::~PerfCountersCollection()
+{
+  this->clear();  // takes m_lock and deletes all remaining loggers
+}
+void PerfCountersCollection::add(PerfCounters *l)
+{
+  const std::lock_guard<ceph::mutex> guard(m_lock);  // held for the whole call
+  perf_impl.add(l);
+}
+void PerfCountersCollection::remove(PerfCounters *l)
+{
+  const std::lock_guard<ceph::mutex> guard(m_lock);  // held for the whole call
+  perf_impl.remove(l);
+}
+void PerfCountersCollection::clear()
+{
+  const std::lock_guard<ceph::mutex> guard(m_lock);  // held for the whole call
+  perf_impl.clear();
+}
+bool PerfCountersCollection::reset(const std::string &name)
+{
+  const std::lock_guard<ceph::mutex> guard(m_lock);  // held for the whole call
+  return perf_impl.reset(name);
+}
+void PerfCountersCollection::dump_formatted(ceph::Formatter *f, bool schema,
+                      const std::string &logger,
+                      const std::string &counter)
+{
+  const std::lock_guard<ceph::mutex> guard(m_lock);  // held while dumping
+  perf_impl.dump_formatted(f, schema, logger, counter);
+}
+void PerfCountersCollection::dump_formatted_histograms(ceph::Formatter *f, bool schema,
+                                 const std::string &logger,
+                                 const std::string &counter)
+{
+  const std::lock_guard<ceph::mutex> guard(m_lock);  // held while dumping
+  perf_impl.dump_formatted_histograms(f, schema, logger, counter);
+}
+void PerfCountersCollection::with_counters(std::function<void(const PerfCountersCollectionImpl::CounterMap &)> fn) const
+{
+  const std::lock_guard<ceph::mutex> guard(m_lock);  // fn runs under the lock
+  perf_impl.with_counters(fn);
+}
+// Unregister p from cct's collection (when a context was given), then free it.
+void PerfCountersDeleter::operator()(PerfCounters* p) noexcept
+{
+  if (cct != nullptr) cct->get_perfcounters_collection()->remove(p);
+  delete p;
+}
+
diff --git a/src/common/perf_counters_collection.h b/src/common/perf_counters_collection.h
new file mode 100644
index 00000000..53a8f341
--- /dev/null
+++ b/src/common/perf_counters_collection.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "common/perf_counters.h"
+#include "common/ceph_mutex.h"
+
+class CephContext;
+
+class PerfCountersCollection
+{
+  CephContext *m_cct;
+
+  /** Protects perf_impl (its m_loggers set and its path index) */
+  mutable ceph::mutex m_lock;
+  PerfCountersCollectionImpl perf_impl;
+public:
+  PerfCountersCollection(CephContext *cct);
+  ~PerfCountersCollection();  // calls clear()
+  void add(PerfCounters *l);
+  void remove(PerfCounters *l);
+  void clear();
+  bool reset(const std::string &name);
+
+  void dump_formatted(ceph::Formatter *f, bool schema,
+                      const std::string &logger = "",
+                      const std::string &counter = "");
+  void dump_formatted_histograms(ceph::Formatter *f, bool schema,
+                                 const std::string &logger = "",
+                                 const std::string &counter = "");
+
+  void with_counters(std::function<void(const PerfCountersCollectionImpl::CounterMap &)>) const;  // callback runs with m_lock held
+
+  friend class PerfCountersCollectionTest;
+};
+
+class PerfCountersDeleter {
+  CephContext* cct;  // may be null, in which case nothing is unregistered
+
+public:
+  PerfCountersDeleter() noexcept : cct(nullptr) {}
+  PerfCountersDeleter(CephContext* cct) noexcept : cct(cct) {}
+  void operator()(PerfCounters* p) noexcept;  // unregisters p (if cct is set), then deletes it
+};
+
+using PerfCountersRef = std::unique_ptr<PerfCounters, PerfCountersDeleter>;
+
+
diff --git a/src/common/perf_histogram.cc b/src/common/perf_histogram.cc
new file mode 100644
index 00000000..13528764
--- /dev/null
+++ b/src/common/perf_histogram.cc
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/perf_histogram.h"
+
+#include <limits>
+
+/// Emit one "axis" object section: the axis configuration (name, min,
+/// quant_size, buckets, scale_type) followed by a "ranges" array that holds
+/// one "bucket" object per bucket of the axis.
+void PerfHistogramCommon::dump_formatted_axis(
+    ceph::Formatter *f, const PerfHistogramCommon::axis_config_d &ac) {
+  f->open_object_section("axis");
+
+  // Axis configuration
+  f->dump_string("name", ac.m_name);
+  f->dump_int("min", ac.m_min);
+  f->dump_int("quant_size", ac.m_quant_size);
+  f->dump_int("buckets", ac.m_buckets);
+  if (ac.m_scale_type == SCALE_LINEAR) {
+    f->dump_string("scale_type", "linear");
+  } else if (ac.m_scale_type == SCALE_LOG2) {
+    f->dump_string("scale_type", "log2");
+  } else {
+    ceph_assert(false && "Invalid scale type");
+  }
+
+  // Concrete value range of every bucket.  The first bucket gets no "min"
+  // key and the last bucket no "max" key.
+  f->open_array_section("ranges");
+  const auto bucket_ranges = get_axis_bucket_ranges(ac);
+  const int last = ac.m_buckets - 1;
+  for (int idx = 0; idx <= last; ++idx) {
+    const auto &range = bucket_ranges[idx];
+    f->open_object_section("bucket");
+    if (idx != 0) {
+      f->dump_int("min", range.first);
+    }
+    if (idx != last) {
+      f->dump_int("max", range.second);
+    }
+    f->close_section();
+  }
+  f->close_section();
+
+  f->close_section();
+}
+
+/// Exclusive upper boundary of bucket @p i, expressed in quantization units
+/// above the axis minimum: i for a linear scale, 2^(i-1) for a log2 scale.
+/// For the log2 scale @p i must be at least 1 (the shift count is i - 1).
+int64_t get_quants(int64_t i, PerfHistogramCommon::scale_type_d st) {
+  if (st == PerfHistogramCommon::SCALE_LINEAR) {
+    return i;
+  }
+  if (st == PerfHistogramCommon::SCALE_LOG2) {
+    return int64_t(1) << (i - 1);
+  }
+  ceph_assert(false && "Invalid scale type");
+}
+
+/// Map @p value to its bucket index on axis @p ac.  Bucket 0 collects
+/// everything below the axis minimum, the last bucket everything beyond the
+/// configured range.
+int64_t PerfHistogramCommon::get_bucket_for_axis(
+    int64_t value, const PerfHistogramCommon::axis_config_d &ac) {
+  // Values below the minimum always go to bucket 0.
+  if (value < ac.m_min) {
+    return 0;
+  }
+
+  // Distance from the minimum, measured in whole quantization units.
+  const int64_t quants = (value - ac.m_min) / ac.m_quant_size;
+  const int64_t last_bucket = ac.m_buckets - 1;
+
+  if (ac.m_scale_type == SCALE_LINEAR) {
+    return std::min<int64_t>(quants + 1, last_bucket);
+  }
+  if (ac.m_scale_type == SCALE_LOG2) {
+    // First bucket whose exclusive upper boundary lies above the value.
+    int64_t bucket = 1;
+    while (bucket < ac.m_buckets &&
+           quants >= get_quants(bucket, SCALE_LOG2)) {
+      ++bucket;
+    }
+    return bucket < ac.m_buckets ? bucket : last_bucket;
+  }
+  ceph_assert(false && "Invalid scale type");
+}
+
+/// Compute the inclusive [first, second] value range of every bucket on
+/// axis @p ac.  The first bucket starts at the smallest int64_t and the last
+/// bucket ends at the largest int64_t.
+std::vector<std::pair<int64_t, int64_t>>
+PerfHistogramCommon::get_axis_bucket_ranges(
+    const PerfHistogramCommon::axis_config_d &ac) {
+  std::vector<std::pair<int64_t, int64_t>> ranges(ac.m_buckets);
+
+  // Bucket 0 is reserved for values below the minimum, so the regular
+  // buckets start at index 1 with the axis minimum as lower bound.
+  int64_t lower = ac.m_min;
+  const int64_t last = ac.m_buckets - 1;
+  for (int64_t bucket = 1; bucket < last; ++bucket) {
+    const int64_t upper_exclusive =
+        ac.m_min + get_quants(bucket, ac.m_scale_type) * ac.m_quant_size;
+    ranges[bucket] = {lower, upper_exclusive - 1};
+    lower = upper_exclusive;  // the next bucket starts where this one ended
+  }
+
+  // With a single bucket both references name the same element, so the
+  // order of the four assignments below matters.
+  auto &underflow = ranges.front();
+  auto &overflow = ranges.back();
+  underflow.second = ac.m_min - 1;
+  overflow.first = lower;
+
+  underflow.first = std::numeric_limits<int64_t>::min();
+  overflow.second = std::numeric_limits<int64_t>::max();
+  return ranges;
+}
diff --git a/src/common/perf_histogram.h b/src/common/perf_histogram.h
new file mode 100644
index 00000000..3052106b
--- /dev/null
+++ b/src/common/perf_histogram.h
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 OVH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_PERF_HISTOGRAM_H
+#define CEPH_COMMON_PERF_HISTOGRAM_H
+
+#include <array>
+#include <atomic>
+#include <memory>
+
+#include "common/Formatter.h"
+#include "include/int_types.h"
+#include "include/ceph_assert.h"
+
+/// Non-template base holding the axis description types and the static
+/// helpers that operate on a single axis.
+class PerfHistogramCommon {
+public:
+  /// How bucket boundaries grow along an axis.
+  enum scale_type_d : uint8_t {
+    SCALE_LINEAR = 1,
+    SCALE_LOG2 = 2,
+  };
+
+  /// Configuration of one histogram axis.
+  struct axis_config_d {
+    const char *m_name = nullptr;              ///< name dumped for the axis
+    scale_type_d m_scale_type = SCALE_LINEAR;  ///< linear or log2 boundaries
+    int64_t m_min = 0;         ///< values below this fall into bucket 0
+    int64_t m_quant_size = 0;  ///< divisor applied to (value - m_min)
+    int32_t m_buckets = 0;     ///< number of buckets on this axis
+    axis_config_d() = default;
+    axis_config_d(const char* name,
+		  scale_type_d scale_type,
+		  int64_t min,
+		  int64_t quant_size,
+		  int32_t buckets)
+      : m_name(name),
+	m_scale_type(scale_type),
+	m_min(min),
+	m_quant_size(quant_size),
+	m_buckets(buckets)
+    {}
+  };
+
+protected:
+  /// Dump configuration of one axis to a formatter
+  static void dump_formatted_axis(ceph::Formatter *f, const axis_config_d &ac);
+
+  /// Quantize given value and convert to bucket number on given axis
+  static int64_t get_bucket_for_axis(int64_t value, const axis_config_d &ac);
+
+  /// Calculate inclusive ranges of axis values for each bucket on that axis
+  static std::vector<std::pair<int64_t, int64_t>> get_axis_bucket_ranges(
+      const axis_config_d &ac);
+};
+
+/// PerfHistogram does trace a histogram of input values. It's an extended
+/// version of a standard histogram which does trace characteristics of a single
+/// one value only. In this implementation, values can be traced in multiple
+/// dimensions - i.e. we can create a histogram of input request size (first
+/// dimension) and processing latency (second dimension). Creating standard
+/// histogram out of such multidimensional one is trivial and requires summing
+/// values across dimensions we're not interested in.
+template <int DIM = 2>
+class PerfHistogram : public PerfHistogramCommon {
+public:
+  /// Initialize new histogram object.  Exactly DIM axis configurations are
+  /// required; each needs at least one bucket and a positive quant size.
+  PerfHistogram(std::initializer_list<axis_config_d> axes_config) {
+    ceph_assert(axes_config.size() == DIM &&
+		"Invalid number of axis configuration objects");
+
+    int i = 0;
+    for (const auto &ac : axes_config) {
+      ceph_assertf(ac.m_buckets > 0, "Must have at least one bucket on axis");
+      ceph_assertf(ac.m_quant_size > 0,
+             "Quantization unit must be non-zero positive integer value");
+
+      m_axes_config[i++] = ac;
+    }
+
+    // Value-initialized, i.e. every counter starts at zero.
+    m_rawData.reset(new std::atomic<uint64_t>[get_raw_size()] {});
+  }
+
+  /// Copy from other histogram object.  Counters are loaded one at a time,
+  /// so the copy is not a single atomic snapshot of @p other.
+  PerfHistogram(const PerfHistogram &other)
+      : m_axes_config(other.m_axes_config) {
+    int64_t size = get_raw_size();
+    m_rawData.reset(new std::atomic<uint64_t>[size] {});
+    for (int64_t i = 0; i < size; i++) {
+      m_rawData[i] = other.m_rawData[i].load();
+    }
+  }
+
+  /// Set all histogram values to 0
+  void reset() {
+    auto size = get_raw_size();
+    for (auto i = size; --i >= 0;) {
+      m_rawData[i] = 0;
+    }
+  }
+
+  /// Increase counter for given axis values by one
+  template <typename... T>
+  void inc(T... axis) {
+    auto index = get_raw_index_for_value(axis...);
+    m_rawData[index]++;
+  }
+
+  /// Increase counter for given axis buckets by one
+  template <typename... T>
+  void inc_bucket(T... bucket) {
+    auto index = get_raw_index_for_bucket(bucket...);
+    m_rawData[index]++;
+  }
+
+  /// Read value from given bucket
+  template <typename... T>
+  uint64_t read_bucket(T... bucket) const {
+    auto index = get_raw_index_for_bucket(bucket...);
+    return m_rawData[index];
+  }
+
+  /// Dump data to a Formatter object
+  void dump_formatted(ceph::Formatter *f) const {
+    // Dump axes configuration
+    f->open_array_section("axes");
+    for (auto &ac : m_axes_config) {
+      dump_formatted_axis(f, ac);
+    }
+    f->close_section();
+
+    // Dump histogram values
+    dump_formatted_values(f);
+  }
+
+protected:
+  /// Raw data stored as linear space, internal indexes are calculated on
+  /// demand.
+  std::unique_ptr<std::atomic<uint64_t>[]> m_rawData;
+
+  /// Configuration of axes
+  std::array<axis_config_d, DIM> m_axes_config;
+
+  /// Dump histogram counters to a formatter
+  void dump_formatted_values(ceph::Formatter *f) const {
+    // Counters are unsigned; keep them unsigned all the way to
+    // dump_unsigned().
+    visit_values([f](int) { f->open_array_section("values"); },
+                 [f](uint64_t value) { f->dump_unsigned("value", value); },
+                 [f](int) { f->close_section(); });
+  }
+
+  /// Get number of all histogram counters (product of all bucket counts)
+  int64_t get_raw_size() const {
+    int64_t ret = 1;
+    for (const auto &ac : m_axes_config) {
+      ret *= ac.m_buckets;
+    }
+    return ret;
+  }
+
+  /// Calculate m_rawData index from axis values
+  template <typename... T>
+  int64_t get_raw_index_for_value(T... axes) const {
+    static_assert(sizeof...(T) == DIM, "Incorrect number of arguments");
+    return get_raw_index_internal<0>(get_bucket_for_axis, 0, axes...);
+  }
+
+  /// Calculate m_rawData index from axis bucket numbers
+  template <typename... T>
+  int64_t get_raw_index_for_bucket(T... buckets) const {
+    static_assert(sizeof...(T) == DIM, "Incorrect number of arguments");
+    return get_raw_index_internal<0>(
+        [](int64_t bucket, const axis_config_d &ac) {
+          ceph_assertf(bucket >= 0, "Bucket index can not be negative");
+          ceph_assertf(bucket < ac.m_buckets, "Bucket index too large");
+          return bucket;
+        },
+        0, buckets...);
+  }
+
+  /// Recursive step: fold the bucket of axis @a level into the running
+  /// index, then continue with the remaining axes.
+  template <int level = 0, typename F, typename... T>
+  int64_t get_raw_index_internal(F bucket_evaluator, int64_t startIndex,
+                                 int64_t value, T... tail) const {
+    static_assert(level + 1 + sizeof...(T) == DIM,
+                  "Internal consistency check");
+    auto &ac = m_axes_config[level];
+    auto bucket = bucket_evaluator(value, ac);
+    return get_raw_index_internal<level + 1>(
+        bucket_evaluator, ac.m_buckets * startIndex + bucket, tail...);
+  }
+
+  /// Recursion end: all axes consumed, the running index is the result.
+  template <int level, typename F>
+  int64_t get_raw_index_internal(F, int64_t startIndex) const {
+    static_assert(level == DIM, "Internal consistency check");
+    return startIndex;
+  }
+
+  /// Visit all histogram counters, call onDimensionEnter / onDimensionLeave
+  /// when starting / finishing traversal
+  /// on given axis, call onValue when dumping raw histogram counter value.
+  /// The running index is int64_t, matching get_raw_size() and the
+  /// get_raw_index_*() helpers.
+  template <typename FDE, typename FV, typename FDL>
+  void visit_values(FDE onDimensionEnter, FV onValue, FDL onDimensionLeave,
+                    int level = 0, int64_t startIndex = 0) const {
+    if (level == DIM) {
+      onValue(m_rawData[startIndex]);
+      return;
+    }
+
+    onDimensionEnter(level);
+    auto &ac = m_axes_config[level];
+    startIndex *= ac.m_buckets;
+    for (int32_t i = 0; i < ac.m_buckets; ++i, ++startIndex) {
+      visit_values(onDimensionEnter, onValue, onDimensionLeave, level + 1,
+                   startIndex);
+    }
+    onDimensionLeave(level);
+  }
+};
+
+#endif
diff --git a/src/common/pick_address.cc b/src/common/pick_address.cc
new file mode 100644
index 00000000..e6da4248
--- /dev/null
+++ b/src/common/pick_address.cc
@@ -0,0 +1,584 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Inktank
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/pick_address.h"
+#include "include/ipaddr.h"
+#include "include/str_list.h"
+#include "common/ceph_context.h"
+#ifndef WITH_SEASTAR
+#include "common/config.h"
+#include "common/config_obs.h"
+#endif
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/numa.h"
+
+#include <netdb.h>
+#include <string>
+#include <string.h>
+#include <vector>
+
+#define dout_subsys ceph_subsys_
+
+// Search the interface list @a ifa for an address that lies in one of the
+// networks listed in @a networks, optionally restricted to the interface
+// names listed in @a interfaces and to the address families selected by
+// @a ipv (CEPH_PICK_ADDRESS_IPV4 / CEPH_PICK_ADDRESS_IPV6).
+//
+// Returns the ifa_addr of the first match (networks are tried in list
+// order) or nullptr if nothing matched.  The returned pointer refers to
+// storage reachable from @a ifa, not to the temporary filtered copies.
+//
+// Calls exit(1) when interfaces are given without networks, when no
+// interface name matches, or when a network string cannot be parsed.
+const struct sockaddr *find_ip_in_subnet_list(
+  CephContext *cct,
+  const struct ifaddrs *ifa,
+  unsigned ipv,
+  const std::string &networks,
+  const std::string &interfaces,
+  int numa_node)
+{
+  std::list<string> nets;
+  get_str_list(networks, nets);
+  std::list<string> ifs;
+  get_str_list(interfaces, ifs);
+
+  // filter interfaces by name
+  const struct ifaddrs *filtered = nullptr;
+  if (ifs.empty()) {
+    filtered = ifa;
+  } else {
+    if (nets.empty()) {
+      lderr(cct) << "interface names specified but not network names" << dendl;
+      exit(1);
+    }
+    const struct ifaddrs *t = ifa;
+    struct ifaddrs *head = 0;
+    while (t) {
+      bool match = false;
+      for (auto& i : ifs) {
+	if (strcmp(i.c_str(), t->ifa_name) == 0) {
+	  match = true;
+	  break;
+	}
+      }
+      if (match) {
+	// Shallow copy: only the node is duplicated, the pointers inside it
+	// still refer to the caller's list.  Nodes are prepended, so the
+	// filtered list is in reverse order relative to @a ifa.
+	struct ifaddrs *n = new ifaddrs;
+	memcpy(n, t, sizeof(*t));
+	n->ifa_next = head;
+	head = n;
+      }
+      t = t->ifa_next;
+    }
+    if (!head) {
+      lderr(cct) << "no interfaces matching " << ifs << dendl;
+      exit(1);
+    }
+    filtered = head;
+  }
+
+  struct sockaddr *r = nullptr;
+  for (auto& s : nets) {
+    struct sockaddr_storage net;
+    unsigned int prefix_len;
+
+    if (!parse_network(s.c_str(), &net, &prefix_len)) {
+      lderr(cct) << "unable to parse network: " << s << dendl;
+      exit(1);
+    }
+
+    // Skip networks whose address family was not requested via @a ipv.
+    switch (net.ss_family) {
+    case AF_INET:
+      if (!(ipv & CEPH_PICK_ADDRESS_IPV4)) {
+	continue;
+      }
+      break;
+    case AF_INET6:
+      if (!(ipv & CEPH_PICK_ADDRESS_IPV6)) {
+	continue;
+      }
+      break;
+    }
+
+    const struct ifaddrs *found = find_ip_in_subnet(
+      filtered,
+      (struct sockaddr *) &net, prefix_len, numa_node);
+    if (found) {
+      r = found->ifa_addr;
+      break;
+    }
+  }
+
+  // Free the node copies made above; the caller's list is left untouched.
+  if (filtered != ifa) {
+    while (filtered) {
+      struct ifaddrs *t = filtered->ifa_next;
+      delete filtered;
+      filtered = t;
+    }
+  }
+
+  return r;
+}
+
+#ifndef WITH_SEASTAR
+// observe this change
+// Minimal config observer that tracks exactly one key and ignores every
+// change notification it receives.
+struct Observer : public md_config_obs_t {
+  // Null-terminated list holding the single tracked key.
+  const char *keys[2];
+  explicit Observer(const char *c) : keys{c, nullptr} {}
+
+  const char** get_tracked_conf_keys() const override {
+    return const_cast<const char **>(keys);
+  }
+  void handle_conf_change(const ConfigProxy& conf,
+			  const std::set <std::string> &changed) override {
+    // intentionally empty
+  }
+};
+
+// Pick an IPv4 or IPv6 address from @a networks / @a interfaces, render it
+// as a numeric host string and store it in the config option @a conf_var.
+// Exits the process if no address is found or it cannot be converted.
+static void fill_in_one_address(CephContext *cct,
+				const struct ifaddrs *ifa,
+				const string networks,
+				const string interfaces,
+				const char *conf_var,
+				int numa_node = -1)
+{
+  const unsigned any_family = CEPH_PICK_ADDRESS_IPV4|CEPH_PICK_ADDRESS_IPV6;
+  const struct sockaddr *found = find_ip_in_subnet_list(
+    cct, ifa, any_family, networks, interfaces, numa_node);
+  if (found == nullptr) {
+    lderr(cct) << "unable to find any IP address in networks '" << networks
+	       << "' interfaces '" << interfaces << "'" << dendl;
+    exit(1);
+  }
+
+  // Anything that is not AF_INET is handed over with the IPv6 length.
+  const socklen_t salen = (found->sa_family == AF_INET)
+    ? sizeof(struct sockaddr_in)
+    : sizeof(struct sockaddr_in6);
+
+  char buf[INET6_ADDRSTRLEN];
+  const int err = getnameinfo(found, salen,
+			      buf, sizeof(buf),
+			      nullptr, 0,
+			      NI_NUMERICHOST);
+  if (err != 0) {
+    lderr(cct) << "unable to convert chosen address to string: " << gai_strerror(err) << dendl;
+    exit(1);
+  }
+
+  // A temporary observer for conf_var is registered only for the duration
+  // of the set/apply sequence.
+  Observer obs(conf_var);
+  cct->_conf.add_observer(&obs);
+  cct->_conf.set_val_or_die(conf_var, buf);
+  cct->_conf.apply_changes(nullptr);
+  cct->_conf.remove_observer(&obs);
+}
+
+// Fill in the "public_addr" and/or "cluster_addr" config options (as
+// selected by @a needs) from the configured networks when the address is
+// still blank.  Exits the process on any failure.
+void pick_addresses(CephContext *cct, int needs)
+{
+  struct ifaddrs *ifa;
+  int r = getifaddrs(&ifa);
+  if (r < 0) {
+    // Checked right after the call so errno still belongs to getifaddrs()
+    // and cannot be overwritten by the config lookups below.
+    string err = cpp_strerror(errno);
+    lderr(cct) << "unable to fetch interfaces and addresses: " << err << dendl;
+    exit(1);
+  }
+
+  auto public_addr = cct->_conf.get_val<entity_addr_t>("public_addr");
+  auto public_network = cct->_conf.get_val<std::string>("public_network");
+  auto public_network_interface =
+    cct->_conf.get_val<std::string>("public_network_interface");
+  auto cluster_addr = cct->_conf.get_val<entity_addr_t>("cluster_addr");
+  auto cluster_network = cct->_conf.get_val<std::string>("cluster_network");
+  auto cluster_network_interface =
+    cct->_conf.get_val<std::string>("cluster_network_interface");
+
+  if ((needs & CEPH_PICK_ADDRESS_PUBLIC) &&
+    public_addr.is_blank_ip() && !public_network.empty()) {
+    fill_in_one_address(cct, ifa, public_network, public_network_interface,
+      "public_addr");
+  }
+
+  if ((needs & CEPH_PICK_ADDRESS_CLUSTER) && cluster_addr.is_blank_ip()) {
+    if (!cluster_network.empty()) {
+      fill_in_one_address(cct, ifa, cluster_network, cluster_network_interface,
+	"cluster_addr");
+    } else {
+      // No cluster network configured: fall back to the public network, if
+      // there is one.
+      if (!public_network.empty()) {
+        lderr(cct) << "Public network was set, but cluster network was not set " << dendl;
+        lderr(cct) << "    Using public network also for cluster network" << dendl;
+        fill_in_one_address(cct, ifa, public_network, public_network_interface,
+          "cluster_addr");
+      }
+    }
+  }
+
+  freeifaddrs(ifa);
+}
+#endif	// !WITH_SEASTAR
+
+// Pick one address of the families selected by @a ipv from @a networks /
+// @a interfaces and append it to @a addrs.
+// Returns 0 on success and -1 if no address was found or it could not be
+// converted / parsed.
+static int fill_in_one_address(
+  CephContext *cct,
+  const struct ifaddrs *ifa,
+  unsigned ipv,
+  const string networks,
+  const string interfaces,
+  entity_addrvec_t *addrs,
+  int numa_node = -1)
+{
+  const struct sockaddr *found = find_ip_in_subnet_list(cct, ifa, ipv, networks,
+							interfaces, numa_node);
+  if (found == nullptr) {
+    // Describe the requested address families for the error message.
+    const bool want_v4 = (ipv & CEPH_PICK_ADDRESS_IPV4) != 0;
+    const bool want_v6 = (ipv & CEPH_PICK_ADDRESS_IPV6) != 0;
+    std::string ip_type = "IPv6";
+    if (want_v4) {
+      ip_type = want_v6 ? "IPv4 or IPv6" : "IPv4";
+    }
+    lderr(cct) << "unable to find any " << ip_type << " address in networks '"
+               << networks << "' interfaces '" << interfaces << "'" << dendl;
+    return -1;
+  }
+
+  // Anything that is not AF_INET is handed over with the IPv6 length.
+  const socklen_t salen = (found->sa_family == AF_INET)
+    ? sizeof(struct sockaddr_in)
+    : sizeof(struct sockaddr_in6);
+
+  char buf[INET6_ADDRSTRLEN];
+  const int err = getnameinfo(found, salen,
+			      buf, sizeof(buf),
+			      nullptr, 0,
+			      NI_NUMERICHOST);
+  if (err != 0) {
+    lderr(cct) << "unable to convert chosen address to string: " << gai_strerror(err) << dendl;
+    return -1;
+  }
+
+  entity_addr_t addr;
+  const char *end = nullptr;
+  if (!addr.parse(buf, &end)) {
+    return -1;
+  }
+  addrs->v.push_back(addr);
+  return 0;
+}
+
+// Build the list of addresses to bind to and store it in @a addrs.
+//
+// @a flags must select exactly one of CEPH_PICK_ADDRESS_PUBLIC /
+// CEPH_PICK_ADDRESS_CLUSTER.  The messenger versions (MSGR1/MSGR2) and IP
+// families (IPV4/IPV6) are taken from @a flags or, when none is given, from
+// the ms_bind_* config options.
+//
+// Returns 0 on success, -EINVAL for an invalid flag/config combination and
+// -1 when a requested address could not be found in the configured networks.
+int pick_addresses(
+  CephContext *cct,
+  unsigned flags,
+  struct ifaddrs *ifa,
+  entity_addrvec_t *addrs,
+  int preferred_numa_node)
+{
+  addrs->v.clear();
+
+  // exactly one of PUBLIC / CLUSTER must be requested
+  unsigned addrt = (flags & (CEPH_PICK_ADDRESS_PUBLIC |
+			     CEPH_PICK_ADDRESS_CLUSTER));
+  if (addrt == 0 ||
+      addrt == (CEPH_PICK_ADDRESS_PUBLIC |
+		CEPH_PICK_ADDRESS_CLUSTER)) {
+    return -EINVAL;
+  }
+  // messenger protocol versions: explicit flags win, otherwise use config
+  unsigned msgrv = flags & (CEPH_PICK_ADDRESS_MSGR1 |
+			    CEPH_PICK_ADDRESS_MSGR2);
+  if (msgrv == 0) {
+    if (cct->_conf.get_val<bool>("ms_bind_msgr1")) {
+      msgrv |= CEPH_PICK_ADDRESS_MSGR1;
+    }
+    if (cct->_conf.get_val<bool>("ms_bind_msgr2")) {
+      msgrv |= CEPH_PICK_ADDRESS_MSGR2;
+    }
+    if (msgrv == 0) {
+      return -EINVAL;
+    }
+  }
+  // IP families: explicit flags win, otherwise use config.  The PREFER_IPV4
+  // flag is only overridden from config in the latter case.
+  unsigned ipv = flags & (CEPH_PICK_ADDRESS_IPV4 |
+			  CEPH_PICK_ADDRESS_IPV6);
+  if (ipv == 0) {
+    if (cct->_conf.get_val<bool>("ms_bind_ipv4")) {
+      ipv |= CEPH_PICK_ADDRESS_IPV4;
+    }
+    if (cct->_conf.get_val<bool>("ms_bind_ipv6")) {
+      ipv |= CEPH_PICK_ADDRESS_IPV6;
+    }
+    if (ipv == 0) {
+      return -EINVAL;
+    }
+    if (cct->_conf.get_val<bool>("ms_bind_prefer_ipv4")) {
+      flags |= CEPH_PICK_ADDRESS_PREFER_IPV4;
+    } else {
+      flags &= ~CEPH_PICK_ADDRESS_PREFER_IPV4;
+    }
+  }
+
+  entity_addr_t addr;
+  string networks;
+  string interfaces;
+  if (addrt & CEPH_PICK_ADDRESS_PUBLIC) {
+    addr = cct->_conf.get_val<entity_addr_t>("public_addr");
+    networks = cct->_conf.get_val<std::string>("public_network");
+    interfaces =
+      cct->_conf.get_val<std::string>("public_network_interface");
+  } else {
+    addr = cct->_conf.get_val<entity_addr_t>("cluster_addr");
+    networks = cct->_conf.get_val<std::string>("cluster_network");
+    interfaces =
+      cct->_conf.get_val<std::string>("cluster_network_interface");
+    if (networks.empty()) {
+      lderr(cct) << "Falling back to public interface" << dendl;
+      // fall back to public_ network and interface if cluster is not set
+      networks = cct->_conf.get_val<std::string>("public_network");
+      interfaces =
+	cct->_conf.get_val<std::string>("public_network_interface");
+    }
+  }
+  if (addr.is_blank_ip() &&
+      !networks.empty()) {
+    // A family that was not requested counts as already satisfied (0).
+    int ipv4_r = !(ipv & CEPH_PICK_ADDRESS_IPV4) ? 0 : -1;
+    int ipv6_r = !(ipv & CEPH_PICK_ADDRESS_IPV6) ? 0 : -1;
+    // first try on preferred numa node (if >= 0), then anywhere.
+    // NOTE(review): addrs->v is not cleared between the two passes, so an
+    // address appended during a partially successful first pass stays in
+    // the list while the second pass may append for the same family again --
+    // confirm whether that is intended.
+    while (true) {
+      // note: pass in ipv to filter the matching addresses
+      // IPv4 goes first when PREFER_IPV4 is set, otherwise after IPv6.
+      if ((ipv & CEPH_PICK_ADDRESS_IPV4) &&
+	  (flags & CEPH_PICK_ADDRESS_PREFER_IPV4)) {
+	ipv4_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV4,
+                                     networks, interfaces, addrs,
+                                     preferred_numa_node);
+      }
+      if (ipv & CEPH_PICK_ADDRESS_IPV6) {
+	ipv6_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV6,
+                                     networks, interfaces, addrs,
+                                     preferred_numa_node);
+      }
+      if ((ipv & CEPH_PICK_ADDRESS_IPV4) &&
+	  !(flags & CEPH_PICK_ADDRESS_PREFER_IPV4)) {
+	ipv4_r = fill_in_one_address(cct, ifa, CEPH_PICK_ADDRESS_IPV4,
+                                     networks, interfaces, addrs,
+                                     preferred_numa_node);
+      }
+      if (ipv4_r >= 0 && ipv6_r >= 0) {
+	break;
+      }
+      // Already searched without a NUMA restriction: give up.
+      if (preferred_numa_node < 0) {
+	return ipv4_r >= 0 && ipv6_r >= 0 ? 0 : -1;
+      }
+      preferred_numa_node = -1;      // try any numa node
+    }
+  }
+
+  // note: we may have a blank addr here
+
+  // ipv4 and/or ipv6?
+  // Nothing was picked from a network: use the configured (possibly blank)
+  // address once per requested family, in preference order.
+  if (addrs->v.empty()) {
+    addr.set_type(entity_addr_t::TYPE_MSGR2);
+    if ((ipv & CEPH_PICK_ADDRESS_IPV4) &&
+	(flags & CEPH_PICK_ADDRESS_PREFER_IPV4)) {
+      addr.set_family(AF_INET);
+      addrs->v.push_back(addr);
+    }
+    if (ipv & CEPH_PICK_ADDRESS_IPV6) {
+      addr.set_family(AF_INET6);
+      addrs->v.push_back(addr);
+    }
+    if ((ipv & CEPH_PICK_ADDRESS_IPV4) &&
+	!(flags & CEPH_PICK_ADDRESS_PREFER_IPV4)) {
+      addr.set_family(AF_INET);
+      addrs->v.push_back(addr);
+    }
+  }
+
+  // msgr2 or legacy or both?
+  if (msgrv == (CEPH_PICK_ADDRESS_MSGR1 | CEPH_PICK_ADDRESS_MSGR2)) {
+    // Both: every address is emitted twice, first as MSGR2 then as LEGACY.
+    // The loop works on copies (auto a) so each variant can be modified
+    // independently.
+    vector<entity_addr_t> v;
+    v.swap(addrs->v);
+    for (auto a : v) {
+      a.set_type(entity_addr_t::TYPE_MSGR2);
+      if (flags & CEPH_PICK_ADDRESS_DEFAULT_MON_PORTS) {
+	a.set_port(CEPH_MON_PORT_IANA);
+      }
+      addrs->v.push_back(a);
+      a.set_type(entity_addr_t::TYPE_LEGACY);
+      if (flags & CEPH_PICK_ADDRESS_DEFAULT_MON_PORTS) {
+	a.set_port(CEPH_MON_PORT_LEGACY);
+      }
+      addrs->v.push_back(a);
+    }
+  } else if (msgrv == CEPH_PICK_ADDRESS_MSGR1) {
+    for (auto& a : addrs->v) {
+      a.set_type(entity_addr_t::TYPE_LEGACY);
+    }
+  } else {
+    for (auto& a : addrs->v) {
+      a.set_type(entity_addr_t::TYPE_MSGR2);
+    }
+  }
+
+  return 0;
+}
+
+// Convenience overload: fetch the local interface list with getifaddrs(),
+// run the ifaddrs-based pick_addresses() on it and release the list again.
+// Returns -errno if the interface list cannot be fetched, otherwise the
+// result of the ifaddrs-based overload.
+int pick_addresses(
+  CephContext *cct,
+  unsigned flags,
+  entity_addrvec_t *addrs,
+  int preferred_numa_node)
+{
+  struct ifaddrs *ifa;
+  int r = getifaddrs(&ifa);
+  if (r < 0) {
+    r = -errno;
+    // Render the error once and log that string.
+    const string err = cpp_strerror(r);
+    lderr(cct) << "unable to fetch interfaces and addresses: "
+	       << err << dendl;
+    return r;
+  }
+  r = pick_addresses(cct, flags, ifa, addrs, preferred_numa_node);
+  freeifaddrs(ifa);
+  return r;
+}
+
+// Return the name of the local interface whose address matches @a network,
+// or an empty string if there is none or the interface list cannot be
+// fetched.
+std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network)
+{
+  struct ifaddrs *ifa;
+  if (getifaddrs(&ifa) < 0) {
+    string err = cpp_strerror(errno);
+    lderr(cct) << "unable to fetch interfaces and addresses: " << err << dendl;
+    return {};
+  }
+
+  // Prefix length: the larger of the two address widths, in bits.
+  const unsigned int prefix_len = max(sizeof(in_addr::s_addr), sizeof(in6_addr::s6_addr)) * CHAR_BIT;
+  const struct sockaddr *wanted = (const struct sockaddr *) &network;
+  const struct ifaddrs *found = find_ip_in_subnet(ifa, wanted, prefix_len);
+
+  // Copy the name before the list that owns it is released.
+  std::string result;
+  if (found != nullptr) {
+    result = found->ifa_name;
+  }
+  freeifaddrs(ifa);
+  return result;
+}
+
+
+// Check whether any address in @a ls is on the same host as an address
+// configured on a local interface.  On success the matching list entry is
+// copied to @a match and true is returned.  Exits the process if the
+// interface list cannot be fetched.
+bool have_local_addr(CephContext *cct, const list<entity_addr_t>& ls, entity_addr_t *match)
+{
+  struct ifaddrs *ifa;
+  if (getifaddrs(&ifa) < 0) {
+    lderr(cct) << "unable to fetch interfaces and addresses: " << cpp_strerror(errno) << dendl;
+    exit(1);
+  }
+
+  bool found = false;
+  // Stop at the first match: both loops end as soon as found is set.
+  for (struct ifaddrs *cur = ifa; cur != nullptr && !found; cur = cur->ifa_next) {
+    if (cur->ifa_addr == nullptr) {
+      continue;
+    }
+    entity_addr_t local;
+    local.set_sockaddr(cur->ifa_addr);
+    for (auto& candidate : ls) {
+      if (local.is_same_host(candidate)) {
+        *match = candidate;
+        found = true;
+        break;
+      }
+    }
+  }
+
+  freeifaddrs(ifa);
+  return found;
+}
+
+// Determine the NUMA node of network interface @a iface from sysfs.
+//
+// If /sys/class/net/<iface>/device/numa_node can be opened, its content is
+// parsed as the node number.  Otherwise /sys/class/net/<iface>/bonding/slaves
+// is read and the function recurses into every listed interface: *node is
+// set to their common node, or to -2 if they report different nodes.
+//
+// Returns a negative error code if neither file can be opened, the read
+// fails, the node number is malformed or a recursive lookup fails.
+int get_iface_numa_node(
+  const std::string& iface,
+  int *node)
+{
+  int ifatype = IFACE_DEFAULT;
+  string ifa = iface;
+  // Strip everything from the first ':' onwards.
+  const string::size_type pos = ifa.find(":");
+  if (pos != string::npos) {
+    ifa.erase(pos);
+  }
+  string fn = std::string("/sys/class/net/") + ifa + "/device/numa_node";
+  int fd = ::open(fn.c_str(), O_RDONLY);
+  if (fd < 0) {
+    fn = std::string("/sys/class/net/") + ifa + "/bonding/slaves";
+    fd = ::open(fn.c_str(), O_RDONLY);
+    if (fd < 0) {
+      return -errno;
+    }
+    ifatype = IFACE_BOND_PORT;
+  } else {
+    ifatype = IFACE_PHY_PORT;
+  }
+
+  int r = 0;
+  char buf[1024];
+  char *endptr = 0;
+  int bond_node = -1;
+  // Read at most sizeof(buf) - 1 bytes so that the terminating NUL written
+  // below always stays inside the buffer.
+  r = safe_read(fd, &buf, sizeof(buf) - 1);
+  if (r < 0) {
+    goto out;
+  }
+  buf[r] = 0;
+  // trim trailing whitespace
+  while (r > 0 && ::isspace(buf[--r])) {
+    buf[r] = 0;
+  }
+
+  switch (ifatype) {
+  case IFACE_PHY_PORT:
+    *node = strtoll(buf, &endptr, 10);
+    // the whole (trimmed) content must be a number
+    if (endptr != buf + strlen(buf)) {
+      r = -EINVAL;
+      goto out;
+    }
+    r = 0;
+    break;
+  case IFACE_BOND_PORT:
+    {
+      // split the space separated list of member interfaces
+      std::vector<std::string> sv;
+      char *q, *p = strtok_r(buf, " ", &q);
+      while (p != NULL) {
+        sv.push_back(p);
+        p = strtok_r(NULL, " ", &q);
+      }
+      for (auto& iter : sv) {
+        int bn = -1;
+        r = get_iface_numa_node(iter, &bn);
+        if (r >= 0) {
+          if (bond_node == -1 || bn == bond_node) {
+            bond_node = bn;
+          } else {
+            // members sit on different nodes
+            *node = -2;
+            goto out;
+          }
+        } else {
+          goto out;
+        }
+      }
+      *node = bond_node;
+    }
+    break;
+  }
+
+  out:
+  ::close(fd);
+  return r;
+}
+
diff --git a/src/common/pick_address.h b/src/common/pick_address.h
new file mode 100644
index 00000000..ba9473a9
--- /dev/null
+++ b/src/common/pick_address.h
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_PICK_ADDRESS_H
+#define CEPH_PICK_ADDRESS_H
+
+#include <string>
+#include <list>
+
+class CephContext;
+struct entity_addr_t;
+class entity_addrvec_t;
+
+
+// Bit flags for the pick_addresses() family.
+// Address kind; the flags-based pick_addresses() requires exactly one.
+#define CEPH_PICK_ADDRESS_PUBLIC      0x01
+#define CEPH_PICK_ADDRESS_CLUSTER     0x02
+// Messenger protocol versions; if neither is set the ms_bind_msgr1 /
+// ms_bind_msgr2 config options decide.
+#define CEPH_PICK_ADDRESS_MSGR1       0x04
+#define CEPH_PICK_ADDRESS_MSGR2       0x08
+// IP families; if neither is set the ms_bind_ipv4 / ms_bind_ipv6 config
+// options decide.
+#define CEPH_PICK_ADDRESS_IPV4        0x10
+#define CEPH_PICK_ADDRESS_IPV6        0x20
+// Put the IPv4 address before the IPv6 one in the result.
+#define CEPH_PICK_ADDRESS_PREFER_IPV4 0x40
+// When both messenger versions are selected, set the default monitor port
+// of the respective protocol on each address.
+#define CEPH_PICK_ADDRESS_DEFAULT_MON_PORTS  0x80
+
+// Kind of interface as classified by get_iface_numa_node().
+enum IfaceType {
+  IFACE_DEFAULT = 0,     // not classified yet
+  IFACE_PHY_PORT = 1,    // <iface>/device/numa_node could be opened
+  IFACE_BOND_PORT = 2    // <iface>/bonding/slaves could be opened
+};
+
+#ifndef WITH_SEASTAR
+/*
+  Pick addresses based on subnets if needed.
+
+  If an address is not explicitly given, and a list of subnets is
+  given, find an assigned IP address in the subnets and set that.
+
+  cluster_addr is set based on cluster_network, public_addr is set
+  based on public_network.
+
+  cluster_network and public_network are a list of ip/prefix pairs.
+
+  All IP addresses assigned to all local network interfaces are
+  potential matches.
+
+  If multiple IP addresses match the subnet, one of them will be
+  picked, effectively randomly.
+
+  This function will exit on error.
+ */
+void pick_addresses(CephContext *cct, int needs);
+
+#endif	// !WITH_SEASTAR
+
+/**
+ * Build the list of addresses to bind to.
+ *
+ * @param cct context
+ * @param flags CEPH_PICK_ADDRESS_* bits; exactly one of PUBLIC / CLUSTER
+ *              must be set
+ * @param ifa interface list to search (second overload only; the first
+ *            overload obtains it via getifaddrs())
+ * @param addrs [out] resulting addresses; cleared first
+ * @param preferred_numa_node NUMA node to try first, or -1 for any
+ * @return 0 on success, a negative value on error
+ */
+int pick_addresses(CephContext *cct, unsigned flags, entity_addrvec_t *addrs,
+		   int preferred_numa_node = -1);
+int pick_addresses(CephContext *cct, unsigned flags, struct ifaddrs *ifa,
+		   entity_addrvec_t *addrs,
+		   int preferred_numa_node = -1);
+
+/**
+ * Find a network interface whose address matches the address/netmask
+ * in `network`.
+ */
+std::string pick_iface(CephContext *cct, const struct sockaddr_storage &network);
+
+/**
+ * check for a locally configured address
+ *
+ * check if any of the listed addresses is configured on the local host.
+ *
+ * @param cct context
+ * @param ls list of addresses
+ * @param match [out] pointer to match, if an item in @a ls is found configured locally.
+ */
+bool have_local_addr(CephContext *cct, const std::list<entity_addr_t>& ls, entity_addr_t *match);
+
+/**
+ * Find an address in @a ifa that lies in one of the listed networks.
+ *
+ * @param cct context
+ * @param ifa interface list to search
+ * @param ipv CEPH_PICK_ADDRESS_IPV4 and/or CEPH_PICK_ADDRESS_IPV6
+ * @param networks list of networks to match against
+ * @param interfaces optional list of interface names to restrict the search to
+ * @param numa_node NUMA node passed on to the subnet search, -1 by default
+ * @return address of the first match, or nullptr if there is none
+ *
+ * Exits the process on malformed input (unparsable network, interface
+ * names without networks, no interface matching the given names).
+ */
+const struct sockaddr *find_ip_in_subnet_list(
+  CephContext *cct,
+  const struct ifaddrs *ifa,
+  unsigned ipv,
+  const std::string &networks,
+  const std::string &interfaces,
+  int numa_node=-1);
+
+/**
+ * Look up the NUMA node of a network interface via /sys/class/net.
+ *
+ * @param iface interface name
+ * @param node [out] NUMA node; -2 if the members of a bond report
+ *             different nodes
+ * @return negative error code on failure
+ */
+int get_iface_numa_node(
+  const std::string& iface,
+  int *node);
+
+#endif
diff --git a/src/common/ppc-opcode.h b/src/common/ppc-opcode.h
new file mode 100644
index 00000000..aabb45c3
--- /dev/null
+++ b/src/common/ppc-opcode.h
@@ -0,0 +1,31 @@
+/* Copyright (C) 2017 International Business Machines Corp.
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef __OPCODES_H
+#define __OPCODES_H
+
+/* Field encoders: place a register number into its bit position inside a
+ * 32-bit instruction word.  RA/RB take the low 5 bits of the number. */
+#define __PPC_RA(a)		(((a) & 0x1f) << 16)
+#define __PPC_RB(b)		(((b) & 0x1f) << 11)
+/* The XA/XB/XS encoders take a 6-bit number: the low 5 bits go into the
+ * regular field, bit 5 (0x20) is moved to bit 2, 1 and 0 respectively. */
+#define __PPC_XA(a)		((((a) & 0x1f) << 16) | (((a) & 0x20) >> 3))
+#define __PPC_XB(b)		((((b) & 0x1f) << 11) | (((b) & 0x20) >> 4))
+#define __PPC_XS(s)		((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5))
+#define __PPC_XT(s)		__PPC_XS(s)
+/* Combine three encoded operands into one operand mask. */
+#define VSX_XX3(t, a, b)	(__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b))
+#define VSX_XX1(s, a, b)	(__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b))
+
+/* Base opcodes, without operands. */
+#define PPC_INST_VPMSUMW	0x10000488
+#define PPC_INST_VPMSUMD	0x100004c8
+#define PPC_INST_MFVSRD		0x7c000066
+#define PPC_INST_MTVSRD		0x7c000166
+
+/* Emit the instruction as a raw .long word: base opcode OR'ed with the
+ * encoded operands.  MFVRD/MTVRD add 32 to the register number t before
+ * encoding it. */
+#define VPMSUMW(t, a, b)	.long PPC_INST_VPMSUMW | VSX_XX3((t), a, b)
+#define VPMSUMD(t, a, b)	.long PPC_INST_VPMSUMD | VSX_XX3((t), a, b)
+#define MFVRD(a, t)		.long PPC_INST_MFVSRD | VSX_XX1((t)+32, a, 0)
+#define MTVRD(t, a)		.long PPC_INST_MTVSRD | VSX_XX1((t)+32, a, 0)
+
+#endif
diff --git a/src/common/random_string.cc b/src/common/random_string.cc
new file mode 100644
index 00000000..c7289561
--- /dev/null
+++ b/src/common/random_string.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string_view>
+#include "auth/Crypto.h"
+#include "common/armor.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "random_string.h"
+
+/*
+ * Fill 'dest' with a NUL-terminated random base64 string.
+ *
+ * 'size' is the size of the 'dest' buffer, i.e. the required string size + 1.
+ * Returns 0 on success, or the negative value reported by ceph_armor().
+ * A 'size' of zero leaves 'dest' untouched and returns 0.
+ */
+int gen_rand_base64(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  if (size == 0) {
+    // no room even for the terminator; also keeps (size - 1) below from wrapping
+    return 0;
+  }
+
+  // heap-backed buffers instead of variable-length arrays: VLAs are not
+  // standard C++ and would put a caller-sized amount of data on the stack
+  std::string raw(size, '\0');
+  std::string armored(size + 4, '\0'); /* so that there's space for the extra '=' characters, and some */
+
+  cct->random()->get_bytes(raw.data(), raw.size());
+
+  // number of random bytes to encode: ceil((size - 1) * 3 / 4)
+  const size_t raw_len = ((size - 1) * 3 + 4 - 1) / 4;
+  const int ret = ceph_armor(armored.data(), armored.data() + armored.size(),
+			     raw.data(), raw.data() + raw_len);
+  if (ret < 0) {
+    lderr(cct) << "ceph_armor failed" << dendl;
+    return ret;
+  }
+
+  // 'armored' was zero-filled, so the encoded text is already terminated;
+  // copy the caller's 'size' bytes and force a terminator on the last one
+  memcpy(dest, armored.data(), size);
+  dest[size-1] = '\0';
+
+  return 0;
+}
+
+// overwrite dest[0..size) with characters picked at random from 'table'
+static void choose_from(CryptoRandom* random, std::string_view table,
+                        char *dest, size_t size)
+{
+  // start from raw random bytes, then map each byte onto a table entry
+  random->get_bytes(dest, size);
+
+  char *const end = dest + size;
+  for (char *out = dest; out != end; ++out) {
+    const auto index = static_cast<unsigned>(*out) % table.size();
+    *out = table[index];
+  }
+}
+
+
+void gen_rand_alphanumeric(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  // url-friendly variation of the base64 character set
+  static constexpr char table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
+  const size_t len = size - 1;  // the last byte is kept for the terminator
+  choose_from(cct->random(), table, dest, len);
+  dest[len] = 0;
+}
+
+std::string gen_rand_alphanumeric(CephContext *cct, size_t size)
+{
+  // one extra byte for the \0 that the char* overload writes
+  std::string out(size + 1, '\0');
+  gen_rand_alphanumeric(cct, out.data(), out.size());
+  out.resize(size); // drop the extra \0
+  return out;
+}
+
+void gen_rand_alphanumeric_lower(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  // digits and lower-case letters only
+  static constexpr char table[] = "0123456789abcdefghijklmnopqrstuvwxyz";
+  const size_t len = size - 1;  // the last byte is kept for the terminator
+  choose_from(cct->random(), table, dest, len);
+  dest[len] = 0;
+}
+
+std::string gen_rand_alphanumeric_lower(CephContext *cct, size_t size)
+{
+  // one extra byte for the \0 that the char* overload writes
+  std::string out(size + 1, '\0');
+  gen_rand_alphanumeric_lower(cct, out.data(), out.size());
+  out.resize(size); // drop the extra \0
+  return out;
+}
+
+
+void gen_rand_alphanumeric_upper(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  // digits and upper-case letters only
+  static constexpr char table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+  const size_t len = size - 1;  // the last byte is kept for the terminator
+  choose_from(cct->random(), table, dest, len);
+  dest[len] = 0;
+}
+
+std::string gen_rand_alphanumeric_upper(CephContext *cct, size_t size)
+{
+  // one extra byte for the \0 that the char* overload writes
+  std::string out(size + 1, '\0');
+  gen_rand_alphanumeric_upper(cct, out.data(), out.size());
+  out.resize(size); // drop the extra \0
+  return out;
+}
+
+
+void gen_rand_alphanumeric_no_underscore(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  // letters, digits, '-' and '.'
+  static constexpr char table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-.";
+  const size_t len = size - 1;  // the last byte is kept for the terminator
+  choose_from(cct->random(), table, dest, len);
+  dest[len] = 0;
+}
+
+std::string gen_rand_alphanumeric_no_underscore(CephContext *cct, size_t size)
+{
+  // one extra byte for the \0 that the char* overload writes
+  std::string out(size + 1, '\0');
+  gen_rand_alphanumeric_no_underscore(cct, out.data(), out.size());
+  out.resize(size); // drop the extra \0
+  return out;
+}
+
+
+void gen_rand_alphanumeric_plain(CephContext *cct, char *dest, size_t size) /* size should be the required string size + 1 */
+{
+  // letters and digits only, no punctuation
+  static constexpr char table[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  const size_t len = size - 1;  // the last byte is kept for the terminator
+  choose_from(cct->random(), table, dest, len);
+  dest[len] = 0;
+}
+
+std::string gen_rand_alphanumeric_plain(CephContext *cct, size_t size)
+{
+  // one extra byte for the \0 that the char* overload writes
+  std::string out(size + 1, '\0');
+  gen_rand_alphanumeric_plain(cct, out.data(), out.size());
+  out.resize(size); // drop the extra \0
+  return out;
+}
diff --git a/src/common/random_string.h b/src/common/random_string.h
new file mode 100644
index 00000000..87c1644c
--- /dev/null
+++ b/src/common/random_string.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2015 Yehuda Sadeh <yehuda@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <string>
+
+class CephContext;
+
+/*
+ * Buffer-filling variants: write a NUL-terminated random string into 'dest'.
+ * size should be the required string size + 1
+ */
+int gen_rand_base64(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric_lower(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric_upper(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric_no_underscore(CephContext *cct, char *dest, size_t size);
+void gen_rand_alphanumeric_plain(CephContext *cct, char *dest, size_t size);
+
+// returns a std::string with 'size' random characters
+// (no "+ 1" here: 'size' is the length of the returned string)
+std::string gen_rand_alphanumeric(CephContext *cct, size_t size);
+std::string gen_rand_alphanumeric_lower(CephContext *cct, size_t size);
+std::string gen_rand_alphanumeric_upper(CephContext *cct, size_t size);
+std::string gen_rand_alphanumeric_no_underscore(CephContext *cct, size_t size);
+std::string gen_rand_alphanumeric_plain(CephContext *cct, size_t size);
diff --git a/src/common/reverse.c b/src/common/reverse.c
new file mode 100644
index 00000000..f65540d5
--- /dev/null
+++ b/src/common/reverse.c
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "reverse.h"
+
+/* Return v with the order of its 32 bits reversed. */
+uint32_t reverse_bits(uint32_t v) {
+  /*
+   * Exchange neighbouring groups of 1, 2, 4, 8 and finally 16 bits; every
+   * round doubles the width of the groups being swapped.
+   */
+  static const uint32_t low_groups[] = {
+    0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
+  };
+  unsigned round;
+
+  for (round = 0; round < 5; round++) {
+    const unsigned width = 1u << round;
+    const uint32_t mask = low_groups[round];
+    v = ((v >> width) & mask) | ((v & mask) << width);
+  }
+  return v;
+}
+
+/* Return retval with the order of its eight 4-bit nibbles reversed. */
+uint32_t reverse_nibbles(uint32_t retval) {
+  /* exchange the two nibbles inside every byte */
+  retval = ((retval << 4) & 0xf0f0f0f0) | ((retval >> 4) & 0x0f0f0f0f);
+  /* exchange the two bytes inside every 16-bit half */
+  retval = ((retval << 8) & 0xff00ff00) | ((retval >> 8) & 0x00ff00ff);
+  /* exchange the two 16-bit halves */
+  return (retval << 16) | (retval >> 16);
+}
diff --git a/src/common/reverse.h b/src/common/reverse.h
new file mode 100644
index 00000000..9a199a84
--- /dev/null
+++ b/src/common/reverse.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef __CEPH_OS_REVERSE_H
+#define __CEPH_OS_REVERSE_H
+
+#include "include/int_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* reverse the order of all 32 bits of v */
+extern uint32_t reverse_bits(uint32_t v);
+/* reverse the order of the eight 4-bit nibbles of retval */
+extern uint32_t reverse_nibbles(uint32_t retval);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif    
diff --git a/src/common/run_cmd.cc b/src/common/run_cmd.cc
new file mode 100644
index 00000000..a84f68ae
--- /dev/null
+++ b/src/common/run_cmd.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+
+#include <sstream>
+#include <stdarg.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <vector>
+
+using std::ostringstream;
+
+// Fork, exec 'cmd' with the NULL-terminated variadic argument list and wait
+// for it.  Returns "" when the child exits with status 0, otherwise a
+// "run_cmd(<cmd>): ..." description of what went wrong.
+std::string run_cmd(const char *cmd, ...)
+{
+  // argv for execvp(): cmd, every variadic argument up to the NULL sentinel,
+  // then the terminating NULL itself
+  std::vector <const char *> argv;
+  va_list args;
+  va_start(args, cmd);
+  const char *arg = cmd;
+  do {
+    argv.push_back(arg);
+    arg = va_arg(args, const char*);
+  } while (arg != NULL);
+  va_end(args);
+  argv.push_back(NULL);
+
+  // every failure string shares the "run_cmd(<cmd>): " prefix
+  auto failure = [cmd](const auto&... parts) {
+    ostringstream oss;
+    oss << "run_cmd(" << cmd << "): ";
+    (oss << ... << parts);
+    return oss.str();
+  };
+
+  const int pid = fork();
+  if (pid == -1) {
+    const int err = errno;
+    return failure("unable to fork(): ", cpp_strerror(err));
+  }
+  if (pid == 0) {
+    // child: close the inherited stdio descriptors, then replace the image
+    close(STDIN_FILENO);
+    close(STDOUT_FILENO);
+    close(STDERR_FILENO);
+    // execvp doesn't modify its arguments, so the const-cast here is safe.
+    execvp(cmd, (char * const*)&argv[0]);
+    _exit(127);  // only reached when execvp() itself failed
+  }
+
+  // parent: reap the child, retrying when a signal interrupts the wait
+  int status;
+  while (waitpid(pid, &status, 0) == -1) {
+    const int err = errno;
+    if (err != EINTR) {
+      return failure("waitpid error: ", cpp_strerror(err));
+    }
+  }
+
+  if (WIFEXITED(status)) {
+    const int code = WEXITSTATUS(status);
+    return code == 0 ? std::string() : failure("exited with status ", code);
+  }
+  if (WIFSIGNALED(status)) {
+    return failure("terminated by signal");
+  }
+  return failure("terminated by unknown mechanism");
+}
diff --git a/src/common/run_cmd.h b/src/common/run_cmd.h
new file mode 100644
index 00000000..9d82a649
--- /dev/null
+++ b/src/common/run_cmd.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_RUN_CMD_H
+#define CEPH_COMMON_RUN_CMD_H
+
+#include <string>
+
+//
+// Fork a command and run it. The shell will not be invoked and shell
+// expansions will not be done.
+// This function takes a variable number of arguments. The last argument must
+// be NULL.
+//
+// Example:
+//   run_cmd("rm", "-rf", "foo", NULL)
+//
+// Returns an empty string on success, and an error string otherwise.
+//
+std::string run_cmd(const char *cmd, ...);
+
+#endif
diff --git a/src/common/safe_io.c b/src/common/safe_io.c
new file mode 100644
index 00000000..22369af3
--- /dev/null
+++ b/src/common/safe_io.c
@@ -0,0 +1,229 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/safe_io.h"
+#include "include/compat.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+/*
+ * Read up to 'count' bytes from 'fd' into 'buf', continuing after short
+ * reads and EINTR.  Returns the number of bytes read (less than 'count'
+ * only when EOF was reached) or -errno on failure.
+ */
+ssize_t safe_read(int fd, void *buf, size_t count)
+{
+	char *dst = (char *)buf;
+	size_t left = count;
+
+	while (left > 0) {
+		ssize_t got = read(fd, dst, left);
+		if (got > 0) {
+			dst += got;
+			left -= got;
+			continue;
+		}
+		if (got == 0)
+			break;	// EOF
+		if (errno != EINTR)
+			return -errno;
+	}
+	return count - left;
+}
+
+ssize_t safe_read_exact(int fd, void *buf, size_t count)
+{
+        ssize_t ret = safe_read(fd, buf, count);
+	if (ret < 0)
+		return ret;
+	if ((size_t)ret != count)
+		return -EDOM;
+	return 0;
+}
+ 
+/*
+ * Write all 'count' bytes at 'buf' to 'fd', continuing after short writes
+ * and EINTR.  Returns 0 on success or -errno on failure.
+ */
+ssize_t safe_write(int fd, const void *buf, size_t count)
+{
+	const char *src = (const char *)buf;
+	const char *const end = src + count;
+
+	while (src < end) {
+		ssize_t put = write(fd, src, end - src);
+		if (put >= 0) {
+			src += put;
+			continue;
+		}
+		if (errno != EINTR)
+			return -errno;
+	}
+	return 0;
+}
+
+/*
+ * Positional counterpart of safe_read(): read up to 'count' bytes starting
+ * at 'offset', continuing after short reads and EINTR.  Returns the number
+ * of bytes read (short only at EOF) or -errno on failure.
+ */
+ssize_t safe_pread(int fd, void *buf, size_t count, off_t offset)
+{
+	char *dst = (char *)buf;
+	size_t left = count;
+
+	while (left > 0) {
+		ssize_t got = pread(fd, dst, left, offset);
+		if (got > 0) {
+			dst += got;
+			offset += got;
+			left -= got;
+			continue;
+		}
+		if (got == 0)
+			break;	// EOF
+		if (errno != EINTR)
+			return -errno;
+	}
+	return count - left;
+}
+
+ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset)
+{
+	ssize_t got = safe_pread(fd, buf, count, offset);
+
+	if (got < 0)
+		return got;	/* pass the -errno value through */
+	/* a short count means EOF arrived before 'count' bytes did */
+	return ((size_t)got == count) ? 0 : -EDOM;
+}
+
+/*
+ * Positional counterpart of safe_write(): write all 'count' bytes at
+ * 'offset', continuing after short writes and EINTR.  Returns 0 on success
+ * or -errno on failure.
+ */
+ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+	const char *src = (const char *)buf;
+	size_t done = 0;
+
+	while (done < count) {
+		ssize_t put = pwrite(fd, src + done, count - done,
+				     offset + done);
+		if (put < 0) {
+			if (errno == EINTR)
+				continue;
+			return -errno;
+		}
+		done += put;
+	}
+	return 0;
+}
+
+#ifdef CEPH_HAVE_SPLICE
+/*
+ * splice() up to 'len' bytes from fd_in to fd_out, continuing after short
+ * transfers and EINTR.  Stops early at EOF or on EAGAIN and returns the
+ * number of bytes moved so far; any other error returns -errno.
+ */
+ssize_t safe_splice(int fd_in, off_t *off_in, int fd_out, off_t *off_out,
+		    size_t len, unsigned int flags)
+{
+  size_t left = len;
+
+  while (left > 0) {
+    ssize_t moved = splice(fd_in, off_in, fd_out, off_out, left, flags);
+    if (moved > 0) {
+      left -= moved;
+      continue;
+    }
+    if (moved == 0)
+      break;  // EOF
+    if (errno == EINTR)
+      continue;
+    if (errno == EAGAIN)
+      break;
+    return -errno;
+  }
+  return len - left;
+}
+
+ssize_t safe_splice_exact(int fd_in, off_t *off_in, int fd_out,
+			  off_t *off_out, size_t len, unsigned int flags)
+{
+  ssize_t moved = safe_splice(fd_in, off_in, fd_out, off_out, len, flags);
+
+  if (moved < 0)
+    return moved;  /* pass the -errno value through */
+  /* anything short of 'len' is reported as -EDOM */
+  return ((size_t)moved == len) ? 0 : -EDOM;
+}
+#endif
+
+/*
+ * Replace <base>/<file> with the 'vallen' bytes at 'val'.
+ *
+ * The data is written to <base>/<file>.tmp, fsync'ed, renamed over the
+ * target, and finally the directory 'base' is fsync'ed.  Nothing is written
+ * when the existing file already holds exactly 'val'.  'mode' is the
+ * permission argument handed to open() when the temporary file is created.
+ *
+ * Returns 0 on success or a negative errno value.  The temporary file is
+ * removed again when writing, syncing or renaming it fails.
+ */
+int safe_write_file(const char *base, const char *file,
+		    const char *val, size_t vallen,
+		    unsigned mode)
+{
+  int ret;
+  char fn[PATH_MAX];
+  char tmp[PATH_MAX];
+  int fd;
+
+  // does the file already have correct content?  Only trust the comparison
+  // when val is shorter than the probe buffer: reading exactly vallen bytes
+  // then means EOF was hit, i.e. the file holds nothing beyond val.
+  char oldval[80];
+  if (vallen < sizeof(oldval)) {
+    ret = safe_read_file(base, file, oldval, sizeof(oldval));
+    if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
+      return 0;  // yes.
+  }
+
+  snprintf(fn, sizeof(fn), "%s/%s", base, file);
+  snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base, file);
+  fd = open(tmp, O_WRONLY|O_CREAT|O_TRUNC, mode);
+  if (fd < 0) {
+    ret = errno;
+    return -ret;
+  }
+  ret = safe_write(fd, val, vallen);
+  if (ret) {
+    VOID_TEMP_FAILURE_RETRY(close(fd));
+    unlink(tmp);  // don't leave a partially written temp file behind
+    return ret;
+  }
+
+  // make the data durable before it becomes visible under the final name
+  ret = fsync(fd);
+  if (ret < 0) ret = -errno;
+  VOID_TEMP_FAILURE_RETRY(close(fd));
+  if (ret < 0) {
+    unlink(tmp);
+    return ret;
+  }
+  ret = rename(tmp, fn);
+  if (ret < 0) {
+    ret = -errno;
+    unlink(tmp);
+    return ret;
+  }
+
+  // sync the containing directory as well
+  fd = open(base, O_RDONLY);
+  if (fd < 0) {
+    ret = -errno;
+    return ret;
+  }
+  ret = fsync(fd);
+  if (ret < 0) ret = -errno;
+  VOID_TEMP_FAILURE_RETRY(close(fd));
+
+  return ret;
+}
+
+/*
+ * Read up to 'vallen' bytes of <base>/<file> into 'val'.  Returns the number
+ * of bytes read, or a negative errno value when the file cannot be opened or
+ * read.
+ */
+int safe_read_file(const char *base, const char *file,
+		   char *val, size_t vallen)
+{
+  char path[PATH_MAX];
+  int fd;
+  int nread;
+
+  snprintf(path, sizeof(path), "%s/%s", base, file);
+  fd = open(path, O_RDONLY);
+  if (fd < 0)
+    return -errno;
+
+  nread = safe_read(fd, val, vallen);
+  // the descriptor is closed the same way whether the read worked or not;
+  // close sometimes returns errors, but only after write()
+  VOID_TEMP_FAILURE_RETRY(close(fd));
+  return nread;
+}
diff --git a/src/common/safe_io.h b/src/common/safe_io.h
new file mode 100644
index 00000000..7ccbf37b
--- /dev/null
+++ b/src/common/safe_io.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SAFE_IO
+#define CEPH_SAFE_IO
+
+#include "common/compiler_extensions.h"
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /*
+   * Safe functions wrapping the raw read() and write() libc functions.
+   * These retry on EINTR and, on error, return -errno (instead of returning
+   * -1 and setting errno).
+   */
+  ssize_t safe_read(int fd, void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_write(int fd, const void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_pread(int fd, void *buf, size_t count, off_t offset)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset)
+      WARN_UNUSED_RESULT;
+#ifdef CEPH_HAVE_SPLICE
+  /*
+   * Similar to the above (non-exact version) and below (exact version).
+   * See splice(2) for parameter descriptions.
+   */
+  ssize_t safe_splice(int fd_in, off_t *off_in, int fd_out, off_t *off_out,
+		      size_t len, unsigned int flags)
+    WARN_UNUSED_RESULT;
+  ssize_t safe_splice_exact(int fd_in, off_t *off_in, int fd_out,
+			    off_t *off_out, size_t len, unsigned int flags)
+    WARN_UNUSED_RESULT;
+#endif
+
+  /*
+   * Same as the above functions, but return -EDOM unless exactly the requested
+   * number of bytes can be read.
+   */
+  ssize_t safe_read_exact(int fd, void *buf, size_t count)
+      WARN_UNUSED_RESULT;
+  ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset)
+      WARN_UNUSED_RESULT;
+
+
+  /*
+   * Safe functions to read and write an entire file.
+   */
+  int safe_write_file(const char *base, const char *file,
+		      const char *val, size_t vallen,
+		      unsigned mode);
+  int safe_read_file(const char *base, const char *file,
+		     char *val, size_t vallen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/scrub_types.cc b/src/common/scrub_types.cc
new file mode 100644
index 00000000..d0385138
--- /dev/null
+++ b/src/common/scrub_types.cc
@@ -0,0 +1,290 @@
+#include "scrub_types.h"
+
+using namespace librados;
+
+// Append name, nspace, locator and snap to 'bl', in that order, inside an
+// ENCODE_START(1, 1) / ENCODE_FINISH envelope.
+void object_id_wrapper::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(name, bl);
+  encode(nspace, bl);
+  encode(locator, bl);
+  encode(snap, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read the fields back from 'bp' in the order encode() wrote them.
+void object_id_wrapper::decode(bufferlist::const_iterator& bp)
+{
+  DECODE_START(1, bp);
+  decode(name, bp);
+  decode(nspace, bp);
+  decode(locator, bp);
+  decode(snap, bp);
+  DECODE_FINISH(bp);
+}
+
+namespace librados {
+// File-local encode() overload for the plain librados type: it views 'obj'
+// as an object_id_wrapper and uses the wrapper's encoder.
+static void encode(const object_id_t& obj, bufferlist& bl)
+{
+  reinterpret_cast<const object_id_wrapper&>(obj).encode(bl);
+}
+}
+
+// Append osd then shard to 'bl' inside an ENCODE_START(1, 1) envelope.
+void osd_shard_wrapper::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(osd, bl);
+  encode(shard, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read osd then shard back from 'bp', mirroring encode().
+void osd_shard_wrapper::decode(bufferlist::const_iterator& bp)
+{
+  DECODE_START(1, bp);
+  decode(osd, bp);
+  decode(shard, bp);
+  DECODE_FINISH(bp);
+}
+
+namespace librados {
+// File-local encode() overload for the plain librados type: it views 'shard'
+// as an osd_shard_wrapper and uses the wrapper's encoder.
+static void encode(const osd_shard_t& shard, bufferlist& bl) {
+  reinterpret_cast<const osd_shard_wrapper&>(shard).encode(bl);
+}
+}
+
+// Copy the attributes, the size and whichever digests are present from a
+// scrub-map entry into this shard record.  A digest that is not present in
+// 'object' leaves the corresponding fields here untouched.
+void shard_info_wrapper::set_object(const ScrubMap::object& object)
+{
+  // iterate by reference: iterating by value copied every attribute (name
+  // and buffer handle) once per loop before it was copied again into 'attrs'
+  for (const auto& attr : object.attrs) {
+    bufferlist bl;
+    bl.push_back(attr.second);
+    attrs.insert(std::make_pair(attr.first, std::move(bl)));
+  }
+  size = object.size;
+  if (object.omap_digest_present) {
+    omap_digest_present = true;
+    omap_digest = object.omap_digest;
+  }
+  if (object.digest_present) {
+    data_digest_present = true;
+    data_digest = object.digest;
+  }
+}
+
+// Append this shard record to 'bl'.  errors and primary are always written;
+// the remaining fields are written only when has_shard_missing() is false.
+void shard_info_wrapper::encode(bufferlist& bl) const
+{
+  ENCODE_START(3, 3, bl);
+  encode(errors, bl);
+  encode(primary, bl);
+  if (!has_shard_missing()) {
+    encode(attrs, bl);
+    encode(size, bl);
+    encode(omap_digest_present, bl);
+    encode(omap_digest, bl);
+    encode(data_digest_present, bl);
+    encode(data_digest, bl);
+    encode(selected_oi, bl);
+  }
+  ENCODE_FINISH(bl);
+}
+
+// Read a shard record from 'bp'.  The freshly decoded 'errors' value decides
+// (through has_shard_missing()) whether the remaining fields are read,
+// mirroring the condition used by encode().
+void shard_info_wrapper::decode(bufferlist::const_iterator& bp)
+{
+  DECODE_START(3, bp);
+  decode(errors, bp);
+  decode(primary, bp);
+  if (!has_shard_missing()) {
+    decode(attrs, bp);
+    decode(size, bp);
+    decode(omap_digest_present, bp);
+    decode(omap_digest, bp);
+    decode(data_digest_present, bp);
+    decode(data_digest, bp);
+    decode(selected_oi, bp);
+  }
+  DECODE_FINISH(bp);
+}
+
+// Build the record for 'hoid': the base object id is filled from the
+// object's name, namespace, key and snap.
+inconsistent_obj_wrapper::inconsistent_obj_wrapper(const hobject_t& hoid)
+  : inconsistent_obj_t{librados::object_id_t{hoid.oid.name,
+                                 hoid.nspace,
+                                 hoid.get_key(), hoid.snap}}
+{}
+
+// Record 'shard' under the (osd, shard) key taken from 'pgs' and fold its
+// error bits into union_shards.errors.
+void inconsistent_obj_wrapper::add_shard(const pg_shard_t& pgs,
+                                         const shard_info_wrapper& shard)
+{
+  union_shards.errors |= shard.errors;
+  shards.emplace(osd_shard_t{pgs.osd, int8_t(pgs.shard)}, shard);
+}
+
+// For every shard in 'maps': mark its entry in 'shard_map' as primary or
+// not, record the object found in that shard's scrub map (or flag the shard
+// as missing it), count the shard in deep_errors or shallow_errors when it
+// has errors, and add it to 'shards' / 'union_shards'.
+void
+inconsistent_obj_wrapper::set_auth_missing(const hobject_t& hoid,
+                                           const map<pg_shard_t, ScrubMap*>& maps,
+					   map<pg_shard_t, shard_info_wrapper> &shard_map,
+					   int &shallow_errors, int &deep_errors,
+					   const pg_shard_t &primary)
+{
+  for (const auto& pg_map : maps) {
+    const pg_shard_t& pgs = pg_map.first;
+    // look the shard's record up (creating it if needed) once instead of
+    // repeating the map lookup for every use below
+    shard_info_wrapper& info = shard_map[pgs];
+    auto oid_object = pg_map.second->objects.find(hoid);
+    info.primary = (pgs == primary);
+    if (oid_object == pg_map.second->objects.end())
+      info.set_missing();
+    else
+      info.set_object(oid_object->second);
+    // a shard is counted once: as deep if it has deep errors, else shallow
+    if (info.has_deep_errors())
+      ++deep_errors;
+    else if (info.has_shallow_errors())
+      ++shallow_errors;
+    union_shards.errors |= info.errors;
+    shards.emplace(osd_shard_t{pgs.osd, pgs.shard}, info);
+  }
+}
+
+namespace librados {
+  // File-local encode() overload for the plain librados type: it views
+  // 'shard' as a shard_info_wrapper and uses the wrapper's encoder.
+  static void encode(const shard_info_t& shard, bufferlist& bl)
+  {
+    reinterpret_cast<const shard_info_wrapper&>(shard).encode(bl);
+  }
+}
+
+// Append errors, object, version, shards and union_shards.errors to 'bl',
+// in that order, inside an ENCODE_START(2, 2) envelope.
+void inconsistent_obj_wrapper::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 2, bl);
+  encode(errors, bl);
+  encode(object, bl);
+  encode(version, bl);
+  encode(shards, bl);
+  encode(union_shards.errors, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read the fields back from 'bp' in the order encode() wrote them.
+// NOTE(review): DECODE_OLDEST(2) presumably rejects encodings older than
+// version 2 -- confirm against the macro's definition.
+void inconsistent_obj_wrapper::decode(bufferlist::const_iterator& bp)
+{
+  DECODE_START(2, bp);
+  DECODE_OLDEST(2);
+  decode(errors, bp);
+  decode(object, bp);
+  decode(version, bp);
+  decode(shards, bp);
+  decode(union_shards.errors, bp);
+  DECODE_FINISH(bp);
+}
+
+// Build the record for 'hoid': the base object id is filled from the
+// object's name, namespace, key and snap.
+inconsistent_snapset_wrapper::inconsistent_snapset_wrapper(const hobject_t& hoid)
+  : inconsistent_snapset_t{object_id_t{hoid.oid.name,
+                                       hoid.nspace,
+                                       hoid.get_key(),
+                                       hoid.snap}}
+{}
+
+using inc_snapset_t = inconsistent_snapset_t;
+
+// set the HEADLESS_CLONE bit in 'errors'
+void inconsistent_snapset_wrapper::set_headless()
+{
+  errors |= inc_snapset_t::HEADLESS_CLONE;
+}
+
+// set the SNAPSET_MISSING bit in 'errors'
+void inconsistent_snapset_wrapper::set_snapset_missing()
+{
+  errors |= inc_snapset_t::SNAPSET_MISSING;
+}
+
+// set the INFO_MISSING bit in 'errors'
+void inconsistent_snapset_wrapper::set_info_missing()
+{
+  errors |= inc_snapset_t::INFO_MISSING;
+}
+
+// set the SNAPSET_CORRUPTED bit in 'errors'
+void inconsistent_snapset_wrapper::set_snapset_corrupted()
+{
+  errors |= inc_snapset_t::SNAPSET_CORRUPTED;
+}
+
+// set the INFO_CORRUPTED bit in 'errors'
+void inconsistent_snapset_wrapper::set_info_corrupted()
+{
+  errors |= inc_snapset_t::INFO_CORRUPTED;
+}
+
+// set the CLONE_MISSING bit in 'errors' and append 'snap' to 'missing'
+void inconsistent_snapset_wrapper::set_clone_missing(snapid_t snap)
+{
+  errors |= inc_snapset_t::CLONE_MISSING;
+  missing.push_back(snap);
+}
+
+// set the EXTRA_CLONES bit in 'errors' and append 'snap' to 'clones'
+void inconsistent_snapset_wrapper::set_clone(snapid_t snap)
+{
+  errors |= inc_snapset_t::EXTRA_CLONES;
+  clones.push_back(snap);
+}
+
+// set the SNAP_ERROR bit in 'errors'
+void inconsistent_snapset_wrapper::set_snapset_error()
+{
+  errors |= inc_snapset_t::SNAP_ERROR;
+}
+
+// set the SIZE_MISMATCH bit in 'errors'
+void inconsistent_snapset_wrapper::set_size_mismatch()
+{
+  errors |= inc_snapset_t::SIZE_MISMATCH;
+}
+
+// Append errors, object, clones, missing and ss_bl to 'bl', in that order,
+// inside an ENCODE_START(2, 1) envelope.
+void inconsistent_snapset_wrapper::encode(bufferlist& bl) const
+{
+  ENCODE_START(2, 1, bl);
+  encode(errors, bl);
+  encode(object, bl);
+  encode(clones, bl);
+  encode(missing, bl);
+  encode(ss_bl, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read the fields back from 'bp' in encode() order.  ss_bl is only read
+// when the decoded struct_v is at least 2.
+void inconsistent_snapset_wrapper::decode(bufferlist::const_iterator& bp)
+{
+  DECODE_START(2, bp);
+  decode(errors, bp);
+  decode(object, bp);
+  decode(clones, bp);
+  decode(missing, bp);
+  if (struct_v >= 2) {
+    decode(ss_bl, bp);
+  }
+  DECODE_FINISH(bp);
+}
+
+// Append interval, get_snapsets, the name/nspace/snap of start_after and
+// max_return to 'bl'.  start_after.locator is not written.
+void scrub_ls_arg_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(interval, bl);
+  encode(get_snapsets, bl);
+  encode(start_after.name, bl);
+  encode(start_after.nspace, bl);
+  encode(start_after.snap, bl);
+  encode(max_return, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read the fields back from 'bp' in the order encode() wrote them;
+// start_after.locator is not part of the encoding and is left as it was.
+void scrub_ls_arg_t::decode(bufferlist::const_iterator& bp)
+{
+  DECODE_START(1, bp);
+  decode(interval, bp);
+  decode(get_snapsets, bp);
+  decode(start_after.name, bp);
+  decode(start_after.nspace, bp);
+  decode(start_after.snap, bp);
+  decode(max_return, bp);
+  DECODE_FINISH(bp);
+}
+
+// Append interval then vals to 'bl' inside an ENCODE_START(1, 1) envelope.
+void scrub_ls_result_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(interval, bl);
+  encode(vals, bl);
+  ENCODE_FINISH(bl);
+}
+
+// Read interval then vals back from 'bp', mirroring encode().
+void scrub_ls_result_t::decode(bufferlist::const_iterator& bp)
+{
+  DECODE_START(1, bp);
+  decode(interval, bp);
+  decode(vals, bp);
+  DECODE_FINISH(bp);
+}
diff --git a/src/common/scrub_types.h b/src/common/scrub_types.h
new file mode 100644
index 00000000..61a74b81
--- /dev/null
+++ b/src/common/scrub_types.h
@@ -0,0 +1,210 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_SCRUB_TYPES_H
+#define CEPH_SCRUB_TYPES_H
+
+#include "osd/osd_types.h"
+
+// wrappers around scrub types to offer the necessary bits other than
+// the minimal set that librados requires
+
+// librados::object_id_t plus construction from an hobject_t and
+// encode/decode; declares no data members of its own.
+struct object_id_wrapper : public librados::object_id_t {
+  explicit object_id_wrapper(const hobject_t& hoid)
+    : object_id_t{hoid.oid.name, hoid.nspace, hoid.get_key(), hoid.snap}
+  {}
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+};
+
+WRITE_CLASS_ENCODER(object_id_wrapper)
+
+namespace librados {
+// decode() overload for the plain librados type: views 'obj' as an
+// object_id_wrapper and uses the wrapper's decoder.
+inline void decode(object_id_t& obj, bufferlist::const_iterator& bp) {
+  reinterpret_cast<object_id_wrapper&>(obj).decode(bp);
+}
+}
+
+// librados::osd_shard_t plus encode/decode; declares no data members of
+// its own.
+struct osd_shard_wrapper : public librados::osd_shard_t {
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bp);
+};
+
+WRITE_CLASS_ENCODER(osd_shard_wrapper)
+
+namespace librados {
+  // decode() overload for the plain librados type: views 'shard' as an
+  // osd_shard_wrapper and uses the wrapper's decoder.
+  inline void decode(librados::osd_shard_t& shard, bufferlist::const_iterator& bp) {
+    reinterpret_cast<osd_shard_wrapper&>(shard).decode(bp);
+  }
+}
+
+// librados::shard_info_t plus helpers for filling it during scrub: copying
+// data from a ScrubMap::object, setting/clearing individual err_t bits in
+// 'errors', and encode/decode.  Declares no data members of its own.
+struct shard_info_wrapper : public librados::shard_info_t {
+public:
+  shard_info_wrapper() = default;
+  // start from the contents of a scrub-map entry (see set_object())
+  explicit shard_info_wrapper(const ScrubMap::object& object) {
+    set_object(object);
+  }
+  void set_object(const ScrubMap::object& object);
+
+  // each setter below ORs exactly one err_t bit into 'errors'
+  void set_missing() {
+    errors |= err_t::SHARD_MISSING;
+  }
+  void set_omap_digest_mismatch_info() {
+    errors |= err_t::OMAP_DIGEST_MISMATCH_INFO;
+  }
+  void set_size_mismatch_info() {
+    errors |= err_t::SIZE_MISMATCH_INFO;
+  }
+  void set_data_digest_mismatch_info() {
+    errors |= err_t::DATA_DIGEST_MISMATCH_INFO;
+  }
+  void set_read_error() {
+    errors |= err_t::SHARD_READ_ERR;
+  }
+  void set_stat_error() {
+    errors |= err_t::SHARD_STAT_ERR;
+  }
+  void set_ec_hash_mismatch() {
+    errors |= err_t::SHARD_EC_HASH_MISMATCH;
+  }
+  void set_ec_size_mismatch() {
+    errors |= err_t::SHARD_EC_SIZE_MISMATCH;
+  }
+  void set_info_missing() {
+    errors |= err_t::INFO_MISSING;
+  }
+  void set_info_corrupted() {
+    errors |= err_t::INFO_CORRUPTED;
+  }
+  void set_snapset_missing() {
+    errors |= err_t::SNAPSET_MISSING;
+  }
+  void set_snapset_corrupted() {
+    errors |= err_t::SNAPSET_CORRUPTED;
+  }
+  void set_obj_size_info_mismatch() {
+    errors |= err_t::OBJ_SIZE_INFO_MISMATCH;
+  }
+  void set_hinfo_missing() {
+    errors |= err_t::HINFO_MISSING;
+  }
+  void set_hinfo_corrupted() {
+    errors |= err_t::HINFO_CORRUPTED;
+  }
+  // true when DATA_DIGEST_MISMATCH_INFO is the one and only bit set
+  bool only_data_digest_mismatch_info() const {
+    return errors == err_t::DATA_DIGEST_MISMATCH_INFO;
+  }
+  // clear that bit again, leaving all other error bits as they are
+  void clear_data_digest_mismatch_info() {
+    errors &= ~err_t::DATA_DIGEST_MISMATCH_INFO;
+  }
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bp);
+};
+
+WRITE_CLASS_ENCODER(shard_info_wrapper)
+
+namespace librados {
+  // decode() overload for the plain librados type: views 'shard' as a
+  // shard_info_wrapper and uses the wrapper's decoder.
+  inline void decode(librados::shard_info_t& shard,
+		     bufferlist::const_iterator& bp) {
+    reinterpret_cast<shard_info_wrapper&>(shard).decode(bp);
+  }
+}
+
+// librados::inconsistent_obj_t plus helpers for building it during scrub:
+// setters for the object-level obj_err_t bits in 'errors', per-shard
+// bookkeeping, and encode/decode.  Declares no data members of its own.
+struct inconsistent_obj_wrapper : librados::inconsistent_obj_t {
+  explicit inconsistent_obj_wrapper(const hobject_t& hoid);
+
+  // each setter below ORs exactly one obj_err_t bit into 'errors'
+  void set_object_info_inconsistency() {
+    errors |= obj_err_t::OBJECT_INFO_INCONSISTENCY;
+  }
+  void set_omap_digest_mismatch() {
+    errors |= obj_err_t::OMAP_DIGEST_MISMATCH;
+  }
+  void set_data_digest_mismatch() {
+    errors |= obj_err_t::DATA_DIGEST_MISMATCH;
+  }
+  void set_size_mismatch() {
+    errors |= obj_err_t::SIZE_MISMATCH;
+  }
+  void set_attr_value_mismatch() {
+    errors |= obj_err_t::ATTR_VALUE_MISMATCH;
+  }
+  void set_attr_name_mismatch() {
+    errors |= obj_err_t::ATTR_NAME_MISMATCH;
+  }
+  void set_snapset_inconsistency() {
+    errors |= obj_err_t::SNAPSET_INCONSISTENCY;
+  }
+  void set_hinfo_inconsistency() {
+    errors |= obj_err_t::HINFO_INCONSISTENCY;
+  }
+  void set_size_too_large() {
+    errors |= obj_err_t::SIZE_TOO_LARGE;
+  }
+  // record one shard and merge its error bits into union_shards
+  void add_shard(const pg_shard_t& pgs, const shard_info_wrapper& shard);
+  // fill the shard records for 'hoid' from every scrub map, flagging the
+  // shards that do not have the object, and update the error counters
+  void set_auth_missing(const hobject_t& hoid,
+                        const map<pg_shard_t, ScrubMap*>&,
+			map<pg_shard_t, shard_info_wrapper>&,
+			int &shallow_errors, int &deep_errors,
+			const pg_shard_t &primary);
+  void set_version(uint64_t ver) { version = ver; }
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bp);
+};
+
+WRITE_CLASS_ENCODER(inconsistent_obj_wrapper)
+
+// decode() overload for the plain librados type: views 'obj' as an
+// inconsistent_obj_wrapper and uses the wrapper's decoder.  Unlike the other
+// decode() shims in this header, this one is declared at global scope.
+inline void decode(librados::inconsistent_obj_t& obj,
+		   bufferlist::const_iterator& bp) {
+  reinterpret_cast<inconsistent_obj_wrapper&>(obj).decode(bp);
+}
+
+// librados::inconsistent_snapset_t plus setters for its error bits and
+// encode/decode.  Declares no data members of its own.
+struct inconsistent_snapset_wrapper : public librados::inconsistent_snapset_t {
+  inconsistent_snapset_wrapper() = default;
+  explicit inconsistent_snapset_wrapper(const hobject_t& head);
+  // sets HEADLESS_CLONE
+  void set_headless();
+  // soid claims that it is a head or a snapdir, but its SS_ATTR
+  // is missing.
+  void set_snapset_missing();
+  void set_info_missing();
+  void set_snapset_corrupted();
+  void set_info_corrupted();
+  // snapset with missing clone; the snap is also appended to 'missing'
+  void set_clone_missing(snapid_t);
+  // Clones that are there; the snap is also appended to 'clones'
+  void set_clone(snapid_t);
+  // the snapset is not consistent with itself
+  void set_snapset_error();
+  void set_size_mismatch();
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bp);
+};
+
+WRITE_CLASS_ENCODER(inconsistent_snapset_wrapper)
+
+namespace librados {
+  // decode() overload for the plain librados type: views 'snapset' as an
+  // inconsistent_snapset_wrapper and uses the wrapper's decoder.
+  inline void decode(librados::inconsistent_snapset_t& snapset,
+		     bufferlist::const_iterator& bp) {
+    reinterpret_cast<inconsistent_snapset_wrapper&>(snapset).decode(bp);
+  }
+}
+
+// Encodable argument record.  None of the scalar members has a default
+// initializer, so they must be assigned before encode() is called.
+// encode()/decode() in scrub_types.cc carry only the name, nspace and snap
+// of start_after (not its locator).
+struct scrub_ls_arg_t {
+  uint32_t interval;
+  uint32_t get_snapsets;
+  librados::object_id_t start_after;
+  uint64_t max_return;
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+};
+
+WRITE_CLASS_ENCODER(scrub_ls_arg_t);
+
+// Encodable result record: an interval plus a list of opaque buffers.
+// 'interval' has no default initializer.
+struct scrub_ls_result_t {
+  epoch_t interval;
+  std::vector<bufferlist> vals;
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+};
+
+WRITE_CLASS_ENCODER(scrub_ls_result_t);
+
+#endif
diff --git a/src/common/sctp_crc32.c b/src/common/sctp_crc32.c
new file mode 100644
index 00000000..964216d7
--- /dev/null
+++ b/src/common/sctp_crc32.c
@@ -0,0 +1,789 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_crc32.c,v 1.12 2005/03/06 16:04:17 itojun Exp $	 */
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/cdefs.h>
+#endif
+#if 0
+__FBSDID("$FreeBSD: src/sys/netinet/sctp_crc32.c,v 1.8 2007/05/08 17:01:10 rrs Exp $");
+
+
+#include <netinet/sctp_os.h>
+#include <netinet/sctp_crc32.h>
+#endif
+
+#include <stdint.h>
+
+#include "acconfig.h"
+
+#ifndef SCTP_USE_ADLER32
+
+
+/**
+ *
+ * Routine Description:
+ *
+ * Computes the CRC32c checksum for the specified buffer using the slicing by 8
+ * algorithm over 64 bit quantities.
+ *
+ * Arguments:
+ *
+ *		crc - the initial or running remainder value used in the
+ *		      CRC computation (passed by value)
+ *		p_buf - the packet buffer where crc computations are being performed
+ *		length - the length of p_buf in bytes
+ *		offset - alignment offset of p_buf: (4 - offset) & 0x3 initial
+ *			 bytes (at most length) are processed one at a time
+ *			 before the main loop, which walks the buffer in
+ *			 8-byte groups
+ *
+ * Return value:
+ *
+ *		The computed CRC32c value
+ */
+
+
+/*
+ * Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved
+ *
+ *
+ * This software program is licensed subject to the BSD License, available at
+ * http://www.opensource.org/licenses/bsd-license.html.
+ *
+ * Abstract:
+ *
+ * Tables for software CRC generation
+ */
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices (Slice Lengths = 8 8 8 8 8 8 8 8)
+ * File Name = ............................ 8x256_tables.c (Directory Name = .\)
+ */
+
+uint32_t sctp_crc_tableil8_o32[256] =
+{
+	0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+	0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+	0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+	0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+	0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+	0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+	0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+	0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+	0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+	0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+	0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+	0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+	0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+	0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+	0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+	0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+	0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+	0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+	0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+	0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+	0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+	0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+	0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+	0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+	0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+	0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+	0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+	0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+	0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+	0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+	0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+	0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o32
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices (Slice Lengths = 8 8 8 8 8 8 8 8)
+ * File Name = ............................ 8x256_tables.c (Directory Name = .\)
+ */
+
+uint32_t sctp_crc_tableil8_o40[256] =
+{
+	0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899, 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945,
+	0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21, 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD,
+	0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918, 0x714F905D, 0x62ED082A, 0x560AA0B3, 0x45A838C4,
+	0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0, 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C,
+	0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B, 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47,
+	0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823, 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF,
+	0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A, 0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6,
+	0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2, 0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E,
+	0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D, 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41,
+	0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25, 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9,
+	0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C, 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0,
+	0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4, 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78,
+	0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F, 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43,
+	0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27, 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB,
+	0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E, 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2,
+	0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6, 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A,
+	0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260, 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC,
+	0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8, 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004,
+	0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1, 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D,
+	0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059, 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185,
+	0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162, 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE,
+	0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA, 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306,
+	0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3, 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F,
+	0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B, 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287,
+	0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464, 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8,
+	0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC, 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600,
+	0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5, 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439,
+	0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D, 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781,
+	0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766, 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA,
+	0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE, 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502,
+	0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7, 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B,
+	0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F, 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o40
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices (Slice Lengths = 8 8 8 8 8 8 8 8)
+ * File Name = ............................ 8x256_tables.c (Directory Name = .\)
+ */
+
+uint32_t sctp_crc_tableil8_o48[256] =
+{
+	0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073, 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469,
+	0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6, 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC,
+	0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9, 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3,
+	0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C, 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726,
+	0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67, 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D,
+	0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2, 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8,
+	0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED, 0x0F382284, 0xAA79B0FA, 0x40577089, 0xE516E2F7,
+	0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828, 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32,
+	0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA, 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0,
+	0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F, 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75,
+	0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20, 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A,
+	0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5, 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF,
+	0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE, 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4,
+	0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B, 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161,
+	0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634, 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E,
+	0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1, 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB,
+	0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730, 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A,
+	0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5, 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF,
+	0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA, 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0,
+	0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F, 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065,
+	0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24, 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E,
+	0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1, 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB,
+	0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE, 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4,
+	0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B, 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71,
+	0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9, 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3,
+	0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C, 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36,
+	0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63, 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79,
+	0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6, 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC,
+	0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD, 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 0xD9F668E7,
+	0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238, 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622,
+	0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177, 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D,
+	0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2, 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o48
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices (Slice Lengths = 8 8 8 8 8 8 8 8)
+ * File Name = ............................ 8x256_tables.c (Directory Name = .\)
+ */
+
+uint32_t sctp_crc_tableil8_o56[256] =
+{
+	0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939, 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA,
+	0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF, 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C,
+	0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804, 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7,
+	0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2, 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11,
+	0xD725148B, 0x0A60BE33, 0x6842370A, 0xB5079DB2, 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41,
+	0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54, 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7,
+	0x3E41A5B6, 0xE3040F0E, 0x81268637, 0x5C632C8F, 0x45639445, 0x98263EFD, 0xFA04B7C4, 0x27411D7C,
+	0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69, 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A,
+	0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE, 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D,
+	0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538, 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB,
+	0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3, 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610,
+	0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405, 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6,
+	0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255, 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6,
+	0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3, 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040,
+	0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368, 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B,
+	0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E, 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D,
+	0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006, 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5,
+	0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0, 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213,
+	0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B, 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8,
+	0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD, 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E,
+	0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D, 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E,
+	0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B, 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698,
+	0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0, 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443,
+	0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656, 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5,
+	0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1, 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12,
+	0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07, 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4,
+	0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC, 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F,
+	0xE6264403, 0x3B63EEBB, 0x59416782, 0x8404CD3A, 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9,
+	0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A, 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99,
+	0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C, 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F,
+	0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57, 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4,
+	0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1, 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o56
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices (Slice Lengths = 8 8 8 8 8 8 8 8)
+ * File Name = ............................ 8x256_tables.c (Directory Name = .\)
+ */
+
+uint32_t sctp_crc_tableil8_o64[256] =
+{
+	0x00000000, 0x38116FAC, 0x7022DF58, 0x4833B0F4, 0xE045BEB0, 0xD854D11C, 0x906761E8, 0xA8760E44,
+	0xC5670B91, 0xFD76643D, 0xB545D4C9, 0x8D54BB65, 0x2522B521, 0x1D33DA8D, 0x55006A79, 0x6D1105D5,
+	0x8F2261D3, 0xB7330E7F, 0xFF00BE8B, 0xC711D127, 0x6F67DF63, 0x5776B0CF, 0x1F45003B, 0x27546F97,
+	0x4A456A42, 0x725405EE, 0x3A67B51A, 0x0276DAB6, 0xAA00D4F2, 0x9211BB5E, 0xDA220BAA, 0xE2336406,
+	0x1BA8B557, 0x23B9DAFB, 0x6B8A6A0F, 0x539B05A3, 0xFBED0BE7, 0xC3FC644B, 0x8BCFD4BF, 0xB3DEBB13,
+	0xDECFBEC6, 0xE6DED16A, 0xAEED619E, 0x96FC0E32, 0x3E8A0076, 0x069B6FDA, 0x4EA8DF2E, 0x76B9B082,
+	0x948AD484, 0xAC9BBB28, 0xE4A80BDC, 0xDCB96470, 0x74CF6A34, 0x4CDE0598, 0x04EDB56C, 0x3CFCDAC0,
+	0x51EDDF15, 0x69FCB0B9, 0x21CF004D, 0x19DE6FE1, 0xB1A861A5, 0x89B90E09, 0xC18ABEFD, 0xF99BD151,
+	0x37516AAE, 0x0F400502, 0x4773B5F6, 0x7F62DA5A, 0xD714D41E, 0xEF05BBB2, 0xA7360B46, 0x9F2764EA,
+	0xF236613F, 0xCA270E93, 0x8214BE67, 0xBA05D1CB, 0x1273DF8F, 0x2A62B023, 0x625100D7, 0x5A406F7B,
+	0xB8730B7D, 0x806264D1, 0xC851D425, 0xF040BB89, 0x5836B5CD, 0x6027DA61, 0x28146A95, 0x10050539,
+	0x7D1400EC, 0x45056F40, 0x0D36DFB4, 0x3527B018, 0x9D51BE5C, 0xA540D1F0, 0xED736104, 0xD5620EA8,
+	0x2CF9DFF9, 0x14E8B055, 0x5CDB00A1, 0x64CA6F0D, 0xCCBC6149, 0xF4AD0EE5, 0xBC9EBE11, 0x848FD1BD,
+	0xE99ED468, 0xD18FBBC4, 0x99BC0B30, 0xA1AD649C, 0x09DB6AD8, 0x31CA0574, 0x79F9B580, 0x41E8DA2C,
+	0xA3DBBE2A, 0x9BCAD186, 0xD3F96172, 0xEBE80EDE, 0x439E009A, 0x7B8F6F36, 0x33BCDFC2, 0x0BADB06E,
+	0x66BCB5BB, 0x5EADDA17, 0x169E6AE3, 0x2E8F054F, 0x86F90B0B, 0xBEE864A7, 0xF6DBD453, 0xCECABBFF,
+	0x6EA2D55C, 0x56B3BAF0, 0x1E800A04, 0x269165A8, 0x8EE76BEC, 0xB6F60440, 0xFEC5B4B4, 0xC6D4DB18,
+	0xABC5DECD, 0x93D4B161, 0xDBE70195, 0xE3F66E39, 0x4B80607D, 0x73910FD1, 0x3BA2BF25, 0x03B3D089,
+	0xE180B48F, 0xD991DB23, 0x91A26BD7, 0xA9B3047B, 0x01C50A3F, 0x39D46593, 0x71E7D567, 0x49F6BACB,
+	0x24E7BF1E, 0x1CF6D0B2, 0x54C56046, 0x6CD40FEA, 0xC4A201AE, 0xFCB36E02, 0xB480DEF6, 0x8C91B15A,
+	0x750A600B, 0x4D1B0FA7, 0x0528BF53, 0x3D39D0FF, 0x954FDEBB, 0xAD5EB117, 0xE56D01E3, 0xDD7C6E4F,
+	0xB06D6B9A, 0x887C0436, 0xC04FB4C2, 0xF85EDB6E, 0x5028D52A, 0x6839BA86, 0x200A0A72, 0x181B65DE,
+	0xFA2801D8, 0xC2396E74, 0x8A0ADE80, 0xB21BB12C, 0x1A6DBF68, 0x227CD0C4, 0x6A4F6030, 0x525E0F9C,
+	0x3F4F0A49, 0x075E65E5, 0x4F6DD511, 0x777CBABD, 0xDF0AB4F9, 0xE71BDB55, 0xAF286BA1, 0x9739040D,
+	0x59F3BFF2, 0x61E2D05E, 0x29D160AA, 0x11C00F06, 0xB9B60142, 0x81A76EEE, 0xC994DE1A, 0xF185B1B6,
+	0x9C94B463, 0xA485DBCF, 0xECB66B3B, 0xD4A70497, 0x7CD10AD3, 0x44C0657F, 0x0CF3D58B, 0x34E2BA27,
+	0xD6D1DE21, 0xEEC0B18D, 0xA6F30179, 0x9EE26ED5, 0x36946091, 0x0E850F3D, 0x46B6BFC9, 0x7EA7D065,
+	0x13B6D5B0, 0x2BA7BA1C, 0x63940AE8, 0x5B856544, 0xF3F36B00, 0xCBE204AC, 0x83D1B458, 0xBBC0DBF4,
+	0x425B0AA5, 0x7A4A6509, 0x3279D5FD, 0x0A68BA51, 0xA21EB415, 0x9A0FDBB9, 0xD23C6B4D, 0xEA2D04E1,
+	0x873C0134, 0xBF2D6E98, 0xF71EDE6C, 0xCF0FB1C0, 0x6779BF84, 0x5F68D028, 0x175B60DC, 0x2F4A0F70,
+	0xCD796B76, 0xF56804DA, 0xBD5BB42E, 0x854ADB82, 0x2D3CD5C6, 0x152DBA6A, 0x5D1E0A9E, 0x650F6532,
+	0x081E60E7, 0x300F0F4B, 0x783CBFBF, 0x402DD013, 0xE85BDE57, 0xD04AB1FB, 0x9879010F, 0xA0686EA3
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o64
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices (Slice Lengths = 8 8 8 8 8 8 8 8)
+ * File Name = ............................ 8x256_tables.c (Directory Name = .\)
+ */
+
+uint32_t sctp_crc_tableil8_o72[256] =
+{
+	0x00000000, 0xEF306B19, 0xDB8CA0C3, 0x34BCCBDA, 0xB2F53777, 0x5DC55C6E, 0x697997B4, 0x8649FCAD,
+	0x6006181F, 0x8F367306, 0xBB8AB8DC, 0x54BAD3C5, 0xD2F32F68, 0x3DC34471, 0x097F8FAB, 0xE64FE4B2,
+	0xC00C303E, 0x2F3C5B27, 0x1B8090FD, 0xF4B0FBE4, 0x72F90749, 0x9DC96C50, 0xA975A78A, 0x4645CC93,
+	0xA00A2821, 0x4F3A4338, 0x7B8688E2, 0x94B6E3FB, 0x12FF1F56, 0xFDCF744F, 0xC973BF95, 0x2643D48C,
+	0x85F4168D, 0x6AC47D94, 0x5E78B64E, 0xB148DD57, 0x370121FA, 0xD8314AE3, 0xEC8D8139, 0x03BDEA20,
+	0xE5F20E92, 0x0AC2658B, 0x3E7EAE51, 0xD14EC548, 0x570739E5, 0xB83752FC, 0x8C8B9926, 0x63BBF23F,
+	0x45F826B3, 0xAAC84DAA, 0x9E748670, 0x7144ED69, 0xF70D11C4, 0x183D7ADD, 0x2C81B107, 0xC3B1DA1E,
+	0x25FE3EAC, 0xCACE55B5, 0xFE729E6F, 0x1142F576, 0x970B09DB, 0x783B62C2, 0x4C87A918, 0xA3B7C201,
+	0x0E045BEB, 0xE13430F2, 0xD588FB28, 0x3AB89031, 0xBCF16C9C, 0x53C10785, 0x677DCC5F, 0x884DA746,
+	0x6E0243F4, 0x813228ED, 0xB58EE337, 0x5ABE882E, 0xDCF77483, 0x33C71F9A, 0x077BD440, 0xE84BBF59,
+	0xCE086BD5, 0x213800CC, 0x1584CB16, 0xFAB4A00F, 0x7CFD5CA2, 0x93CD37BB, 0xA771FC61, 0x48419778,
+	0xAE0E73CA, 0x413E18D3, 0x7582D309, 0x9AB2B810, 0x1CFB44BD, 0xF3CB2FA4, 0xC777E47E, 0x28478F67,
+	0x8BF04D66, 0x64C0267F, 0x507CEDA5, 0xBF4C86BC, 0x39057A11, 0xD6351108, 0xE289DAD2, 0x0DB9B1CB,
+	0xEBF65579, 0x04C63E60, 0x307AF5BA, 0xDF4A9EA3, 0x5903620E, 0xB6330917, 0x828FC2CD, 0x6DBFA9D4,
+	0x4BFC7D58, 0xA4CC1641, 0x9070DD9B, 0x7F40B682, 0xF9094A2F, 0x16392136, 0x2285EAEC, 0xCDB581F5,
+	0x2BFA6547, 0xC4CA0E5E, 0xF076C584, 0x1F46AE9D, 0x990F5230, 0x763F3929, 0x4283F2F3, 0xADB399EA,
+	0x1C08B7D6, 0xF338DCCF, 0xC7841715, 0x28B47C0C, 0xAEFD80A1, 0x41CDEBB8, 0x75712062, 0x9A414B7B,
+	0x7C0EAFC9, 0x933EC4D0, 0xA7820F0A, 0x48B26413, 0xCEFB98BE, 0x21CBF3A7, 0x1577387D, 0xFA475364,
+	0xDC0487E8, 0x3334ECF1, 0x0788272B, 0xE8B84C32, 0x6EF1B09F, 0x81C1DB86, 0xB57D105C, 0x5A4D7B45,
+	0xBC029FF7, 0x5332F4EE, 0x678E3F34, 0x88BE542D, 0x0EF7A880, 0xE1C7C399, 0xD57B0843, 0x3A4B635A,
+	0x99FCA15B, 0x76CCCA42, 0x42700198, 0xAD406A81, 0x2B09962C, 0xC439FD35, 0xF08536EF, 0x1FB55DF6,
+	0xF9FAB944, 0x16CAD25D, 0x22761987, 0xCD46729E, 0x4B0F8E33, 0xA43FE52A, 0x90832EF0, 0x7FB345E9,
+	0x59F09165, 0xB6C0FA7C, 0x827C31A6, 0x6D4C5ABF, 0xEB05A612, 0x0435CD0B, 0x308906D1, 0xDFB96DC8,
+	0x39F6897A, 0xD6C6E263, 0xE27A29B9, 0x0D4A42A0, 0x8B03BE0D, 0x6433D514, 0x508F1ECE, 0xBFBF75D7,
+	0x120CEC3D, 0xFD3C8724, 0xC9804CFE, 0x26B027E7, 0xA0F9DB4A, 0x4FC9B053, 0x7B757B89, 0x94451090,
+	0x720AF422, 0x9D3A9F3B, 0xA98654E1, 0x46B63FF8, 0xC0FFC355, 0x2FCFA84C, 0x1B736396, 0xF443088F,
+	0xD200DC03, 0x3D30B71A, 0x098C7CC0, 0xE6BC17D9, 0x60F5EB74, 0x8FC5806D, 0xBB794BB7, 0x544920AE,
+	0xB206C41C, 0x5D36AF05, 0x698A64DF, 0x86BA0FC6, 0x00F3F36B, 0xEFC39872, 0xDB7F53A8, 0x344F38B1,
+	0x97F8FAB0, 0x78C891A9, 0x4C745A73, 0xA344316A, 0x250DCDC7, 0xCA3DA6DE, 0xFE816D04, 0x11B1061D,
+	0xF7FEE2AF, 0x18CE89B6, 0x2C72426C, 0xC3422975, 0x450BD5D8, 0xAA3BBEC1, 0x9E87751B, 0x71B71E02,
+	0x57F4CA8E, 0xB8C4A197, 0x8C786A4D, 0x63480154, 0xE501FDF9, 0x0A3196E0, 0x3E8D5D3A, 0xD1BD3623,
+	0x37F2D291, 0xD8C2B988, 0xEC7E7252, 0x034E194B, 0x8507E5E6, 0x6A378EFF, 0x5E8B4525, 0xB1BB2E3C
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o72
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices (Slice Lengths = 8 8 8 8 8 8 8 8)
+ * File Name = ............................ 8x256_tables.c (Directory Name = .\)
+ */
+
+uint32_t sctp_crc_tableil8_o80[256] =
+{
+	0x00000000, 0x68032CC8, 0xD0065990, 0xB8057558, 0xA5E0C5D1, 0xCDE3E919, 0x75E69C41, 0x1DE5B089,
+	0x4E2DFD53, 0x262ED19B, 0x9E2BA4C3, 0xF628880B, 0xEBCD3882, 0x83CE144A, 0x3BCB6112, 0x53C84DDA,
+	0x9C5BFAA6, 0xF458D66E, 0x4C5DA336, 0x245E8FFE, 0x39BB3F77, 0x51B813BF, 0xE9BD66E7, 0x81BE4A2F,
+	0xD27607F5, 0xBA752B3D, 0x02705E65, 0x6A7372AD, 0x7796C224, 0x1F95EEEC, 0xA7909BB4, 0xCF93B77C,
+	0x3D5B83BD, 0x5558AF75, 0xED5DDA2D, 0x855EF6E5, 0x98BB466C, 0xF0B86AA4, 0x48BD1FFC, 0x20BE3334,
+	0x73767EEE, 0x1B755226, 0xA370277E, 0xCB730BB6, 0xD696BB3F, 0xBE9597F7, 0x0690E2AF, 0x6E93CE67,
+	0xA100791B, 0xC90355D3, 0x7106208B, 0x19050C43, 0x04E0BCCA, 0x6CE39002, 0xD4E6E55A, 0xBCE5C992,
+	0xEF2D8448, 0x872EA880, 0x3F2BDDD8, 0x5728F110, 0x4ACD4199, 0x22CE6D51, 0x9ACB1809, 0xF2C834C1,
+	0x7AB7077A, 0x12B42BB2, 0xAAB15EEA, 0xC2B27222, 0xDF57C2AB, 0xB754EE63, 0x0F519B3B, 0x6752B7F3,
+	0x349AFA29, 0x5C99D6E1, 0xE49CA3B9, 0x8C9F8F71, 0x917A3FF8, 0xF9791330, 0x417C6668, 0x297F4AA0,
+	0xE6ECFDDC, 0x8EEFD114, 0x36EAA44C, 0x5EE98884, 0x430C380D, 0x2B0F14C5, 0x930A619D, 0xFB094D55,
+	0xA8C1008F, 0xC0C22C47, 0x78C7591F, 0x10C475D7, 0x0D21C55E, 0x6522E996, 0xDD279CCE, 0xB524B006,
+	0x47EC84C7, 0x2FEFA80F, 0x97EADD57, 0xFFE9F19F, 0xE20C4116, 0x8A0F6DDE, 0x320A1886, 0x5A09344E,
+	0x09C17994, 0x61C2555C, 0xD9C72004, 0xB1C40CCC, 0xAC21BC45, 0xC422908D, 0x7C27E5D5, 0x1424C91D,
+	0xDBB77E61, 0xB3B452A9, 0x0BB127F1, 0x63B20B39, 0x7E57BBB0, 0x16549778, 0xAE51E220, 0xC652CEE8,
+	0x959A8332, 0xFD99AFFA, 0x459CDAA2, 0x2D9FF66A, 0x307A46E3, 0x58796A2B, 0xE07C1F73, 0x887F33BB,
+	0xF56E0EF4, 0x9D6D223C, 0x25685764, 0x4D6B7BAC, 0x508ECB25, 0x388DE7ED, 0x808892B5, 0xE88BBE7D,
+	0xBB43F3A7, 0xD340DF6F, 0x6B45AA37, 0x034686FF, 0x1EA33676, 0x76A01ABE, 0xCEA56FE6, 0xA6A6432E,
+	0x6935F452, 0x0136D89A, 0xB933ADC2, 0xD130810A, 0xCCD53183, 0xA4D61D4B, 0x1CD36813, 0x74D044DB,
+	0x27180901, 0x4F1B25C9, 0xF71E5091, 0x9F1D7C59, 0x82F8CCD0, 0xEAFBE018, 0x52FE9540, 0x3AFDB988,
+	0xC8358D49, 0xA036A181, 0x1833D4D9, 0x7030F811, 0x6DD54898, 0x05D66450, 0xBDD31108, 0xD5D03DC0,
+	0x8618701A, 0xEE1B5CD2, 0x561E298A, 0x3E1D0542, 0x23F8B5CB, 0x4BFB9903, 0xF3FEEC5B, 0x9BFDC093,
+	0x546E77EF, 0x3C6D5B27, 0x84682E7F, 0xEC6B02B7, 0xF18EB23E, 0x998D9EF6, 0x2188EBAE, 0x498BC766,
+	0x1A438ABC, 0x7240A674, 0xCA45D32C, 0xA246FFE4, 0xBFA34F6D, 0xD7A063A5, 0x6FA516FD, 0x07A63A35,
+	0x8FD9098E, 0xE7DA2546, 0x5FDF501E, 0x37DC7CD6, 0x2A39CC5F, 0x423AE097, 0xFA3F95CF, 0x923CB907,
+	0xC1F4F4DD, 0xA9F7D815, 0x11F2AD4D, 0x79F18185, 0x6414310C, 0x0C171DC4, 0xB412689C, 0xDC114454,
+	0x1382F328, 0x7B81DFE0, 0xC384AAB8, 0xAB878670, 0xB66236F9, 0xDE611A31, 0x66646F69, 0x0E6743A1,
+	0x5DAF0E7B, 0x35AC22B3, 0x8DA957EB, 0xE5AA7B23, 0xF84FCBAA, 0x904CE762, 0x2849923A, 0x404ABEF2,
+	0xB2828A33, 0xDA81A6FB, 0x6284D3A3, 0x0A87FF6B, 0x17624FE2, 0x7F61632A, 0xC7641672, 0xAF673ABA,
+	0xFCAF7760, 0x94AC5BA8, 0x2CA92EF0, 0x44AA0238, 0x594FB2B1, 0x314C9E79, 0x8949EB21, 0xE14AC7E9,
+	0x2ED97095, 0x46DA5C5D, 0xFEDF2905, 0x96DC05CD, 0x8B39B544, 0xE33A998C, 0x5B3FECD4, 0x333CC01C,
+	0x60F48DC6, 0x08F7A10E, 0xB0F2D456, 0xD8F1F89E, 0xC5144817, 0xAD1764DF, 0x15121187, 0x7D113D4F
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o80
+ */
+
+
+
+/*
+ * The following CRC lookup table was generated automagically using the
+ * following model parameters:
+ *
+ * Generator Polynomial = ................. 0x1EDC6F41
+ * Generator Polynomial Length = .......... 32 bits
+ * Reflected Bits = ....................... TRUE
+ * Table Generation Offset = .............. 32 bits
+ * Number of Slices = ..................... 8 slices (Slice Lengths = 8 8 8 8 8 8 8 8)
+ * File Name = ............................ 8x256_tables.c (Directory Name = .\)
+ */
+
+/*
+ * 256-entry lookup table.  In this file it is indexed with the low byte of
+ * the running CRC (crc & 0xFF) by sctp_crc32c_sb8_64_bit() and
+ * sctp_crc32c_sb8_64_bit_zero().
+ */
+uint32_t sctp_crc_tableil8_o88[256] =
+{
+	0x00000000, 0x493C7D27, 0x9278FA4E, 0xDB448769, 0x211D826D, 0x6821FF4A, 0xB3657823, 0xFA590504,
+	0x423B04DA, 0x0B0779FD, 0xD043FE94, 0x997F83B3, 0x632686B7, 0x2A1AFB90, 0xF15E7CF9, 0xB86201DE,
+	0x847609B4, 0xCD4A7493, 0x160EF3FA, 0x5F328EDD, 0xA56B8BD9, 0xEC57F6FE, 0x37137197, 0x7E2F0CB0,
+	0xC64D0D6E, 0x8F717049, 0x5435F720, 0x1D098A07, 0xE7508F03, 0xAE6CF224, 0x7528754D, 0x3C14086A,
+	0x0D006599, 0x443C18BE, 0x9F789FD7, 0xD644E2F0, 0x2C1DE7F4, 0x65219AD3, 0xBE651DBA, 0xF759609D,
+	0x4F3B6143, 0x06071C64, 0xDD439B0D, 0x947FE62A, 0x6E26E32E, 0x271A9E09, 0xFC5E1960, 0xB5626447,
+	0x89766C2D, 0xC04A110A, 0x1B0E9663, 0x5232EB44, 0xA86BEE40, 0xE1579367, 0x3A13140E, 0x732F6929,
+	0xCB4D68F7, 0x827115D0, 0x593592B9, 0x1009EF9E, 0xEA50EA9A, 0xA36C97BD, 0x782810D4, 0x31146DF3,
+	0x1A00CB32, 0x533CB615, 0x8878317C, 0xC1444C5B, 0x3B1D495F, 0x72213478, 0xA965B311, 0xE059CE36,
+	0x583BCFE8, 0x1107B2CF, 0xCA4335A6, 0x837F4881, 0x79264D85, 0x301A30A2, 0xEB5EB7CB, 0xA262CAEC,
+	0x9E76C286, 0xD74ABFA1, 0x0C0E38C8, 0x453245EF, 0xBF6B40EB, 0xF6573DCC, 0x2D13BAA5, 0x642FC782,
+	0xDC4DC65C, 0x9571BB7B, 0x4E353C12, 0x07094135, 0xFD504431, 0xB46C3916, 0x6F28BE7F, 0x2614C358,
+	0x1700AEAB, 0x5E3CD38C, 0x857854E5, 0xCC4429C2, 0x361D2CC6, 0x7F2151E1, 0xA465D688, 0xED59ABAF,
+	0x553BAA71, 0x1C07D756, 0xC743503F, 0x8E7F2D18, 0x7426281C, 0x3D1A553B, 0xE65ED252, 0xAF62AF75,
+	0x9376A71F, 0xDA4ADA38, 0x010E5D51, 0x48322076, 0xB26B2572, 0xFB575855, 0x2013DF3C, 0x692FA21B,
+	0xD14DA3C5, 0x9871DEE2, 0x4335598B, 0x0A0924AC, 0xF05021A8, 0xB96C5C8F, 0x6228DBE6, 0x2B14A6C1,
+	0x34019664, 0x7D3DEB43, 0xA6796C2A, 0xEF45110D, 0x151C1409, 0x5C20692E, 0x8764EE47, 0xCE589360,
+	0x763A92BE, 0x3F06EF99, 0xE44268F0, 0xAD7E15D7, 0x572710D3, 0x1E1B6DF4, 0xC55FEA9D, 0x8C6397BA,
+	0xB0779FD0, 0xF94BE2F7, 0x220F659E, 0x6B3318B9, 0x916A1DBD, 0xD856609A, 0x0312E7F3, 0x4A2E9AD4,
+	0xF24C9B0A, 0xBB70E62D, 0x60346144, 0x29081C63, 0xD3511967, 0x9A6D6440, 0x4129E329, 0x08159E0E,
+	0x3901F3FD, 0x703D8EDA, 0xAB7909B3, 0xE2457494, 0x181C7190, 0x51200CB7, 0x8A648BDE, 0xC358F6F9,
+	0x7B3AF727, 0x32068A00, 0xE9420D69, 0xA07E704E, 0x5A27754A, 0x131B086D, 0xC85F8F04, 0x8163F223,
+	0xBD77FA49, 0xF44B876E, 0x2F0F0007, 0x66337D20, 0x9C6A7824, 0xD5560503, 0x0E12826A, 0x472EFF4D,
+	0xFF4CFE93, 0xB67083B4, 0x6D3404DD, 0x240879FA, 0xDE517CFE, 0x976D01D9, 0x4C2986B0, 0x0515FB97,
+	0x2E015D56, 0x673D2071, 0xBC79A718, 0xF545DA3F, 0x0F1CDF3B, 0x4620A21C, 0x9D642575, 0xD4585852,
+	0x6C3A598C, 0x250624AB, 0xFE42A3C2, 0xB77EDEE5, 0x4D27DBE1, 0x041BA6C6, 0xDF5F21AF, 0x96635C88,
+	0xAA7754E2, 0xE34B29C5, 0x380FAEAC, 0x7133D38B, 0x8B6AD68F, 0xC256ABA8, 0x19122CC1, 0x502E51E6,
+	0xE84C5038, 0xA1702D1F, 0x7A34AA76, 0x3308D751, 0xC951D255, 0x806DAF72, 0x5B29281B, 0x1215553C,
+	0x230138CF, 0x6A3D45E8, 0xB179C281, 0xF845BFA6, 0x021CBAA2, 0x4B20C785, 0x906440EC, 0xD9583DCB,
+	0x613A3C15, 0x28064132, 0xF342C65B, 0xBA7EBB7C, 0x4027BE78, 0x091BC35F, 0xD25F4436, 0x9B633911,
+	0xA777317B, 0xEE4B4C5C, 0x350FCB35, 0x7C33B612, 0x866AB316, 0xCF56CE31, 0x14124958, 0x5D2E347F,
+	0xE54C35A1, 0xAC704886, 0x7734CFEF, 0x3E08B2C8, 0xC451B7CC, 0x8D6DCAEB, 0x56294D82, 0x1F1530A5
+};
+
+/*
+ * end of the CRC lookup table crc_tableil8_o88
+ */
+
+/*
+ * Fold `length` bytes starting at p_buf into the running CRC `crc` and
+ * return the updated value.
+ *
+ * `offset` is the caller-supplied misalignment of p_buf (update_crc32()
+ * passes the low two address bits).  The input is processed in three phases:
+ *   1. up to 3 leading bytes, one at a time, until (4 - offset) & 3 bytes
+ *      have been consumed;
+ *   2. as many whole 8-byte groups as fit, using the eight per-byte tables;
+ *   3. the remaining (< 8) trailing bytes, one at a time.
+ */
+static uint32_t
+sctp_crc32c_sb8_64_bit(uint32_t crc,
+    unsigned char const *p_buf,
+    uint32_t length,
+    uint32_t offset)
+{
+	uint32_t li;
+	uint32_t term1, term2;
+	uint32_t running_length;
+	uint32_t end_bytes;
+	uint32_t init_bytes;
+
+	init_bytes = (4-offset) & 0x3;
+
+	/* Short buffers may end before the leading phase completes. */
+	if (init_bytes > length)
+		init_bytes = length;
+
+	running_length = ((length - init_bytes) / 8) * 8;
+	end_bytes = length - init_bytes - running_length;
+
+	for (li = 0; li < init_bytes; li++)
+		crc = sctp_crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^
+		    (crc >> 8);
+	for (li = 0; li < running_length / 8; li++) {
+#ifdef CEPH_BIG_ENDIAN
+		/*
+		 * Assemble the first four bytes least-significant first.
+		 * Each byte is widened to uint32_t before shifting: an
+		 * unsigned char promotes to (signed) int, and shifting a
+		 * value >= 0x80 left by 24 would overflow it.
+		 */
+		crc ^= (uint32_t) *p_buf++;
+		crc ^= ((uint32_t) *p_buf++) << 8;
+		crc ^= ((uint32_t) *p_buf++) << 16;
+		crc ^= ((uint32_t) *p_buf++) << 24;
+#else
+		crc ^= *(uint32_t *) p_buf;
+		p_buf += 4;
+#endif
+		/* First half of the group: one table per byte of crc. */
+		term1 = sctp_crc_tableil8_o88[crc & 0x000000FF] ^
+		    sctp_crc_tableil8_o80[(crc >> 8) & 0x000000FF];
+		term2 = crc >> 16;
+		crc = term1 ^
+		    sctp_crc_tableil8_o72[term2 & 0x000000FF] ^
+		    sctp_crc_tableil8_o64[(term2 >> 8) & 0x000000FF];
+
+		/* Second half of the group: one table per input byte. */
+#ifdef CEPH_BIG_ENDIAN
+		crc ^= sctp_crc_tableil8_o56[*p_buf++];
+		crc ^= sctp_crc_tableil8_o48[*p_buf++];
+		crc ^= sctp_crc_tableil8_o40[*p_buf++];
+		crc ^= sctp_crc_tableil8_o32[*p_buf++];
+#else
+		term1 = sctp_crc_tableil8_o56[(*(uint32_t *) p_buf) & 0x000000FF] ^
+		    sctp_crc_tableil8_o48[((*(uint32_t *) p_buf) >> 8) & 0x000000FF];
+
+		term2 = (*(uint32_t *) p_buf) >> 16;
+		crc = crc ^
+		    term1 ^
+		    sctp_crc_tableil8_o40[term2 & 0x000000FF] ^
+		    sctp_crc_tableil8_o32[(term2 >> 8) & 0x000000FF];
+		p_buf += 4;
+#endif
+	}
+	for (li = 0; li < end_bytes; li++)
+		crc = sctp_crc_tableil8_o32[(crc ^ *p_buf++) & 0x000000FF] ^
+		    (crc >> 8);
+	return crc;
+}
+
+/*
+ * Same computation as sctp_crc32c_sb8_64_bit(), but for an input of
+ * `length` bytes that are all zero, so no buffer is read.  `offset` selects
+ * how many leading bytes are handled one at a time, exactly as in the
+ * buffer-reading variant.
+ */
+static uint32_t
+sctp_crc32c_sb8_64_bit_zero(uint32_t crc,
+    uint32_t length,
+    uint32_t offset)
+{
+	uint32_t head_bytes;
+	uint32_t groups;
+	uint32_t tail_bytes;
+	uint32_t remaining;
+	uint32_t upper;
+
+	head_bytes = (4-offset) & 0x3;
+	if (head_bytes > length)
+		head_bytes = length;
+
+	groups = (length - head_bytes) / 8;
+	tail_bytes = length - head_bytes - groups * 8;
+
+	/* Leading bytes: a zero input byte leaves the table index as crc. */
+	for (remaining = head_bytes; remaining > 0; remaining--)
+		crc = (crc >> 8) ^ sctp_crc_tableil8_o32[crc & 0x000000FF];
+
+	/* Whole 8-byte groups of zeros. */
+	for (remaining = groups; remaining > 0; remaining--) {
+		upper = crc >> 16;
+		crc = sctp_crc_tableil8_o88[crc & 0x000000FF] ^
+		    sctp_crc_tableil8_o80[(crc >> 8) & 0x000000FF] ^
+		    sctp_crc_tableil8_o72[upper & 0x000000FF] ^
+		    sctp_crc_tableil8_o64[(upper >> 8) & 0x000000FF];
+
+		/* The last four input bytes are zero: entry 0 of each table. */
+		crc ^= sctp_crc_tableil8_o56[0] ^
+		    sctp_crc_tableil8_o48[0] ^
+		    sctp_crc_tableil8_o40[0] ^
+		    sctp_crc_tableil8_o32[0];
+	}
+
+	/* Trailing bytes. */
+	for (remaining = tail_bytes; remaining > 0; remaining--)
+		crc = (crc >> 8) ^ sctp_crc_tableil8_o32[crc & 0x000000FF];
+
+	return crc;
+}
+
+
+/**
+ *
+ * Routine Description:
+ *
+ * Updates a running CRC with `length` bytes of input.
+ *
+ * Arguments:
+ *
+ *		crc32c - running CRC value to continue from
+ *		buffer - input bytes; when NULL the input is treated as
+ *		         `length` zero bytes
+ *		length - number of input bytes
+ *
+ * Return value:
+ *
+ *		the updated CRC value (crc32c unchanged when length is 0)
+ */
+static uint32_t
+update_crc32(uint32_t crc32c,
+    unsigned char const *buffer,
+    unsigned int length)
+{
+	uint32_t misalign;
+
+	if (length == 0)
+		return crc32c;
+
+	/* Low two address bits, handed to the workers as `offset`. */
+	misalign = ((uintptr_t) buffer) & 0x3;
+
+	if (!buffer)
+		return sctp_crc32c_sb8_64_bit_zero(crc32c, length, misalign);
+	return sctp_crc32c_sb8_64_bit(crc32c, buffer, length, misalign);
+}
+
+/*
+ * 256-entry byte-indexed lookup table read by the SCTP_CRC32C() macro
+ * defined below.
+ */
+uint32_t sctp_crc_c[256] = {
+	0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4,
+	0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB,
+	0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B,
+	0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24,
+	0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B,
+	0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384,
+	0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54,
+	0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B,
+	0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A,
+	0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35,
+	0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5,
+	0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA,
+	0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45,
+	0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A,
+	0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A,
+	0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595,
+	0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48,
+	0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957,
+	0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687,
+	0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198,
+	0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927,
+	0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38,
+	0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8,
+	0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7,
+	0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096,
+	0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789,
+	0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859,
+	0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46,
+	0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9,
+	0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6,
+	0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36,
+	0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829,
+	0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C,
+	0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93,
+	0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043,
+	0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C,
+	0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3,
+	0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC,
+	0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C,
+	0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033,
+	0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652,
+	0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D,
+	0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D,
+	0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982,
+	0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D,
+	0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622,
+	0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2,
+	0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED,
+	0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530,
+	0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F,
+	0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF,
+	0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0,
+	0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F,
+	0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540,
+	0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90,
+	0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F,
+	0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE,
+	0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1,
+	0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321,
+	0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E,
+	0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81,
+	0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E,
+	0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E,
+	0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351,
+};
+
+
+/*
+ * One-byte CRC step: updates the lvalue `c` in place with data byte `d`
+ * using the sctp_crc_c table.  `c` is expanded unparenthesised and more than
+ * once, so pass a plain variable without side effects.
+ */
+#define SCTP_CRC32C(c,d) (c=(c>>8)^sctp_crc_c[(c^(d))&0xFF])
+
+#if 0
+/*
+ * Byte-at-a-time CRC update: applies SCTP_CRC32C() to each of the `length`
+ * bytes in `buffer` and returns the resulting value.
+ */
+static uint32_t
+old_update_crc32(uint32_t crc32c,
+    unsigned char const *buffer,
+    unsigned int length)
+{
+	unsigned int pos = 0;
+
+	while (pos < length) {
+		SCTP_CRC32C(crc32c, buffer[pos]);
+		pos++;
+	}
+	return (crc32c);
+}
+
+
+/*
+ * Finalize a CRC: complement it and, when BYTE_ORDER == BIG_ENDIAN,
+ * reverse the order of its four bytes.
+ */
+static uint32_t
+sctp_csum_finalize(uint32_t crc32c)
+{
+	/* Complement the result */
+	uint32_t result = ~crc32c;
+
+#if BYTE_ORDER == BIG_ENDIAN
+	/*
+	 * For BIG-ENDIAN.. aka Motorola byte order the result is in
+	 * little-endian form. So we must manually swap the bytes. Then we
+	 * can call htonl() which does nothing...
+	 */
+	uint8_t b0 = result & 0x000000ff;
+	uint8_t b1 = (result >> 8) & 0x000000ff;
+	uint8_t b2 = (result >> 16) & 0x000000ff;
+	uint8_t b3 = (result >> 24) & 0x000000ff;
+
+	crc32c = ((b0 << 24) | (b1 << 16) | (b2 << 8) | b3);
+	return (crc32c);
+#else
+	/*
+	 * For INTEL platforms the result comes out in network order. No
+	 * htonl is required or the swap above. So we optimize out both the
+	 * htonl and the manual swap above.
+	 */
+	crc32c = result;
+	return (crc32c);
+#endif
+}
+#endif
+
+/*
+ * Exported entry point: continue the CRC `crc` over `length` bytes of
+ * `data` by delegating to update_crc32() and return the new value.
+ */
+uint32_t ceph_crc32c_sctp(uint32_t crc, unsigned char const *data, unsigned length)
+{
+	uint32_t updated = update_crc32(crc, data, length);
+
+	return updated;
+}
+
+
+#endif
diff --git a/src/common/sctp_crc32.h b/src/common/sctp_crc32.h
new file mode 100644
index 00000000..92d20bcb
--- /dev/null
+++ b/src/common/sctp_crc32.h
@@ -0,0 +1,14 @@
+#ifndef CEPH_COMMON_SCTP_CRC32_H
+#define CEPH_COMMON_SCTP_CRC32_H
+
+/*
+ * NOTE(review): uint32_t is used below but this header includes nothing
+ * itself; includers presumably pull in <stdint.h> first -- confirm.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Continue the CRC value `crc` over `length` bytes at `data` and return the
+ * updated value.  Exported with C linkage so C and C++ callers can share it.
+ */
+extern uint32_t ceph_crc32c_sctp(uint32_t crc, unsigned char const *data, unsigned length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/secret.c b/src/common/secret.c
new file mode 100644
index 00000000..2f491510
--- /dev/null
+++ b/src/common/secret.c
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <keyutils.h>
+
+#include "common/armor.h"
+#include "common/safe_io.h"
+
+/*
+ * Read a secret from `filename` into the caller's buffer.
+ *
+ * At most max_len - 1 bytes are read, so that the terminating NUL always
+ * fits inside the max_len-byte buffer `secret`.  The stored string ends at
+ * the first NUL, '\n' or '\r' found in the data, or at the end of the data
+ * read.
+ *
+ * Returns 0 on success, -errno if the file cannot be opened, the negative
+ * value reported by safe_read() if reading fails, or -ENODATA if nothing
+ * was read (including the case where max_len leaves no room for data).
+ */
+int read_secret_from_file(const char *filename, char *secret, size_t max_len)
+{
+  char *end;
+  int fd;
+  int len;
+  int saved_errno;
+  /* Reserve one byte for the terminator written below. */
+  size_t room = max_len ? max_len - 1 : 0;
+
+  fd = open(filename, O_RDONLY);
+  if (fd < 0) {
+    /* perror() may itself modify errno, so capture it first. */
+    saved_errno = errno;
+    perror("unable to read secretfile");
+    return -saved_errno;
+  }
+  len = safe_read(fd, secret, room);
+  if (len <= 0) {
+    perror("unable to read secret from file");
+    close(fd);
+    return len ? len : -ENODATA;
+  }
+  /* len <= room < max_len, so `end` never moves past the buffer. */
+  end = secret;
+  while (end < secret + len && *end && *end != '\n' && *end != '\r')
+    end++;
+  *end = '\0';
+  close(fd);
+
+  return 0;
+}
+
+/*
+ * Decode the base64 string `secret` and submit the result to the kernel as
+ * a key of type "ceph" named `key_name` in the process keyring.
+ *
+ * Returns -EINVAL for an empty secret, the negative value from
+ * ceph_unarmor() if the secret does not decode, -errno if add_key() fails,
+ * and otherwise the (non-negative) value returned by ceph_unarmor().
+ */
+int set_kernel_secret(const char *secret, const char *key_name)
+{
+  /* try to submit key to kernel via the keys api */
+  key_serial_t serial;
+  int ret;
+  int secret_len = strlen(secret);
+  char payload[((secret_len * 3) / 4) + 4];
+
+  if (!secret_len) {
+    fprintf(stderr, "secret is empty.\n");
+    return -EINVAL;
+  }
+
+  /*
+   * The whole buffer is handed to add_key() below, but decoding fills only
+   * its leading part.  Zero it first so no indeterminate stack bytes are
+   * passed along.
+   */
+  memset(payload, 0, sizeof(payload));
+
+  ret = ceph_unarmor(payload, payload+sizeof(payload), secret, secret+secret_len);
+  if (ret < 0) {
+    char error_buf[80];
+    /*
+     * NOTE(review): the strerror_r() result is printed with %s, which only
+     * matches the GNU variant returning char * -- confirm build flags.
+     */
+    fprintf(stderr, "secret is not valid base64: %s.\n",
+	    strerror_r(-ret, error_buf, sizeof(error_buf)));
+    return ret;
+  }
+
+  serial = add_key("ceph", key_name, payload, sizeof(payload), KEY_SPEC_PROCESS_KEYRING);
+  if (serial == -1) {
+    ret = -errno;
+  }
+
+  return ret;
+}
+
+/*
+ * Ask the kernel for a key of type "ceph" named `key_name`, using the user
+ * keyring as destination.  Returns non-zero if request_key() succeeded and
+ * 0 if it returned -1.
+ */
+int is_kernel_secret(const char *key_name)
+{
+  key_serial_t found = request_key("ceph", key_name, NULL, KEY_SPEC_USER_KEYRING);
+
+  if (found == -1)
+    return 0;
+  return 1;
+}
diff --git a/src/common/secret.h b/src/common/secret.h
new file mode 100644
index 00000000..5d2ad179
--- /dev/null
+++ b/src/common/secret.h
@@ -0,0 +1,18 @@
+#ifndef CEPH_SECRET_H
+#define CEPH_SECRET_H
+
+/*
+ * NOTE(review): size_t is used below but this header includes nothing
+ * itself; includers presumably provide <stddef.h> or similar -- confirm.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Read a secret from `filename` into `secret` (a buffer of max_len bytes)
+ * and NUL-terminate it.  Returns 0 on success, a negative value on failure.
+ */
+int read_secret_from_file(const char *filename, char *secret, size_t max_len);
+
+/*
+ * Base64-decode `secret` and add it to the kernel keyring as a "ceph" key
+ * named `key_name`.  Returns a negative value on failure.
+ */
+int set_kernel_secret(const char *secret, const char *key_name);
+
+/* Returns non-zero if a "ceph" key named `key_name` can be requested. */
+int is_kernel_secret(const char *key_name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/shared_cache.hpp b/src/common/shared_cache.hpp
new file mode 100644
index 00000000..b8623d9d
--- /dev/null
+++ b/src/common/shared_cache.hpp
@@ -0,0 +1,369 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_SHAREDCACHE_H
+#define CEPH_SHAREDCACHE_H
+
+#include <map>
+#include <list>
+#ifdef WITH_SEASTAR
+#include <boost/smart_ptr/local_shared_ptr.hpp>
+#else
+#include <memory>
+#endif
+#include "common/ceph_mutex.h"
+#include "common/dout.h"
+#include "include/unordered_map.h"
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+/**
+ * Cache of shared pointers to V, keyed by K.
+ *
+ * Two structures are kept under `lock`:
+ *  - `lru` / `contents`: up to `max_size` strong references, most recently
+ *    used first;
+ *  - `weak_refs`: a weak reference (plus the raw pointer) for every value
+ *    handed out that has not been destroyed yet.
+ *
+ * Values are created with the Cleanup deleter, which takes `lock` to drop
+ * the weak_refs entry.  Strong references must therefore never be released
+ * while `lock` is held; the methods below collect them in locals that are
+ * destroyed only after the lock guard.
+ */
+template <class K, class V>
+class SharedLRU {
+  CephContext *cct;
+#ifdef WITH_SEASTAR
+  using VPtr = boost::local_shared_ptr<V>;
+  using WeakVPtr = boost::weak_ptr<V>;
+#else
+  using VPtr = std::shared_ptr<V>;
+  using WeakVPtr = std::weak_ptr<V>;
+#endif
+  ceph::mutex lock;
+  size_t max_size;
+  ceph::condition_variable cond;
+  unsigned size;
+public:
+  // Number of callers currently inside lookup() / lower_bound().
+  int waiting;
+private:
+  using C = std::less<K>;
+  using H = std::hash<K>;
+  ceph::unordered_map<K, typename std::list<std::pair<K, VPtr> >::iterator, H> contents;
+  std::list<std::pair<K, VPtr> > lru;
+
+  std::map<K, std::pair<WeakVPtr, V*>, C> weak_refs;
+
+  // Evict from the back of the LRU until size <= max_size.  Evicted strong
+  // references are moved to *to_release so the caller can drop them after
+  // releasing the lock.
+  void trim_cache(std::list<VPtr> *to_release) {
+    while (size > max_size) {
+      to_release->push_back(lru.back().second);
+      lru_remove(lru.back().first);
+    }
+  }
+
+  // Drop the strong reference for `key` from the LRU, if present.
+  void lru_remove(const K& key) {
+    auto i = contents.find(key);
+    if (i == contents.end())
+      return;
+    lru.erase(i->second);
+    --size;
+    contents.erase(i);
+  }
+
+  // Move `key` to the front of the LRU, inserting it (and trimming) if it
+  // was not there.
+  void lru_add(const K& key, const VPtr& val, std::list<VPtr> *to_release) {
+    auto i = contents.find(key);
+    if (i != contents.end()) {
+      lru.splice(lru.begin(), lru, i->second);
+    } else {
+      ++size;
+      lru.push_front(std::make_pair(key, val));
+      contents[key] = lru.begin();
+      trim_cache(to_release);
+    }
+  }
+
+  // Called by Cleanup when a value is destroyed: forget its weak reference
+  // (only if it still refers to `valptr`) and wake up waiters.
+  void remove(const K& key, V *valptr) {
+    std::lock_guard l{lock};
+    auto i = weak_refs.find(key);
+    if (i != weak_refs.end() && i->second.second == valptr) {
+      weak_refs.erase(i);
+    }
+    cond.notify_all();
+  }
+
+  // Deleter installed on every VPtr handed out by this cache.
+  class Cleanup {
+  public:
+    SharedLRU<K, V> *cache;
+    K key;
+    Cleanup(SharedLRU<K, V> *cache, K key) : cache(cache), key(key) {}
+    void operator()(V *ptr) {
+      cache->remove(key, ptr);
+      delete ptr;
+    }
+  };
+
+public:
+  SharedLRU(CephContext *cct = NULL, size_t max_size = 20)
+    : cct(cct),
+      lock{ceph::make_mutex("SharedLRU::lock")},
+      max_size(max_size),
+      size(0), waiting(0) {
+    contents.rehash(max_size);
+  }
+
+  // Drops the cached strong references; logs (and optionally asserts on)
+  // any values that are still referenced elsewhere.
+  ~SharedLRU() {
+    contents.clear();
+    lru.clear();
+    if (!weak_refs.empty()) {
+      lderr(cct) << "leaked refs:\n";
+      dump_weak_refs(*_dout);
+      *_dout << dendl;
+      if (cct->_conf.get_val<bool>("debug_asserts_on_shutdown")) {
+        ceph_assert(weak_refs.empty());
+      }
+    }
+  }
+
+  // Number of strong references currently held by the LRU.
+  int get_count() {
+    std::lock_guard locker{lock};
+    return size;
+  }
+
+  void set_cct(CephContext *c) {
+    cct = c;
+  }
+
+  // Log every outstanding weak reference at error level.
+  void dump_weak_refs() {
+    lderr(cct) << "leaked refs:\n";
+    dump_weak_refs(*_dout);
+    *_dout << dendl;
+  }
+
+  // Write one line per outstanding weak reference to `out`.
+  void dump_weak_refs(std::ostream& out) {
+    for (const auto& [key, ref] : weak_refs) {
+      out << __func__ << " " << this << " weak_refs: "
+          << key << " = " << ref.second
+          << " with " << ref.first.use_count() << " refs"
+          << std::endl;
+    }
+  }
+
+  //clear all strong reference from the lru.
+  void clear() {
+    while (true) {
+      VPtr val; // release any ref we have after we drop the lock
+      std::lock_guard locker{lock};
+      if (size == 0)
+        break;
+
+      val = lru.back().second;
+      lru_remove(lru.back().first);
+    }
+  }
+
+  // Drop the LRU's strong reference for `key`; the weak reference stays.
+  void clear(const K& key) {
+    VPtr val; // release any ref we have after we drop the lock
+    {
+      std::lock_guard l{lock};
+      auto i = weak_refs.find(key);
+      if (i != weak_refs.end()) {
+        val = i->second.first.lock();
+      }
+      lru_remove(key);
+    }
+  }
+
+  // Drop both the strong and the weak reference for `key`.
+  void purge(const K &key) {
+    VPtr val; // release any ref we have after we drop the lock
+    {
+      std::lock_guard l{lock};
+      auto i = weak_refs.find(key);
+      if (i != weak_refs.end()) {
+        val = i->second.first.lock();
+        weak_refs.erase(i);
+      }
+      lru_remove(key);
+    }
+  }
+
+  // Change the LRU capacity, evicting entries if it shrank.
+  void set_size(size_t new_size) {
+    std::list<VPtr> to_release;
+    {
+      std::lock_guard l{lock};
+      max_size = new_size;
+      trim_cache(&to_release);
+    }
+  }
+
+  // Returns K key s.t. key <= k for all currently cached k,v
+  // NOTE(review): dereferences weak_refs.begin() unconditionally, so the
+  // caller presumably guarantees the cache is not empty() -- confirm.
+  K cached_key_lower_bound() {
+    std::lock_guard l{lock};
+    return weak_refs.begin()->first;
+  }
+
+  // Returns the value at the first key >= `key`, or at the largest key if
+  // there is none; null if nothing is tracked.  Blocks while the selected
+  // entry is expired but not yet removed.
+  VPtr lower_bound(const K& key) {
+    VPtr val;
+    std::list<VPtr> to_release;
+    {
+      std::unique_lock l{lock};
+      ++waiting;
+      cond.wait(l, [this, &key, &val, &to_release] {
+        if (weak_refs.empty()) {
+          return true;
+        }
+        auto i = weak_refs.lower_bound(key);
+        if (i == weak_refs.end()) {
+          --i;
+        }
+        if (val = i->second.first.lock(); val) {
+          lru_add(i->first, val, &to_release);
+          return true;
+        } else {
+          return false;
+        }
+      });
+      --waiting;
+    }
+    return val;
+  }
+
+  // Finds the first live entry with key > `key`.  Returns false if there is
+  // none; otherwise returns true and, if `next` is non-null, stores the
+  // entry there.
+  bool get_next(const K &key, std::pair<K, VPtr> *next) {
+    std::pair<K, VPtr> r;
+    // Declared outside the locked scope: when `next` is null this may be
+    // the last strong reference, and releasing it runs Cleanup, which takes
+    // `lock`.
+    VPtr next_val;
+    {
+      std::lock_guard l{lock};
+      auto i = weak_refs.upper_bound(key);
+
+      while (i != weak_refs.end() &&
+             !(next_val = i->second.first.lock()))
+        ++i;
+
+      if (i == weak_refs.end())
+        return false;
+
+      if (next)
+        r = std::make_pair(i->first, next_val);
+    }
+    if (next)
+      *next = r;
+    return true;
+  }
+
+  // As above, but copies the value itself into *next.
+  bool get_next(const K &key, std::pair<K, V> *next) {
+    std::pair<K, VPtr> r;
+    bool found = get_next(key, &r);
+    if (!found || !next)
+      return found;
+    next->first = r.first;
+    ceph_assert(r.second);
+    next->second = *(r.second);
+    return found;
+  }
+
+  // Returns the value for `key` (bumping it in the LRU), or null if the key
+  // is not tracked.  Blocks while the entry is expired but not yet removed.
+  VPtr lookup(const K& key) {
+    VPtr val;
+    std::list<VPtr> to_release;
+    {
+      std::unique_lock l{lock};
+      ++waiting;
+      cond.wait(l, [this, &key, &val, &to_release] {
+        if (auto i = weak_refs.find(key); i != weak_refs.end()) {
+          if (val = i->second.first.lock(); val) {
+            lru_add(key, val, &to_release);
+            return true;
+          } else {
+            return false;
+          }
+        } else {
+          return true;
+        }
+      });
+      --waiting;
+    }
+    return val;
+  }
+
+  // Returns the value for `key`, default-constructing and registering a new
+  // V if the key is not tracked.
+  VPtr lookup_or_create(const K &key) {
+    VPtr val;
+    std::list<VPtr> to_release;
+    {
+      std::unique_lock l{lock};
+      cond.wait(l, [this, &key, &val] {
+        if (auto i = weak_refs.find(key); i != weak_refs.end()) {
+          if (val = i->second.first.lock(); val) {
+            return true;
+          } else {
+            return false;
+          }
+        } else {
+          return true;
+        }
+      });
+      if (!val) {
+        val = VPtr{new V{}, Cleanup{this, key}};
+        weak_refs.insert(std::make_pair(key, std::make_pair(val, val.get())));
+      }
+      lru_add(key, val, &to_release);
+    }
+    return val;
+  }
+
+  /**
+   * empty()
+   *
+   * Returns true iff there are no live references left to anything that has been
+   * in the cache.
+   */
+  bool empty() {
+    std::lock_guard l{lock};
+    return weak_refs.empty();
+  }
+
+  /***
+   * Inserts a key if not present, or bumps it to the front of the LRU if
+   * it is, and then gives you a reference to the value. If the key already
+   * existed, you are responsible for deleting the new value you tried to
+   * insert.
+   *
+   * @param key The key to insert
+   * @param value The value that goes with the key
+   * @param existed Set to true if the value was already in the
+   * map, false otherwise
+   * @return A reference to the map's value for the given key
+   */
+  VPtr add(const K& key, V *value, bool *existed = NULL) {
+    VPtr val;
+    std::list<VPtr> to_release;
+    {
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator actual;
+      std::unique_lock l{lock};
+      cond.wait(l, [this, &key, &actual, &val] {
+          actual = weak_refs.lower_bound(key);
+          if (actual != weak_refs.end() && actual->first == key) {
+            val = actual->second.first.lock();
+            if (val) {
+              return true;
+            } else {
+              return false;
+            }
+          } else {
+            return true;
+          }
+      });
+
+      if (val) {
+        if (existed) {
+          *existed = true;
+        }
+      } else {
+        if (existed) {
+          *existed = false;
+        }
+        val = VPtr(value, Cleanup(this, key));
+        // `actual` is the lower_bound position computed above; used as a hint.
+        weak_refs.insert(actual, std::make_pair(key, std::make_pair(val, value)));
+      }
+      lru_add(key, val, &to_release);
+    }
+    return val;
+  }
+
+  friend class SharedLRUTest;
+};
+
+#endif
diff --git a/src/common/shared_mutex_debug.cc b/src/common/shared_mutex_debug.cc
new file mode 100644
index 00000000..c0a031a4
--- /dev/null
+++ b/src/common/shared_mutex_debug.cc
@@ -0,0 +1,165 @@
+#include "shared_mutex_debug.h"
+
+#include <system_error>
+
+#include "acconfig.h"
+#include "common/valgrind.h"
+
+namespace ceph {
+
+// Records the tracking / lockdep settings and initialises the pthread
+// rwlock, with writer preference if requested and supported at build time.
+shared_mutex_debug::shared_mutex_debug(const std::string& n,
+                                       bool track_lock,
+                                       bool enable_lock_dep,
+                                       bool prioritize_write)
+  : mutex_debugging_base{n, false /* backtrace */},
+    track(track_lock),
+    lockdep(enable_lock_dep)
+{
+  bool initialised = false;
+#ifdef HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP
+  if (prioritize_write) {
+    pthread_rwlockattr_t attr;
+    pthread_rwlockattr_init(&attr);
+    // PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP
+    //   Setting the lock kind to this avoids writer starvation as long as
+    //   any read locking is not done in a recursive fashion.
+    pthread_rwlockattr_setkind_np(&attr,
+                                  PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
+    pthread_rwlock_init(&rwlock, &attr);
+    pthread_rwlockattr_destroy(&attr);
+    initialised = true;
+  }
+#endif
+  // Default attributes when writer preference was not requested or is not
+  // available.
+  if (!initialised) {
+    pthread_rwlock_init(&rwlock, nullptr);
+  }
+  ANNOTATE_BENIGN_RACE_SIZED(&id, sizeof(id), "shared_mutex_debug lockdep id");
+  ANNOTATE_BENIGN_RACE_SIZED(&nlock, sizeof(nlock), "shared_mutex_debug nwlock");
+  ANNOTATE_BENIGN_RACE_SIZED(&nrlock, sizeof(nrlock), "shared_mutex_debug nrlock");
+}
+
+// exclusive
+// Blocks until the write lock is held; throws std::system_error if
+// pthread_rwlock_wrlock() fails.
+void shared_mutex_debug::lock()
+{
+  if (g_lockdep && lockdep) {
+    _will_lock();
+  }
+  const int r = pthread_rwlock_wrlock(&rwlock);
+  if (r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+  if (lockdep && g_lockdep) {
+    _locked();
+  }
+  _post_lock();
+}
+
+// Attempts to take the write lock without blocking.  Returns true on
+// success, false if the lock is busy; throws std::system_error on any
+// other error.
+bool shared_mutex_debug::try_lock()
+{
+  const int r = pthread_rwlock_trywrlock(&rwlock);
+  if (r == EBUSY) {
+    return false;
+  }
+  if (r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+  if (lockdep && g_lockdep) {
+    _locked();
+  }
+  _post_lock();
+  return true;
+}
+
+// Releases the write lock after updating the tracking and lockdep state;
+// throws std::system_error if pthread_rwlock_unlock() fails.
+void shared_mutex_debug::unlock()
+{
+  _pre_unlock();
+  if (lockdep && g_lockdep) {
+    _will_unlock();
+  }
+  const int r = pthread_rwlock_unlock(&rwlock);
+  if (r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+}
+
+// shared locking
+// Blocks until a read lock is held; throws std::system_error if
+// pthread_rwlock_rdlock() fails.
+void shared_mutex_debug::lock_shared()
+{
+  if (lockdep && g_lockdep) {
+    _will_lock();
+  }
+  const int r = pthread_rwlock_rdlock(&rwlock);
+  if (r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+  if (lockdep && g_lockdep) {
+    _locked();
+  }
+  _post_lock_shared();
+}
+
+// Attempts to take a read lock without blocking.  Returns true on success,
+// false if the lock cannot be acquired immediately (EBUSY); throws
+// std::system_error on any other error.
+bool shared_mutex_debug::try_lock_shared()
+{
+  // NOTE(review): try_lock() makes no lockdep call before the attempt, and
+  // _will_unlock() looks out of place ahead of an acquisition -- left as is,
+  // confirm the intent.
+  if (lockdep && g_lockdep) {
+    _will_unlock();
+  }
+  // Must be the non-blocking variant: pthread_rwlock_rdlock() would wait for
+  // a writer instead of reporting EBUSY.
+  switch (int r = pthread_rwlock_tryrdlock(&rwlock); r) {
+  case 0:
+    if (lockdep && g_lockdep) {
+      _locked();
+    }
+    _post_lock_shared();
+    return true;
+  case EBUSY:
+    return false;
+  default:
+    throw std::system_error(r, std::generic_category());
+  }
+}
+
+// Releases a read lock after updating the tracking and lockdep state;
+// throws std::system_error if pthread_rwlock_unlock() fails.
+void shared_mutex_debug::unlock_shared()
+{
+  _pre_unlock_shared();
+  if (lockdep && g_lockdep) {
+    _will_unlock();
+  }
+  const int r = pthread_rwlock_unlock(&rwlock);
+  if (r != 0) {
+    throw std::system_error(r, std::generic_category());
+  }
+}
+
+// exclusive locking
+// With tracking on: checks that this thread holds the single write lock
+// and clears the recorded owner.
+void shared_mutex_debug::_pre_unlock()
+{
+  if (!track) {
+    return;
+  }
+  ceph_assert(nlock > 0);
+  --nlock;
+  ceph_assert(locked_by == std::this_thread::get_id());
+  ceph_assert(nlock == 0);
+  locked_by = std::thread::id();
+}
+
+// With tracking on: checks the write lock was not already counted, then
+// records the calling thread as owner.
+void shared_mutex_debug::_post_lock()
+{
+  if (!track) {
+    return;
+  }
+  ceph_assert(nlock == 0);
+  locked_by = std::this_thread::get_id();
+  ++nlock;
+}
+
+// shared locking
+// With tracking on: checks at least one read lock is counted and drops one.
+void shared_mutex_debug::_pre_unlock_shared()
+{
+  if (!track) {
+    return;
+  }
+  ceph_assert(nrlock > 0);
+  nrlock--;
+}
+
+// With tracking on: counts one more read lock.
+void shared_mutex_debug::_post_lock_shared()
+{
+  if (!track) {
+    return;
+  }
+  ++nrlock;
+}
+
+} // namespace ceph
diff --git a/src/common/shared_mutex_debug.h b/src/common/shared_mutex_debug.h
new file mode 100644
index 00000000..57d95b0c
--- /dev/null
+++ b/src/common/shared_mutex_debug.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <pthread.h>
+#include <atomic>
+
+#include "common/mutex_debug.h"
+
+namespace ceph {
+
+// Reader/writer mutex built on pthread_rwlock_t, with optional lock-count
+// tracking and lockdep hooks.
+class shared_mutex_debug :
+    public ceph::mutex_debug_detail::mutex_debugging_base
+{
+  pthread_rwlock_t rwlock;
+  const bool track;
+  const bool lockdep;
+  // number of read locks currently counted (maintained only when `track`)
+  std::atomic<unsigned> nrlock{0};
+
+public:
+  // Mutex concept is DefaultConstructible
+  shared_mutex_debug()
+    : shared_mutex_debug{std::string{}}
+  {}
+  shared_mutex_debug(const std::string& n,
+		     bool track_lock=true,
+		     bool enable_lock_dep=true,
+		     bool prioritize_write=false);
+
+  // exclusive locking
+  void lock();
+  bool try_lock();
+  void unlock();
+  bool is_wlocked() const {
+    return nlock > 0;
+  }
+
+  // shared locking
+  void lock_shared();
+  bool try_lock_shared();
+  void unlock_shared();
+  bool is_rlocked() const {
+    return nrlock > 0;
+  }
+
+  // either of them
+  bool is_locked() const {
+    return is_wlocked() || is_rlocked();
+  }
+
+private:
+  // exclusive locking
+  void _pre_unlock();
+  void _post_lock();
+  // shared locking
+  void _pre_unlock_shared();
+  void _post_lock_shared();
+};
+
+} // namespace ceph
diff --git a/src/common/sharedptr_registry.hpp b/src/common/sharedptr_registry.hpp
new file mode 100644
index 00000000..3b3cf01b
--- /dev/null
+++ b/src/common/sharedptr_registry.hpp
@@ -0,0 +1,189 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SHAREDPTR_REGISTRY_H
+#define CEPH_SHAREDPTR_REGISTRY_H
+
+#include <map>
+#include <memory>
+#include "common/ceph_mutex.h"
+
+/**
+ * Provides a registry of shared_ptr<V> indexed by K while
+ * the references are alive.
+ *
+ * Every value handed out carries the OnRemoval deleter, which takes `lock`
+ * to erase the entry.  A strong reference must therefore never be released
+ * while `lock` is held.
+ */
+template <class K, class V, class C = std::less<K> >
+class SharedPtrRegistry {
+public:
+  typedef std::shared_ptr<V> VPtr;
+  typedef std::weak_ptr<V> WeakVPtr;
+  // Number of callers currently inside lookup() / lookup_or_create().
+  int waiting;
+private:
+  ceph::mutex lock = ceph::make_mutex("SharedPtrRegistry::lock");
+  ceph::condition_variable cond;
+  std::map<K, std::pair<WeakVPtr, V*>, C> contents;
+
+  // Deleter: unregisters the entry (if it still refers to this object),
+  // wakes waiters, then deletes the object outside the lock.
+  class OnRemoval {
+    SharedPtrRegistry<K,V,C> *parent;
+    K key;
+  public:
+    OnRemoval(SharedPtrRegistry<K,V,C> *parent, K key) :
+      parent(parent), key(key) {}
+    void operator()(V *to_remove) {
+      {
+	std::lock_guard l(parent->lock);
+	typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	  parent->contents.find(key);
+	if (i != parent->contents.end() &&
+	    i->second.second == to_remove) {
+	  parent->contents.erase(i);
+	  parent->cond.notify_all();
+	}
+      }
+      delete to_remove;
+    }
+  };
+  friend class OnRemoval;
+
+public:
+  SharedPtrRegistry() :
+    waiting(0)
+  {}
+
+  // True if no entry is registered.
+  bool empty() {
+    std::lock_guard l(lock);
+    return contents.empty();
+  }
+
+  // Finds the first live entry with key > `key`.  Returns false if there is
+  // none; otherwise returns true and, if `next` is non-null, stores the
+  // entry there.
+  bool get_next(const K &key, std::pair<K, VPtr> *next) {
+    std::pair<K, VPtr> r;
+    // Declared outside the locked scope: when `next` is null this may be
+    // the last strong reference, and releasing it runs OnRemoval, which
+    // takes `lock`.
+    VPtr next_val;
+    {
+      std::lock_guard l(lock);
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	contents.upper_bound(key);
+      while (i != contents.end() &&
+	     !(next_val = i->second.first.lock()))
+	++i;
+      if (i == contents.end())
+	return false;
+      if (next)
+	r = std::make_pair(i->first, next_val);
+    }
+    if (next)
+      *next = r;
+    return true;
+  }
+
+  // As above, but copies the value itself into *next.
+  bool get_next(const K &key, std::pair<K, V> *next) {
+    VPtr next_val;  // declared before the guard so it is released after it
+    std::lock_guard l(lock);
+    typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+      contents.upper_bound(key);
+    while (i != contents.end() &&
+	   !(next_val = i->second.first.lock()))
+      ++i;
+    if (i == contents.end())
+      return false;
+    if (next)
+      *next = std::make_pair(i->first, *next_val);
+    return true;
+  }
+
+  // Returns the value for `key`, or null if the key is not registered.
+  // Blocks while the entry is expired but not yet removed.
+  VPtr lookup(const K &key) {
+    std::unique_lock l(lock);
+    waiting++;
+    while (1) {
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	contents.find(key);
+      if (i != contents.end()) {
+	VPtr retval = i->second.first.lock();
+	if (retval) {
+	  waiting--;
+	  return retval;
+	}
+      } else {
+	break;
+      }
+      cond.wait(l);
+    }
+    waiting--;
+    return VPtr();
+  }
+
+  // Returns the value for `key`, default-constructing and registering a new
+  // V if the key is not registered.
+  VPtr lookup_or_create(const K &key) {
+    std::unique_lock l(lock);
+    waiting++;
+    while (1) {
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	contents.find(key);
+      if (i != contents.end()) {
+	VPtr retval = i->second.first.lock();
+	if (retval) {
+	  waiting--;
+	  return retval;
+	}
+      } else {
+	break;
+      }
+      cond.wait(l);
+    }
+    V *ptr = new V();
+    VPtr retval(ptr, OnRemoval(this, key));
+    contents.insert(std::make_pair(key, std::make_pair(retval, ptr)));
+    waiting--;
+    return retval;
+  }
+
+  // Number of registered entries.
+  unsigned size() {
+    std::lock_guard l(lock);
+    return contents.size();
+  }
+
+  // Unregisters `key` (outstanding references stay valid) and wakes waiters.
+  void remove(const K &key) {
+    std::lock_guard l(lock);
+    contents.erase(key);
+    cond.notify_all();
+  }
+
+  // As lookup_or_create(key), but a new V is constructed from `arg`.
+  template<class A>
+  VPtr lookup_or_create(const K &key, const A &arg) {
+    std::unique_lock l(lock);
+    waiting++;
+    while (1) {
+      typename std::map<K, std::pair<WeakVPtr, V*>, C>::iterator i =
+	contents.find(key);
+      if (i != contents.end()) {
+	VPtr retval = i->second.first.lock();
+	if (retval) {
+	  waiting--;
+	  return retval;
+	}
+      } else {
+	break;
+      }
+      cond.wait(l);
+    }
+    V *ptr = new V(arg);
+    VPtr retval(ptr, OnRemoval(this, key));
+    contents.insert(std::make_pair(key, std::make_pair(retval, ptr)));
+    waiting--;
+    return retval;
+  }
+
+  friend class SharedPtrRegistryTest;
+};
+
+#endif
diff --git a/src/common/shunique_lock.h b/src/common/shunique_lock.h
new file mode 100644
index 00000000..2a8da953
--- /dev/null
+++ b/src/common/shunique_lock.h
@@ -0,0 +1,394 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_COMMON_SHUNIQUE_LOCK_H
+#define CEPH_COMMON_SHUNIQUE_LOCK_H
+
+#include <mutex>
+#include <shared_mutex>
+#include <system_error>
+
+namespace ceph {
+// This is a 'lock' class in the style of shared_lock and
+// unique_lock. Like shared_mutex it implements both Lockable and
+// SharedLockable.
+
+// My rationale is thus: one of the advantages of unique_lock is that
+// I can pass a thread of execution's control of a lock around as a
+// parameter. So that methods further down the call stack can unlock
+// it, do something, relock it, and have the lock state be known by
+// the caller afterward, explicitly. The shared_lock class offers a
+// similar advantage for shared ownership, but each class is one or the
+// other. In Objecter we have calls that in most cases need /a/ lock
+// on the shared mutex, and whether it's shared or exclusive doesn't
+// matter. In some circumstances they may drop the shared lock and
+// reacquire an exclusive one. This could be handled by passing both a
+// shared and unique lock down the call stack. This is vexatious and
+// shameful.
+
+// Wanting to avoid heaping shame and vexation upon myself, I threw
+// this class together.
+
+// This class makes no attempt to support atomic upgrade or
+// downgrade. I don't want either. Matt has convinced me that if you
+// think you want them you've usually made a mistake somewhere. It is
+// exactly and only a reification of the state held on a shared mutex.
+
+/// Tag type selecting unique (exclusive) ownership of the mutex.
+struct acquire_unique_t { };
+
+/// Tag type selecting shared ownership of the mutex.
+struct acquire_shared_t { };
+
+constexpr acquire_unique_t acquire_unique { };  // tag value for the acquire_unique_t overloads
+constexpr acquire_shared_t acquire_shared { };  // tag value for the acquire_shared_t overloads
+
+template<typename Mutex>
+class shunique_lock {  // records whether we hold nothing, unique or shared ownership of *m
+public:
+  typedef Mutex mutex_type;
+  typedef std::unique_lock<Mutex> unique_lock_type;
+  typedef std::shared_lock<Mutex> shared_lock_type;
+
+  shunique_lock() noexcept : m(nullptr), o(ownership::none) { }
+
+  // We do not provide a default locking/try_locking constructor that
+  // takes only the mutex, since it is not clear whether to take it
+  // shared or unique. We explicitly require the use of std::defer_lock
+  // to prevent Nasty Surprises.
+
+  shunique_lock(mutex_type& m, std::defer_lock_t) noexcept
+    : m(&m), o(ownership::none) { }
+
+  shunique_lock(mutex_type& m, acquire_unique_t)
+    : m(&m), o(ownership::none) {
+    lock();
+  }
+
+  shunique_lock(mutex_type& m, acquire_shared_t)
+    : m(&m), o(ownership::none) {
+    lock_shared();
+  }
+
+  template<typename AcquireType>
+  shunique_lock(mutex_type& m, AcquireType at, std::try_to_lock_t)
+    : m(&m), o(ownership::none) {
+    try_lock(at);
+  }
+
+  shunique_lock(mutex_type& m, acquire_unique_t, std::adopt_lock_t)
+    : m(&m), o(ownership::unique) {
+    // You'd better actually have a lock, or I will find you and I
+    // will hunt you down.
+  }
+
+  shunique_lock(mutex_type& m, acquire_shared_t, std::adopt_lock_t)
+    : m(&m), o(ownership::shared) {
+  }
+
+  template<typename AcquireType, typename Clock, typename Duration>
+  shunique_lock(mutex_type& m, AcquireType at,
+		const std::chrono::time_point<Clock, Duration>& t)
+    : m(&m), o(ownership::none) {
+    try_lock_until(at, t);
+  }
+
+  template<typename AcquireType, typename Rep, typename Period>
+  shunique_lock(mutex_type& m, AcquireType at,
+		const std::chrono::duration<Rep, Period>& dur)
+    : m(&m), o(ownership::none) {
+    try_lock_for(at, dur);
+  }
+
+  ~shunique_lock() {  // releases whatever ownership is still held
+    switch (o) {
+    case ownership::none:
+      return;
+      break;
+    case ownership::unique:
+      m->unlock();
+      break;
+    case ownership::shared:
+      m->unlock_shared();
+      break;
+    }
+  }
+
+  shunique_lock(shunique_lock const&) = delete;
+  shunique_lock& operator=(shunique_lock const&) = delete;
+
+  shunique_lock(shunique_lock&& l) noexcept : shunique_lock() {
+    swap(l);
+  }
+
+  shunique_lock(unique_lock_type&& l) noexcept {
+    if (l.owns_lock())
+      o = ownership::unique;
+    else
+      o = ownership::none;
+    m = l.release();
+  }
+
+  shunique_lock(shared_lock_type&& l) noexcept {
+    if (l.owns_lock())
+      o = ownership::shared;
+    else
+      o = ownership::none;
+    m = l.release();
+  }
+
+  shunique_lock& operator=(shunique_lock&& l) noexcept {
+    shunique_lock(std::move(l)).swap(*this);  // the temporary's destructor drops our old state
+    return *this;
+  }
+
+  shunique_lock& operator=(unique_lock_type&& l) noexcept {
+    shunique_lock(std::move(l)).swap(*this);
+    return *this;
+  }
+
+  shunique_lock& operator=(shared_lock_type&& l) noexcept {
+    shunique_lock(std::move(l)).swap(*this);
+    return *this;
+  }
+
+  void lock() {
+    lockable();
+    m->lock();
+    o = ownership::unique;
+  }
+
+  void lock_shared() {
+    lockable();
+    m->lock_shared();
+    o = ownership::shared;
+  }
+
+  void lock(ceph::acquire_unique_t) {
+    lock();
+  }
+
+  void lock(ceph::acquire_shared_t) {
+    lock_shared();
+  }
+
+  bool try_lock() {
+    lockable();
+    if (m->try_lock()) {
+      o = ownership::unique;
+      return true;
+    }
+    return false;
+  }
+
+  bool try_lock_shared() {
+    lockable();
+    if (m->try_lock_shared()) {
+      o = ownership::shared;
+      return true;
+    }
+    return false;
+  }
+
+  bool try_lock(ceph::acquire_unique_t) {
+    return try_lock();
+  }
+
+  bool try_lock(ceph::acquire_shared_t) {
+    return try_lock_shared();
+  }
+
+  template<typename Rep, typename Period>
+  bool try_lock_for(const std::chrono::duration<Rep, Period>& dur) {
+    lockable();
+    if (m->try_lock_for(dur)) {
+      o = ownership::unique;
+      return true;
+    }
+    return false;
+  }
+
+  template<typename Rep, typename Period>
+  bool try_lock_shared_for(const std::chrono::duration<Rep, Period>& dur) {
+    lockable();
+    if (m->try_lock_shared_for(dur)) {
+      o = ownership::shared;
+      return true;
+    }
+    return false;
+  }
+
+  template<typename Rep, typename Period>
+  bool try_lock_for(ceph::acquire_unique_t,
+		    const std::chrono::duration<Rep, Period>& dur) {
+    return try_lock_for(dur);
+  }
+
+  template<typename Rep, typename Period>
+  bool try_lock_for(ceph::acquire_shared_t,
+		    const std::chrono::duration<Rep, Period>& dur) {
+    return try_lock_shared_for(dur);
+  }
+
+  template<typename Clock, typename Duration>
+  bool try_lock_until(const std::chrono::time_point<Clock, Duration>& time) {
+    lockable();
+    if (m->try_lock_until(time)) {
+      o = ownership::unique;
+      return true;
+    }
+    return false;
+  }
+
+  template<typename Clock, typename Duration>
+  bool try_lock_shared_until(const std::chrono::time_point<Clock,
+			     Duration>& time) {
+    lockable();
+    if (m->try_lock_shared_until(time)) {
+      o = ownership::shared;
+      return true;
+    }
+    return false;
+  }
+
+  template<typename Clock, typename Duration>
+  bool try_lock_until(ceph::acquire_unique_t,
+		      const std::chrono::time_point<Clock, Duration>& time) {
+    return try_lock_until(time);
+  }
+
+  template<typename Clock, typename Duration>
+  bool try_lock_until(ceph::acquire_shared_t,
+		      const std::chrono::time_point<Clock, Duration>& time) {
+    return try_lock_shared_until(time);
+  }
+
+  // Only have a single unlock method. Otherwise we'd be building an
+  // Acme lock class suitable only for ravenous coyotes desperate to
+  // devour a road runner. It would be bad. It would be disgusting. It
+  // would be infelicitous as heck. It would leave our developers in a
+  // state of seeming safety unaware of the yawning chasm of failure
+  // that had opened beneath their feet that would soon transition
+  // into a sickening realization of the error they made and a brief
+  // moment of blinking self pity before their program hurled itself
+  // into undefined behaviour and plummeted up the stack with core
+  // dumps trailing behind it.
+
+  void unlock() {
+    switch (o) {
+    case ownership::none:
+      throw std::system_error((int)std::errc::resource_deadlock_would_occur,
+			      std::generic_category());
+      break;
+
+    case ownership::unique:
+      m->unlock();
+      break;
+
+    case ownership::shared:
+      m->unlock_shared();
+      break;
+    }
+    o = ownership::none;
+  }
+
+  // Setters
+
+  void swap(shunique_lock& u) noexcept {
+    std::swap(m, u.m);
+    std::swap(o, u.o);
+  }
+
+  mutex_type* release() noexcept {  // forget the mutex without unlocking it
+    o = ownership::none;
+    mutex_type* tm = m;
+    m = nullptr;
+    return tm;
+  }
+
+  // Ideally I'd rather make a move constructor for std::unique_lock
+  // that took a shunique_lock, but obviously I can't.
+  unique_lock_type release_to_unique() {
+    if (o == ownership::unique) {
+      o = ownership::none;
+      unique_lock_type tu(*m, std::adopt_lock);
+      m = nullptr;
+      return tu;
+    } else if (m == nullptr) {  // no mutex: must be tested before *m is formed below
+      return unique_lock_type();
+    } else if (o == ownership::none) {
+      unique_lock_type tu(*m, std::defer_lock);
+      m = nullptr;
+      return tu;
+    }
+    throw std::system_error((int)std::errc::operation_not_permitted,
+			    std::generic_category());
+  }
+
+  // Hands our state to a std::shared_lock; throws if we hold the mutex unique.
+  shared_lock_type release_to_shared() {
+    if (o == ownership::shared) {
+      o = ownership::none;
+      shared_lock_type ts(*m, std::adopt_lock);
+      m = nullptr;
+      return ts;
+    } else if (m == nullptr) {  // no mutex: must be tested before *m is formed below
+      return shared_lock_type();
+    } else if (o == ownership::none) {
+      shared_lock_type ts(*m, std::defer_lock);
+      m = nullptr;
+      return ts;
+    }
+    throw std::system_error((int)std::errc::operation_not_permitted,
+			    std::generic_category());
+  }
+
+  // Getters
+
+  // Note that this returns true only if the lock is held UNIQUE; it
+  // returns false for shared
+  bool owns_lock() const noexcept {
+    return o == ownership::unique;
+  }
+
+  bool owns_lock_shared() const noexcept {
+    return o == ownership::shared;
+  }
+
+  // If you want to make sure you have a lock of some sort on the
+  // mutex, just treat as a bool.
+  explicit operator bool() const noexcept {
+    return o != ownership::none;
+  }
+
+  mutex_type* mutex() const noexcept {
+    return m;
+  }
+
+private:
+  void lockable() const {  // throws unless we have a mutex and hold nothing on it yet
+    if (m == nullptr)
+      throw std::system_error((int)std::errc::operation_not_permitted,
+			      std::generic_category());
+    if (o != ownership::none)
+      throw std::system_error((int)std::errc::resource_deadlock_would_occur,
+			      std::generic_category());
+  }
+
+  mutex_type*	m;  // associated mutex, or nullptr when there is none
+  enum struct ownership : uint8_t {
+    none, unique, shared
+      };
+  ownership o;  // what we currently hold on *m
+};
+} // namespace ceph
+
+namespace std {
+  // Swaps two shunique_locks in place; by reference, since the type is move-only.
+  template<typename Mutex>
+  void swap(ceph::shunique_lock<Mutex>& sh1, ceph::shunique_lock<Mutex>& sha) noexcept {
+    sh1.swap(sha);
+  }
+} // namespace std
+
+#endif // CEPH_COMMON_SHUNIQUE_LOCK_H
diff --git a/src/common/signal.cc b/src/common/signal.cc
new file mode 100644
index 00000000..31e33d46
--- /dev/null
+++ b/src/common/signal.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/BackTrace.h"
+#include "common/perf_counters.h"
+#include "global/pidfile.h"
+#include "common/debug.h"
+#include "common/signal.h"
+#include "common/config.h"
+
+#include <signal.h>
+#include <sstream>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+std::string signal_mask_to_str()
+{
+  // A NULL new-mask argument makes pthread_sigmask() only report the current mask.
+  sigset_t old_sigset;
+  if (pthread_sigmask(SIG_SETMASK, NULL, &old_sigset)) {
+    return "(pthread_sigmask failed)";
+  }
+  ostringstream oss;
+  oss << "show_signal_mask: { ";
+  string sep("");
+  for (int signum = 1; signum < NSIG; ++signum) {  // 0 is not a valid signal number
+    if (sigismember(&old_sigset, signum) == 1) {
+      oss << sep << signum;
+      sep = ", ";
+    }
+  }
+  oss << " }";
+  return oss.str();
+}
+
+/* Add signals to the calling thread's blocked set.  'siglist' is a
+ * 0-terminated array of signal numbers; NULL means "every signal".
+ * 'old_sigset' is handed to pthread_sigmask() to receive the prior mask. */
+void block_signals(const int *siglist, sigset_t *old_sigset)
+{
+  sigset_t to_block;
+  if (siglist == NULL) {
+    sigfillset(&to_block);
+  } else {
+    sigemptyset(&to_block);
+    // Walk the list up to (not including) its 0 terminator.
+    for (const int *sig = siglist; *sig != 0; ++sig) {
+      sigaddset(&to_block, *sig);
+    }
+  }
+  int ret = pthread_sigmask(SIG_BLOCK, &to_block, old_sigset);
+  ceph_assert(ret == 0);
+}
+
+void restore_sigset(const sigset_t *old_sigset)
+{  // installs *old_sigset as the calling thread's mask via SIG_SETMASK
+  int ret = pthread_sigmask(SIG_SETMASK, old_sigset, NULL);
+  ceph_assert(ret == 0);  // failure is asserted, not returned
+}
+
+void unblock_all_signals(sigset_t *old_sigset)
+{  // unblocks a full signal set (minus SIGKILL) for the calling thread
+  sigset_t sigset;
+  sigfillset(&sigset);
+  sigdelset(&sigset, SIGKILL);  // SIGKILL is left out of the set passed to SIG_UNBLOCK
+  int ret = pthread_sigmask(SIG_UNBLOCK, &sigset, old_sigset);  // old_sigset receives the prior mask
+  ceph_assert(ret == 0);  // failure is asserted, not returned
+}
diff --git a/src/common/signal.h b/src/common/signal.h
new file mode 100644
index 00000000..4b323de4
--- /dev/null
+++ b/src/common/signal.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_SIGNAL_H
+#define CEPH_COMMON_SIGNAL_H
+
+#include <signal.h>
+#include <string>
+
+// Returns a string showing the set of blocked signals for the calling thread.
+// Other threads may have a different set (the signal mask is per-thread).
+extern std::string signal_mask_to_str();
+
+// Block a list of signals. If siglist == NULL, blocks all signals.
+// If not, the list is terminated with a 0 element.
+//
+// The previous set of blocked signals is stored in old_sigset. A
+// pthread_sigmask() failure trips an assertion in the implementation;
+// it is not reported to the caller.
+extern void block_signals(const int *siglist, sigset_t *old_sigset);
+
+// Restore the set of blocked signals from old_sigset. The implementation
+// asserts that pthread_sigmask() succeeds.
+extern void restore_sigset(const sigset_t *old_sigset);
+
+// Unblock all signals. The previous set of blocked signals is stored in
+// old_sigset. A pthread_sigmask() failure trips an assertion in the
+// implementation; it is not reported to the caller.
+extern void unblock_all_signals(sigset_t *old_sigset);
+
+#endif
diff --git a/src/common/simple_cache.hpp b/src/common/simple_cache.hpp
new file mode 100644
index 00000000..6e8a452e
--- /dev/null
+++ b/src/common/simple_cache.hpp
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_SIMPLECACHE_H
+#define CEPH_SIMPLECACHE_H
+
+#include "common/ceph_mutex.h"
+#include "include/unordered_map.h"
+
+template <class K, class V, class C = std::less<K>, class H = std::hash<K> >
+class SimpleLRU {
+  ceph::mutex lock = ceph::make_mutex("SimpleLRU::lock");
+  size_t max_size;         // trim_cache() evicts while contents.size() exceeds this
+  size_t max_bytes = 0;    // trim_cache_bytes() evicts while total_bytes exceeds this
+  size_t total_bytes = 0;  // running V::length() total kept by the *_bytes paths and clear()
+  ceph::unordered_map<K, typename std::list<std::pair<K, V> >::iterator, H> contents;
+  std::list<std::pair<K, V> > lru;  // front = most recently used, back = next victim
+  std::map<K, V, C> pinned;
+
+  // Evict from the back of lru until at most max_size keys remain indexed.
+  void trim_cache() {
+    while (contents.size() > max_size && !lru.empty()) {
+      contents.erase(lru.back().first);
+      lru.pop_back();
+    }
+  }
+  // Evict from the back of lru until total_bytes fits max_bytes or lru is empty.
+  void trim_cache_bytes() {
+    while (total_bytes > max_bytes && !lru.empty()) {
+      total_bytes -= lru.back().second.length();
+      contents.erase(lru.back().first);
+      lru.pop_back();
+    }
+  }
+  // Make (key, value) the most recently used entry, replacing a node already cached for key.
+  void _add(K key, V&& value) {
+    auto old = contents.find(key);
+    if (old != contents.end())
+      lru.erase(old->second);  // otherwise the stale node would linger in lru
+    lru.emplace_front(key, std::move(value)); // can't move key because we access it below
+    contents[key] = lru.begin();
+    trim_cache();
+  }
+  // As _add(), but keeps total_bytes in step and trims by bytes instead of by count.
+  void _add_bytes(K key, V&& value) {
+    auto old = contents.find(key);
+    if (old != contents.end()) {
+      total_bytes -= old->second->second.length();  // the replaced value no longer counts
+      lru.erase(old->second);
+    }
+    lru.emplace_front(key, std::move(value)); // can't move key because we access it below
+    contents[key] = lru.begin();
+    trim_cache_bytes();
+  }
+
+public:
+  SimpleLRU(size_t max_size) : max_size(max_size) {
+    contents.rehash(max_size);
+  }
+  void pin(K key, V val) {  // stores val under key in pinned (no effect if key is already pinned)
+    std::lock_guard l(lock);
+    pinned.emplace(std::move(key), std::move(val));
+  }
+  // Unpin every entry with key <= e; each is added to the LRU, or refreshed there if cached.
+  void clear_pinned(K e) {
+    std::lock_guard l(lock);
+    for (auto i = pinned.begin();
+	 i != pinned.end() && i->first <= e; pinned.erase(i++)) {
+      auto iter = contents.find(i->first);
+      if (iter == contents.end())
+	_add(i->first, std::move(i->second));
+      else
+	lru.splice(lru.begin(), lru, iter->second);
+    }
+  }
+  // Drop key from the LRU if cached, subtracting its length() from total_bytes.
+  void clear(K key) {
+    std::lock_guard l(lock);
+    auto i = contents.find(key);
+    if (i == contents.end())
+      return;
+    total_bytes -= i->second->second.length();
+    lru.erase(i->second);
+    contents.erase(i);
+  }
+  void set_size(size_t new_size) {  // new entry-count limit; trims immediately
+    std::lock_guard l(lock);
+    max_size = new_size;
+    trim_cache();
+  }
+  size_t get_size() {  // number of keys currently indexed in contents
+    std::lock_guard l(lock);
+    return contents.size();
+  }
+  void set_bytes(size_t num_bytes) {  // new byte limit; trims immediately
+    std::lock_guard l(lock);
+    max_bytes = num_bytes;
+    trim_cache_bytes();
+  }
+  size_t get_bytes() {  // current value of total_bytes
+    std::lock_guard l(lock);
+    return total_bytes;
+  }
+  // Copy key's value into *out: true if cached (refreshing its LRU slot) or pinned.
+  bool lookup(K key, V *out) {
+    std::lock_guard l(lock);
+    auto i = contents.find(key);
+    if (i != contents.end()) {
+      *out = i->second->second;
+      lru.splice(lru.begin(), lru, i->second);
+      return true;
+    }
+    auto i_pinned = pinned.find(key);
+    if (i_pinned != pinned.end()) {
+      *out = i_pinned->second;
+      return true;
+    }
+    return false;
+  }
+  // Cache value under key, limited by entry count (max_size).
+  void add(K key, V value) {
+    std::lock_guard l(lock);
+    _add(std::move(key), std::move(value));
+  }
+  // Cache value under key, limited by bytes (max_bytes); requires V::length().
+  void add_bytes(K key, V value) {
+    std::lock_guard l(lock);
+    total_bytes += value.length();
+    _add_bytes(std::move(key), std::move(value));
+  }
+};
+
+#endif
diff --git a/src/common/snap_types.cc b/src/common/snap_types.cc
new file mode 100644
index 00000000..3e33583d
--- /dev/null
+++ b/src/common/snap_types.cc
@@ -0,0 +1,95 @@
+
+#include "snap_types.h"
+#include "common/Formatter.h"
+
+void SnapRealmInfo::encode(bufferlist& bl) const
+{
+  h.num_snaps = my_snaps.size();  // refresh the header's counts from the vectors before encoding it
+  h.num_prior_parent_snaps = prior_parent_snaps.size();
+  using ceph::encode;
+  encode(h, bl);
+  encode_nohead(my_snaps, bl);  // presumably elements only; decode() takes the counts from h
+  encode_nohead(prior_parent_snaps, bl);
+}
+
+void SnapRealmInfo::decode(bufferlist::const_iterator& bl)
+{
+  using ceph::decode;
+  decode(h, bl);  // header first: it supplies both element counts used below
+  decode_nohead(h.num_snaps, my_snaps, bl);
+  decode_nohead(h.num_prior_parent_snaps, prior_parent_snaps, bl);
+}
+
+void SnapRealmInfo::dump(Formatter *f) const
+{
+  // Scalar header fields first, then one array section per snapshot list.
+  f->dump_unsigned("ino", ino());
+  f->dump_unsigned("parent", parent());
+  f->dump_unsigned("seq", seq());
+  f->dump_unsigned("parent_since", parent_since());
+  f->dump_unsigned("created", created());
+  f->open_array_section("snaps");
+  for (const auto& snap : my_snaps)
+    f->dump_unsigned("snap", snap);
+  f->close_section();
+
+  f->open_array_section("prior_parent_snaps");
+  for (const auto& snap : prior_parent_snaps)
+    f->dump_unsigned("snap", snap);
+  f->close_section();
+}
+
+void SnapRealmInfo::generate_test_instances(list<SnapRealmInfo*>& o)
+{
+  o.push_back(new SnapRealmInfo);  // default-constructed (zeroed header, no snaps)
+  o.push_back(new SnapRealmInfo(1, 10, 10, 0));  // header fields only
+  o.push_back(new SnapRealmInfo(1, 10, 10, 0));  // same header plus one own snap
+  o.back()->my_snaps.push_back(10);
+  o.push_back(new SnapRealmInfo(1, 10, 10, 5));  // parent_since 5, own and prior parent snaps
+  o.back()->my_snaps.push_back(10);
+  o.back()->prior_parent_snaps.push_back(3);
+  o.back()->prior_parent_snaps.push_back(5);
+}
+
+
+// -----
+
+bool SnapContext::is_valid() const
+{
+  // seq itself must be a valid snapid
+  if (seq > CEPH_MAXSNAP)
+    return false;
+  // an empty snap list imposes no further constraints
+  if (snaps.empty())
+    return true;
+  // the first snap may not exceed seq
+  if (snaps[0] > seq)
+    return false;
+  // every snap must be strictly smaller than its predecessor, and nothing
+  // may follow a 0 entry
+  for (unsigned i = 1; i < snaps.size(); ++i) {
+    if (snaps[i] >= snaps[i - 1] || snaps[i - 1] == 0)
+      return false;
+  }
+  return true;
+}
+
+void SnapContext::dump(Formatter *f) const
+{
+  // seq as a scalar, followed by the snap ids as one array section.
+  f->dump_unsigned("seq", seq);
+  f->open_array_section("snaps");
+  for (const auto& snap : snaps) f->dump_unsigned("snap", snap);
+  f->close_section();
+}
+
+void SnapContext::generate_test_instances(list<SnapContext*>& o)
+{
+  o.push_back(new SnapContext);  // default-constructed
+  vector<snapid_t> v;
+  o.push_back(new SnapContext(10, v));  // seq 10, no snaps
+  v.push_back(18);
+  v.push_back(3);
+  v.push_back(1);
+  o.push_back(new SnapContext(20, v));  // seq 20, snaps 18, 3, 1 (descending)
+}
diff --git a/src/common/snap_types.h b/src/common/snap_types.h
new file mode 100644
index 00000000..0f526993
--- /dev/null
+++ b/src/common/snap_types.h
@@ -0,0 +1,79 @@
+#ifndef __CEPH_SNAP_TYPES_H
+#define __CEPH_SNAP_TYPES_H
+
+#include "include/types.h"
+#include "include/fs_types.h"
+
+namespace ceph {
+
+class Formatter;
+}
+struct SnapRealmInfo {
+  mutable ceph_mds_snap_realm h;  // mutable: encode() const refreshes its snap counts
+  vector<snapid_t> my_snaps;
+  vector<snapid_t> prior_parent_snaps;  // before parent_since
+
+  SnapRealmInfo() {
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(&h, 0, sizeof(h));
+  }
+  SnapRealmInfo(inodeno_t ino_, snapid_t created_, snapid_t seq_, snapid_t current_parent_since_) {
+    // FIPS zeroization audit 20191115: this memset is not security related.
+    memset(&h, 0, sizeof(h));
+    h.ino = ino_;
+    h.created = created_;
+    h.seq = seq_;
+    h.parent_since = current_parent_since_;
+  }
+  
+  inodeno_t ino() const { return inodeno_t(h.ino); }  // typed views of the raw header fields
+  inodeno_t parent() const { return inodeno_t(h.parent); }
+  snapid_t seq() const { return snapid_t(h.seq); }
+  snapid_t parent_since() const { return snapid_t(h.parent_since); }
+  snapid_t created() const { return snapid_t(h.created); }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<SnapRealmInfo*>& o);
+};
+WRITE_CLASS_ENCODER(SnapRealmInfo)
+
+
+struct SnapContext {
+  snapid_t seq;            // 'time' stamp
+  vector<snapid_t> snaps;  // existent snaps, in descending order
+
+  SnapContext() {}
+  SnapContext(snapid_t s, const vector<snapid_t>& v) : seq(s), snaps(v) {}
+
+  bool is_valid() const;
+
+  void clear() {
+    seq = 0;
+    snaps.clear();
+  }
+  bool empty() const { return seq == 0; }  // const so it can be called on a const SnapContext
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(seq, bl);
+    encode(snaps, bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(seq, bl);
+    decode(snaps, bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<SnapContext*>& o);
+};
+WRITE_CLASS_ENCODER(SnapContext)
+
+inline ostream& operator<<(ostream& out, const SnapContext& snapc) {
+  return out << snapc.seq << "=" << snapc.snaps;  // streams seq, then "=", then snaps
+}
+
+//}
+
+#endif
diff --git a/src/common/solaris_errno.cc b/src/common/solaris_errno.cc
new file mode 100644
index 00000000..bb68e68f
--- /dev/null
+++ b/src/common/solaris_errno.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/types.h"
+
+
+// Converts a negative Linux errno value, as used by Ceph, to the host OS's value.
+__s32 ceph_to_hostos_errno(__s32 r) 
+{
+  if (r < -34) {  // -34 and above (including non-negative r) are returned unchanged
+    switch (r) {
+      case -35:
+        return -EDEADLK;
+      case -36:
+        return -ENAMETOOLONG;
+      case -37:
+        return -ENOLCK;
+      case -38:
+        return -ENOSYS;
+      case -39:
+        return -ENOTEMPTY;
+      case -40:
+        return -ELOOP;
+      case -42:
+        return -ENOMSG;
+      case -43:
+        return -EIDRM;
+      case -44:
+        return -ECHRNG;
+      case -45:
+        return -EL2NSYNC;
+      case -46:
+        return -EL3HLT;
+      case -47:
+        return -EL3RST;
+      case -48:
+        return -ELNRNG;
+      case -49:
+        return -EUNATCH;
+      case -50:
+        return -ENOCSI;
+      case -51:
+        return -EL2HLT;
+      case -52:
+        return -EBADE;
+      case -53:
+        return -EBADR;
+      case -54:
+        return -EXFULL;
+      case -55:
+        return -ENOANO;
+      case -56:
+        return -EBADRQC;
+      case -57:
+        return -EBADSLT;
+      case -59:
+        return -EBFONT;
+      case -60:
+        return -ENOSTR;
+      case -61:
+        return -ENODATA;
+      case -62:
+        return -ETIME;
+      case -63:
+        return -ENOSR;
+      //case -64:
+      //  return -EPERM; //TODO ENONET
+      //case -65:
+      //  return -EPERM; //TODO ENOPKG
+      //case -66:
+      //  return -EREMOTE;
+      //case -67:
+      //  return -ENOLINK;
+      //case -68:
+      //  return -EPERM; //TODO EADV 
+      //case -69:
+      //  return -EPERM; //TODO ESRMNT 
+      //case -70:
+      //  return -EPERM; //TODO ECOMM
+      case -71:
+        return -EPROTO;
+      case -72:
+        return -EMULTIHOP;
+      case -73:
+        return -EPERM; //TODO EDOTDOT 
+      case -74:
+        return -EBADMSG;
+      case -75:
+        return -EOVERFLOW;
+      case -76:
+        return -ENOTUNIQ;
+      case -77:
+        return -EBADFD;
+      case -78:
+        return -EREMCHG;
+      case -79:
+        return -ELIBACC;
+      case -80:
+        return -ELIBBAD;
+      case -81:
+        return -ELIBSCN;
+      case -82:
+        return -ELIBMAX;
+      case -83:
+	return -ELIBEXEC;
+      case -84:
+        return -EILSEQ;
+      case -85:
+        return -ERESTART;
+      case -86:
+        return -ESTRPIPE; 
+      case -87:
+        return -EUSERS;
+      case -88:
+        return -ENOTSOCK;
+      case -89:
+        return -EDESTADDRREQ;
+      case -90:
+        return -EMSGSIZE;
+      case -91:
+        return -EPROTOTYPE;
+      case -92:
+        return -ENOPROTOOPT;
+      case -93:
+        return -EPROTONOSUPPORT;
+      case -94:
+        return -ESOCKTNOSUPPORT;
+      case -95:
+        return -EOPNOTSUPP;
+      case -96:
+        return -EPFNOSUPPORT;
+      case -97:
+        return -EAFNOSUPPORT;
+      case -98:
+        return -EADDRINUSE;
+      case -99:
+        return -EADDRNOTAVAIL;
+      case -100:
+        return -ENETDOWN;
+      case -101:
+        return -ENETUNREACH;
+      case -102:
+        return -ENETRESET;
+      case -103:
+        return -ECONNABORTED;
+      case -104:
+        return -ECONNRESET;
+      case -105:
+        return -ENOBUFS;
+      case -106:
+        return -EISCONN;
+      case -107:
+        return -ENOTCONN;
+      case -108:
+        return -ESHUTDOWN;
+      case -109:
+        return -ETOOMANYREFS;
+      case -110:
+        return -ETIMEDOUT;
+      case -111:
+        return -ECONNREFUSED;
+      case -112:
+        return -EHOSTDOWN;
+      case -113:
+        return -EHOSTUNREACH;
+      case -114:
+        return -EALREADY;
+      case -115:
+        return -EINPROGRESS;
+      case -116:
+        return -ESTALE;
+      case -117:
+        return -EPERM; //TODO EUCLEAN 
+      case -118:
+        return -EPERM; //TODO ENOTNAM
+      case -119:
+        return -EPERM; //TODO ENAVAIL
+      case -120:
+        return -EPERM; //TODO EISNAM
+      case -121:
+        return -EPERM; //TODO EREMOTEIO
+      case -122:
+        return -EDQUOT;
+      case -123:
+        return -EPERM; //TODO ENOMEDIUM
+      case -124:
+        return -EPERM; //TODO EMEDIUMTYPE - not used
+      case -125:
+        return -ECANCELED;
+      case -126:
+        return -EPERM; //TODO ENOKEY
+      case -127:
+        return -EPERM; //TODO EKEYEXPIRED
+      case -128:
+        return -EPERM; //TODO EKEYREVOKED
+      case -129:
+        return -EPERM; //TODO EKEYREJECTED
+      case -130:
+        return -EOWNERDEAD;
+      case -131:
+        return -ENOTRECOVERABLE;
+      case -132:
+        return -EPERM; //TODO ERFKILL
+      case -133:
+        return -EPERM; //TODO EHWPOISON
+
+      default: {  // values with no case above fall through to "return r"
+        break;
+      }
+    }
+  } 
+  return r; // otherwise return original value
+}
+
+// Converts host OS errno values to Linux/Ceph values.
+// XXX Not worked out yet: currently returns r unchanged.
+__s32 hostos_to_ceph_errno(__s32 r)
+{
+  return r;
+}
+
+
diff --git a/src/common/sstring.hh b/src/common/sstring.hh
new file mode 100644
index 00000000..fe5409ec
--- /dev/null
+++ b/src/common/sstring.hh
@@ -0,0 +1,717 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2014 Cloudius Systems
+ */
+/*
+ * C++2014 dependencies removed.  Uses of std::string_view adapted to
+ * boost::string_view.  Matt Benjamin <mbenjamin@redhat.com>
+ */
+
+#ifndef SSTRING_HH_
+#define SSTRING_HH_
+
+#include <type_traits>
+#include <boost/utility/string_view.hpp>
+
+#include "include/buffer.h"
+#include "include/denc.h"
+
+template <typename char_type, typename Size, Size max_size>
+class basic_sstring;
+
+using sstring = basic_sstring<char, uint32_t, 15>;
+
+template <typename string_type = sstring, typename T>
+inline string_type to_sstring(T value);
+
+template <typename char_type, typename Size, Size max_size>
+class basic_sstring {
+    static_assert(
+            (std::is_same<char_type, char>::value
+             || std::is_same<char_type, signed char>::value
+             || std::is_same<char_type, unsigned char>::value),
+            "basic_sstring only supports single byte char types");
+    union contents {
+        struct external_type {
+            char_type* str;
+            Size size;
+            int8_t pad;
+        } external;
+        struct internal_type {
+            char_type str[max_size];
+            int8_t size;
+        } internal;
+        static_assert(sizeof(external_type) <= sizeof(internal_type), "max_size too small");
+        static_assert(max_size <= 127, "max_size too large");
+    } u;
+    bool is_internal() const noexcept {
+        return u.internal.size >= 0;
+    }
+    bool is_external() const noexcept {
+        return !is_internal();
+    }
+    const char_type* str() const {
+        return is_internal() ? u.internal.str : u.external.str;
+    }
+    char_type* str() {
+        return is_internal() ? u.internal.str : u.external.str;
+    }
+
+    /**
+     * Format a single value through a printf-style format string.
+     *
+     * @param value  the value to format
+     * @param fmt    printf conversion matching T (e.g. "%d", "%g")
+     * @return a string_type holding the formatted text
+     * @throws std::overflow_error if formatting fails or the text does not
+     *         fit the scratch buffer
+     */
+    template <typename string_type, typename T>
+    static inline string_type to_sstring_sprintf(T value, const char* fmt) {
+        // Scratch space: three characters per byte of T plus two extra.
+        char tmp[sizeof(value) * 3 + 2];
+        // snprintf never writes past tmp; it returns the length the full
+        // output would have had, so truncation is detectable below.
+        const int len = std::snprintf(tmp, sizeof(tmp), fmt, value);
+        if (len < 0 || static_cast<size_t>(len) >= sizeof(tmp)) {
+            throw std::overflow_error("sstring overflow");
+        }
+        using ch_type = typename string_type::value_type;
+        return string_type(reinterpret_cast<ch_type*>(tmp), len);
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(int value) {
+        return to_sstring_sprintf<string_type>(value, "%d");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(unsigned value) {
+        return to_sstring_sprintf<string_type>(value, "%u");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(long value) {
+        return to_sstring_sprintf<string_type>(value, "%ld");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(unsigned long value) {
+        return to_sstring_sprintf<string_type>(value, "%lu");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(long long value) {
+        return to_sstring_sprintf<string_type>(value, "%lld");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(unsigned long long value) {
+        return to_sstring_sprintf<string_type>(value, "%llu");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(float value) {
+        return to_sstring_sprintf<string_type>(value, "%g");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(double value) {
+        return to_sstring_sprintf<string_type>(value, "%g");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(long double value) {
+        return to_sstring_sprintf<string_type>(value, "%Lg");
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(const char* value) {
+        return string_type(value);
+    }
+
+    template <typename string_type>
+    static inline string_type to_sstring(sstring value) {
+        return value;
+    }
+
+public:
+    using value_type = char_type;
+    using traits_type = std::char_traits<char_type>;
+    using allocator_type = std::allocator<char_type>;
+    using reference = char_type&;
+    using const_reference = const char_type&;
+    using pointer = char_type*;
+    using const_pointer = const char_type*;
+    using iterator = char_type*;
+    using const_iterator = const char_type*;
+    // FIXME: add reverse_iterator and friends
+    using difference_type = ssize_t;  // std::make_signed_t<Size> can be too small
+    using size_type = Size;
+    static constexpr size_type  npos = static_cast<size_type>(-1);
+public:
+    struct initialized_later {};
+
+    basic_sstring() noexcept {
+        u.internal.size = 0;
+        u.internal.str[0] = '\0';
+    }
+    basic_sstring(const basic_sstring& x) {
+        if (x.is_internal()) {
+            u.internal = x.u.internal;
+        } else {
+            u.internal.size = -1;
+            u.external.str = reinterpret_cast<char_type*>(std::malloc(x.u.external.size + 1));
+            if (!u.external.str) {
+                throw std::bad_alloc();
+            }
+            std::copy(x.u.external.str, x.u.external.str + x.u.external.size + 1, u.external.str);
+            u.external.size = x.u.external.size;
+        }
+    }
+    basic_sstring(basic_sstring&& x) noexcept {
+        u = x.u;
+        x.u.internal.size = 0;
+        x.u.internal.str[0] = '\0';
+    }
+    basic_sstring(initialized_later, size_t size) {
+        if (size_type(size) != size) {
+            throw std::overflow_error("sstring overflow");
+        }
+        if (size + 1 <= sizeof(u.internal.str)) {
+            u.internal.str[size] = '\0';
+            u.internal.size = size;
+        } else {
+            u.internal.size = -1;
+            u.external.str = reinterpret_cast<char_type*>(std::malloc(size + 1));
+            if (!u.external.str) {
+                throw std::bad_alloc();
+            }
+            u.external.size = size;
+            u.external.str[size] = '\0';
+        }
+    }
+    basic_sstring(const char_type* x, size_t size) {
+        if (size_type(size) != size) {
+            throw std::overflow_error("sstring overflow");
+        }
+        if (size + 1 <= sizeof(u.internal.str)) {
+            std::copy(x, x + size, u.internal.str);
+            u.internal.str[size] = '\0';
+            u.internal.size = size;
+        } else {
+            u.internal.size = -1;
+            u.external.str = reinterpret_cast<char_type*>(std::malloc(size + 1));
+            if (!u.external.str) {
+                throw std::bad_alloc();
+            }
+            u.external.size = size;
+            std::copy(x, x + size, u.external.str);
+            u.external.str[size] = '\0';
+        }
+    }
+
+    basic_sstring(size_t size, char_type x) : basic_sstring(initialized_later(), size) {
+        memset(begin(), x, size);
+    }
+
+    basic_sstring(const char* x) : basic_sstring(reinterpret_cast<const char_type*>(x), std::strlen(x)) {}
+    basic_sstring(std::basic_string<char_type>& x) : basic_sstring(x.c_str(), x.size()) {}
+    basic_sstring(std::initializer_list<char_type> x) : basic_sstring(x.begin(), x.end() - x.begin()) {}
+    basic_sstring(const char_type* b, const char_type* e) : basic_sstring(b, e - b) {}
+    basic_sstring(const std::basic_string<char_type>& s)
+        : basic_sstring(s.data(), s.size()) {}
+    template <typename InputIterator>
+    basic_sstring(InputIterator first, InputIterator last)
+            : basic_sstring(initialized_later(), std::distance(first, last)) {
+        std::copy(first, last, begin());
+    }
+    ~basic_sstring() noexcept {
+        if (is_external()) {
+            std::free(u.external.str);
+        }
+    }
+    basic_sstring& operator=(const basic_sstring& x) {
+        basic_sstring tmp(x);
+        swap(tmp);
+        return *this;
+    }
+    basic_sstring& operator=(basic_sstring&& x) noexcept {
+        if (this != &x) {
+            swap(x);
+            x.reset();
+        }
+        return *this;
+    }
+    operator std::basic_string<char_type>() const {
+        return { str(), size() };
+    }
+    size_t size() const noexcept {
+        return is_internal() ? u.internal.size : u.external.size;
+    }
+
+    size_t length() const noexcept {
+        return size();
+    }
+
+    /**
+     * Find the first occurrence of character t at or after index pos.
+     *
+     * @return the index of the match, or npos if there is none (including
+     *         when pos is at or beyond the end of the string).
+     */
+    size_t find(char_type t, size_t pos = 0) const noexcept {
+        const char_type* data = str();
+        const size_t len = size();
+        // Index-based scan: never forms a pointer beyond the end of the
+        // buffer, even when pos > size().
+        for (size_t i = pos; i < len; ++i) {
+            if (data[i] == t) {
+                return i;
+            }
+        }
+        return npos;
+    }
+
+    /**
+     * Find the first occurrence of the string s starting at or after
+     * index pos.
+     *
+     * @return the index where the match begins, or npos if there is none.
+     *         As before, nothing is found once pos >= size(), even for an
+     *         empty s.
+     */
+    size_t find(const basic_sstring& s, size_t pos = 0) const noexcept {
+        const char_type* hay = str();
+        const char_type* needle = s.str();
+        const size_t hay_len = size();
+        const size_t needle_len = s.size();
+        // Index-based scan: never forms a pointer beyond the end of the
+        // buffer, even when pos > size().
+        for (size_t start = pos; start < hay_len; ++start) {
+            if (needle_len > hay_len - start) {
+                break;  // s can no longer fit in what is left
+            }
+            if (std::equal(needle, needle + needle_len, hay + start)) {
+                return start;
+            }
+        }
+        return npos;
+    }
+
+    /**
+     * find_last_of finds the last occurrence of c in the string.
+     * When pos is specified, the search only includes characters
+     * at or before position pos.  Returns npos when c is not found
+     * or the string is empty.
+     */
+    size_t find_last_of (char_type c, size_t pos = npos) const noexcept {
+        const size_t len = size();
+        if (len == 0) {
+            return npos;
+        }
+        const char_type* data = str();
+        // Clamp the starting index to the last character of the string.
+        size_t idx = (pos >= len) ? len - 1 : pos;
+        for (;;) {
+            if (data[idx] == c) {
+                return idx;
+            }
+            if (idx == 0) {
+                return npos;
+            }
+            --idx;
+        }
+    }
+
+    /**
+     *  Append the first n characters of a C string.
+     *  @param s  Pointer to the characters to append.
+     *  @param n  How many characters to take from s.
+     *  @return  Reference to this string.
+     */
+    basic_sstring& append (const char_type* s, size_t n) {
+        // Build the combined string in a fresh object, then move it in.
+        basic_sstring grown(initialized_later(), size() + n);
+        char_type* tail = std::copy(begin(), end(), grown.begin());
+        std::copy(s, s + n, tail);
+        *this = std::move(grown);
+        return *this;
+    }
+
+    /**
+     *  Replace characters with a value of a C style substring.
+     *
+     */
+    basic_sstring& replace(size_type pos, size_type n1, const char_type* s,
+             size_type n2) {
+        if (pos > size()) {
+            throw std::out_of_range("sstring::replace out of range");
+        }
+
+        if (n1 > size() - pos) {
+            n1 = size() - pos;
+        }
+
+        if (n1 == n2) {
+            if (n2) {
+                std::copy(s, s + n2, begin() + pos);
+            }
+            return *this;
+        }
+        basic_sstring ret(initialized_later(), size() + n2 - n1);
+        char_type* p= ret.begin();
+        std::copy(begin(), begin() + pos, p);
+        p += pos;
+        if (n2) {
+            std::copy(s, s + n2, p);
+        }
+        p += n2;
+        std::copy(begin() + pos + n1, end(), p);
+        *this = std::move(ret);
+        return *this;
+    }
+
+    template <class InputIterator>
+    basic_sstring& replace (const_iterator i1, const_iterator i2,
+            InputIterator first, InputIterator last) {
+        if (i1 < begin() || i1 > end() || i2 < begin()) {
+            throw std::out_of_range("sstring::replace out of range");
+        }
+        if (i2 > end()) {
+            i2 = end();
+        }
+
+        if (i2 - i1 == last - first) {
+            //in place replacement
+            std::copy(first, last, const_cast<char_type*>(i1));
+            return *this;
+        }
+        basic_sstring ret(initialized_later(), size() + (last - first) - (i2 - i1));
+        char_type* p = ret.begin();
+        p = std::copy(cbegin(), i1, p);
+        p = std::copy(first, last, p);
+        std::copy(i2, cend(), p);
+        *this = std::move(ret);
+        return *this;
+    }
+
+    iterator erase(iterator first, iterator last) {
+        size_t pos = first - begin();
+        replace(pos, last - first, nullptr, 0);
+        return begin() + pos;
+    }
+
+    /**
+     * Inserts additional characters into the string right before
+     * the character indicated by p.
+     */
+    template <class InputIterator>
+    void insert(const_iterator p, InputIterator beg, InputIterator end) {
+        replace(p, p, beg, end);
+    }
+
+    /**
+     *  Returns a read/write reference to the data at the last
+     *  element of the string.
+     *  This function shall not be called on empty strings.
+     */
+    reference
+    back() noexcept {
+        return operator[](size() - 1);
+    }
+
+    /**
+     *  Returns a read-only (constant) reference to the data at the last
+     *  element of the string.
+     *  This function shall not be called on empty strings.
+     */
+    const_reference
+    back() const noexcept {
+        return operator[](size() - 1);
+    }
+
+    basic_sstring substr(size_t from, size_t len = npos)  const {
+        if (from > size()) {
+            throw std::out_of_range("sstring::substr out of range");
+        }
+        if (len > size() - from) {
+            len = size() - from;
+        }
+        if (len == 0) {
+            return "";
+        }
+        return { str() + from , len };
+    }
+
+    const char_type& at(size_t pos) const {
+        if (pos >= size()) {
+            throw std::out_of_range("sstring::at out of range");
+        }
+        return *(str() + pos);
+    }
+
+    char_type& at(size_t pos) {
+        if (pos >= size()) {
+            throw std::out_of_range("sstring::at out of range");
+        }
+        return *(str() + pos);
+    }
+
+    bool empty() const noexcept {
+        return u.internal.size == 0;
+    }
+    void reset() noexcept {
+        if (is_external()) {
+            std::free(u.external.str);
+        }
+        u.internal.size = 0;
+        u.internal.str[0] = '\0';
+    }
+
+    // Three-way comparison against x: the common prefix is compared with
+    // traits_type::compare and its non-zero result is returned as is;
+    // if the prefixes are equal, the shorter string orders first
+    // (-1 / 0 / 1).
+    int compare(const basic_sstring& x) const noexcept {
+        const size_t lhs_len = size();
+        const size_t rhs_len = x.size();
+        const int prefix = traits_type::compare(begin(), x.begin(),
+                                                std::min(lhs_len, rhs_len));
+        if (prefix != 0) {
+            return prefix;
+        }
+        if (lhs_len == rhs_len) {
+            return 0;
+        }
+        return lhs_len < rhs_len ? -1 : 1;
+    }
+
+    int compare(size_t pos, size_t sz, const basic_sstring& x) const {
+        if (pos > size()) {
+            throw std::out_of_range("pos larger than string size");
+        }
+
+        sz = std::min(size() - pos, sz);
+        auto n = traits_type::compare(begin() + pos, x.begin(), std::min(sz, x.size()));
+        if (n != 0) {
+            return n;
+        }
+        if (sz < x.size()) {
+            return -1;
+        } else if (sz > x.size()) {
+            return 1;
+        } else {
+            return 0;
+        }
+    }
+
+    void swap(basic_sstring& x) noexcept {
+        contents tmp;
+        tmp = x.u;
+        x.u = u;
+        u = tmp;
+    }
+    const char_type* c_str() const {
+        return str();
+    }
+    const char_type* begin() const { return str(); }
+    const char_type* end() const { return str() + size(); }
+    const char_type* cbegin() const { return str(); }
+    const char_type* cend() const { return str() + size(); }
+    char_type* begin() { return str(); }
+    char_type* end() { return str() + size(); }
+    bool operator==(const basic_sstring& x) const {
+        return size() == x.size() && std::equal(begin(), end(), x.begin());
+    }
+    bool operator!=(const basic_sstring& x) const {
+        return !operator==(x);
+    }
+    bool operator<(const basic_sstring& x) const {
+        return compare(x) < 0;
+    }
+    basic_sstring operator+(const basic_sstring& x) const {
+        basic_sstring ret(initialized_later(), size() + x.size());
+        std::copy(begin(), end(), ret.begin());
+        std::copy(x.begin(), x.end(), ret.begin() + size());
+        return ret;
+    }
+    basic_sstring& operator+=(const basic_sstring& x) {
+        return *this = *this + x;
+    }
+    char_type& operator[](size_type pos) {
+        return str()[pos];
+    }
+    const char_type& operator[](size_type pos) const {
+        return str()[pos];
+    }
+    operator boost::basic_string_view<char_type, traits_type>() const {
+		return boost::basic_string_view<char_type, traits_type>(str(), size());
+    }
+    template <typename string_type, typename T>
+    friend inline string_type to_sstring(T value);
+};
+template <typename char_type, typename Size, Size max_size>
+constexpr Size basic_sstring<char_type, Size, max_size>::npos;
+
+template <typename char_type, typename size_type, size_type Max, size_type N>
+inline
+basic_sstring<char_type, size_type, Max>
+operator+(const char(&s)[N], const basic_sstring<char_type, size_type, Max>& t) {
+    using sstring = basic_sstring<char_type, size_type, Max>;
+    // don't copy the terminating NUL character
+    sstring ret(typename sstring::initialized_later(), N-1 + t.size());
+    auto p = std::copy(std::begin(s), std::end(s)-1, ret.begin());
+    std::copy(t.begin(), t.end(), p);
+    return ret;
+}
+
+template <size_t N>
+static inline
+size_t str_len(const char(&s)[N]) { return N - 1; }
+
+template <size_t N>
+static inline
+const char* str_begin(const char(&s)[N]) { return s; }
+
+template <size_t N>
+static inline
+const char* str_end(const char(&s)[N]) { return str_begin(s) + str_len(s); }
+
+template <typename char_type, typename size_type, size_type max_size>
+static inline
+const char_type* str_begin(const basic_sstring<char_type, size_type, max_size>& s) { return s.begin(); }
+
+template <typename char_type, typename size_type, size_type max_size>
+static inline
+const char_type* str_end(const basic_sstring<char_type, size_type, max_size>& s) { return s.end(); }
+
+template <typename char_type, typename size_type, size_type max_size>
+static inline
+size_type str_len(const basic_sstring<char_type, size_type, max_size>& s) { return s.size(); }
+
+template <typename First, typename Second, typename... Tail>
+static inline
+size_t str_len(const First& first, const Second& second, const Tail&... tail) {
+    return str_len(first) + str_len(second, tail...);
+}
+
+template <typename char_type, typename size_type, size_type max_size>
+inline
+void swap(basic_sstring<char_type, size_type, max_size>& x,
+          basic_sstring<char_type, size_type, max_size>& y) noexcept
+{
+    return x.swap(y);
+}
+
+template <typename char_type, typename size_type, size_type max_size, typename char_traits>
+inline
+std::basic_ostream<char_type, char_traits>&
+operator<<(std::basic_ostream<char_type, char_traits>& os,
+        const basic_sstring<char_type, size_type, max_size>& s) {
+    return os.write(s.begin(), s.size());
+}
+
+template <typename char_type, typename size_type, size_type max_size, typename char_traits>
+inline
+std::basic_istream<char_type, char_traits>&
+operator>>(std::basic_istream<char_type, char_traits>& is,
+        basic_sstring<char_type, size_type, max_size>& s) {
+    std::string tmp;
+    is >> tmp;
+    s = tmp;
+    return is;
+}
+
+namespace std {
+
+template <typename char_type, typename size_type, size_type max_size>
+struct hash<basic_sstring<char_type, size_type, max_size>> {
+    size_t operator()(const basic_sstring<char_type, size_type, max_size>& s) const {
+		using traits_type = std::char_traits<char_type>;
+		return std::hash<boost::basic_string_view<char_type,traits_type>>()(s);
+    }
+};
+
+}
+
+static inline
+char* copy_str_to(char* dst) {
+    return dst;
+}
+
+template <typename Head, typename... Tail>
+static inline
+char* copy_str_to(char* dst, const Head& head, const Tail&... tail) {
+    return copy_str_to(std::copy(str_begin(head), str_end(head), dst), tail...);
+}
+
+// Concatenate all arguments (string literals and/or basic_sstrings) into
+// one String.  The result is sized once from str_len() of the arguments
+// and then filled in by copy_str_to().
+template <typename String = sstring, typename... Args>
+static String make_sstring(Args&&... args)
+{
+    // Use String's own tag type (not sstring's) so that instantiations
+    // other than the default sstring can be constructed as well.
+    String ret(typename String::initialized_later(), str_len(args...));
+    copy_str_to(ret.begin(), args...);
+    return ret;
+}
+
+template <typename string_type, typename T>
+inline string_type to_sstring(T value) {
+    return sstring::to_sstring<string_type>(value);
+}
+
+
+// encode/decode
+template <typename Char, typename Size, Size Max>
+struct denc_traits<basic_sstring<Char, Size, Max>> {
+private:
+  using value_type = basic_sstring<Char, Size, Max>;
+public:
+  static constexpr bool supported = true;
+  static constexpr bool featured = false;
+  static constexpr bool bounded = false;
+
+  static void bound_encode(const value_type& s, size_t& p, uint64_t f=0) {
+    p += sizeof(Size) + s.size();
+  }
+
+  static void encode_nohead(const value_type& s,
+                            buffer::list::contiguous_appender& p)
+  {
+    auto len = s.size();
+    if (len) {
+      p.append(reinterpret_cast<const char*>(s.c_str()), len);
+    }
+  }
+
+  static void decode_nohead(size_t len, value_type& s,
+                            buffer::ptr::const_iterator& p)
+  {
+    s.reset();
+    if (len) {
+      s.append(reinterpret_cast<const Char*>(p.get_pos_add(len)), len);
+    }
+  }
+
+  static void encode(const value_type& s,
+                     buffer::list::contiguous_appender& p,
+                     uint64_t f=0)
+  {
+    Size len = (Size)(s.size());
+    ::denc(len, p);
+    if (len) {
+      p.append(reinterpret_cast<const char*>(s.c_str()), len);
+    }
+  }
+
+  static void decode(value_type& s,
+                     buffer::ptr::const_iterator& p,
+                     uint64_t f=0)
+  {
+    Size len;
+    ::denc(len, p);
+    decode_nohead(len, s, p);
+  }
+};
+
+#if 0 /* XXX conflicts w/Ceph types.h */
+template <typename T>
+inline
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) {
+    bool first = true;
+    os << "{";
+    for (auto&& elem : v) {
+        if (!first) {
+            os << ", ";
+        } else {
+            first = false;
+        }
+        os << elem;
+    }
+    os << "}";
+    return os;
+}
+#endif
+
+#endif /* SSTRING_HH_ */
diff --git a/src/common/static_ptr.h b/src/common/static_ptr.h
new file mode 100644
index 00000000..c6d19b49
--- /dev/null
+++ b/src/common/static_ptr.h
@@ -0,0 +1,444 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <cstddef>
+#include <utility>
+#include <type_traits>
+
+namespace ceph {
+// `static_ptr`
+// ===========
+//
+// It would be really nice if polymorphism didn't require a bunch of
+// mucking about with the heap. So let's build something where we
+// don't have to do that.
+//
+namespace _mem {
+
+// This, an operator function, is one of the canonical ways to do type
+// erasure in C++. So long as all operations can be done with subsets
+// of the same arguments (which is not true for function type erasure),
+// it's a pretty good one.
+enum class op {
+  copy, move, destroy, size
+};
+// Type-erased operations on a T living at p1.  copy/move construct a new
+// T at p2 from it, destroy runs its destructor, and size reports
+// sizeof(T); every operation other than size returns 0.
+template<typename T>
+static std::size_t op_fun(op oper, void* p1, void* p2)
+{
+  T* const self = static_cast<T*>(p1);
+
+  if (oper == op::size) {
+    return sizeof(T);
+  }
+
+  if (oper == op::copy) {
+    // One conspicuous downside is that immovable/uncopyable types
+    // kill compilation right here, even if nobody ever calls the move
+    // or copy methods. Working around this is a pain, since we'd need
+    // four operator functions and a top-level class to
+    // provide/withhold copy/move operations as appropriate.
+    new (p2) T(*self);
+  } else if (oper == op::move) {
+    new (p2) T(std::move(*self));
+  } else if (oper == op::destroy) {
+    self->~T();
+  }
+  return 0;
+}
+}
+// The thing itself!
+//
+// The default value for Size may be wrong in almost all cases. You
+// can change it to your heart's content. The upside is that you'll
+// just get a compile error and you can bump it up.
+//
+// I *recommend* having a size constant in header files (or perhaps a
+// using declaration, e.g.
+// ```
+// using StaticFoo = static_ptr<Foo, sizeof(Blah)>;
+// ```
+// in some header file that can be used multiple places) so that when
+// you create a new derived class with a larger size, you only have to
+// change it in one place.
+//
+template<typename Base, std::size_t Size = sizeof(Base)>
+class static_ptr {
+  template<typename U, std::size_t S>
+  friend class static_ptr;
+
+  // Refuse to be set to anything whose type is incompatible with
+  // ours. Also never try to eat anything bigger than you are.
+  //
+  template<typename T, std::size_t S>
+  constexpr static int create_ward() noexcept {
+    static_assert(std::is_void_v<Base> ||
+                  std::is_base_of_v<Base, std::decay_t<T>>,
+                  "Value to store must be a derivative of the base.");
+    static_assert(S <= Size, "Value too large.");
+    static_assert(std::is_void_v<Base> || !std::is_const<Base>{} ||
+                  std::is_const_v<T>,
+                  "Cannot assign const pointer to non-const pointer.");
+    return 0;
+  }
+  // Here we can store anything that has the same signature, which is
+  // relevant to the multiple-versions for move/copy support that I
+  // mentioned above.
+  //
+  size_t (*operate)(_mem::op, void*, void*);
+
+  // This is mutable so that get and the dereference operators can be
+  // const. Since we're modeling a pointer, we should preserve the
+  // difference in semantics between a pointer-to-const and a const
+  // pointer.
+  //
+  mutable typename std::aligned_storage<Size>::type buf;
+
+public:
+  using element_type = Base;
+  using pointer = Base*;
+
+  // Empty
+  static_ptr() noexcept : operate(nullptr) {}
+  static_ptr(std::nullptr_t) noexcept : operate(nullptr) {}
+  static_ptr& operator =(std::nullptr_t) noexcept {
+    reset();
+    return *this;
+  }
+  ~static_ptr() noexcept {
+    reset();
+  }
+
+  // Since other pointer-ish types have it
+  void reset() noexcept {
+    if (operate) {
+      operate(_mem::op::destroy, &buf, nullptr);
+      operate = nullptr;
+    }
+  }
+
+  // Set from another static pointer.
+  //
+  // Since the templated versions don't count for overriding the defaults
+  static_ptr(const static_ptr& rhs)
+    noexcept(std::is_nothrow_copy_constructible_v<Base>) : operate(rhs.operate) {
+    if (operate) {
+      operate(_mem::op::copy, &rhs.buf, &buf);
+    }
+  }
+  static_ptr(static_ptr&& rhs)
+    noexcept(std::is_nothrow_move_constructible_v<Base>) : operate(rhs.operate) {
+    if (operate) {
+      operate(_mem::op::move, &rhs.buf, &buf);
+    }
+  }
+
+  template<typename U, std::size_t S>
+  static_ptr(const static_ptr<U, S>& rhs)
+    noexcept(std::is_nothrow_copy_constructible_v<U>) : operate(rhs.operate) {
+    create_ward<U, S>();
+    if (operate) {
+      operate(_mem::op::copy, &rhs.buf, &buf);
+    }
+  }
+  template<typename U, std::size_t S>
+  static_ptr(static_ptr<U, S>&& rhs)
+    noexcept(std::is_nothrow_move_constructible_v<U>) : operate(rhs.operate) {
+    create_ward<U, S>();
+    if (operate) {
+      operate(_mem::op::move, &rhs.buf, &buf);
+    }
+  }
+
+  // Assignment destroys the current value (if any) and then copy- or
+  // move-constructs rhs's value into our buffer. `operate` is only
+  // set after that construction returned, so a throwing constructor
+  // leaves us empty rather than claiming to own raw storage.
+  static_ptr& operator =(const static_ptr& rhs)
+    noexcept(std::is_nothrow_copy_constructible_v<Base>) {
+    if (this == &rhs) {
+      // Self-assignment: reset() below would destroy our only copy.
+      return *this;
+    }
+    reset();
+    if (rhs) {
+      rhs.operate(_mem::op::copy,
+	      const_cast<void*>(static_cast<const void*>(&rhs.buf)), &buf);
+      operate = rhs.operate;
+    }
+    return *this;
+  }
+  static_ptr& operator =(static_ptr&& rhs)
+    noexcept(std::is_nothrow_move_constructible_v<Base>) {
+    if (this == &rhs) {
+      // Self-move: reset() below would destroy the value being moved.
+      return *this;
+    }
+    reset();
+    if (rhs) {
+      rhs.operate(_mem::op::move, &rhs.buf, &buf);
+      operate = rhs.operate;
+    }
+    return *this;
+  }
+
+  // Converting assignments from a static_ptr of another type/size.
+  // create_ward() rejects incompatible or oversized sources at compile
+  // time.
+  template<typename U, std::size_t S>
+  static_ptr& operator =(const static_ptr<U, S>& rhs)
+    noexcept(std::is_nothrow_copy_constructible_v<U>) {
+    create_ward<U, S>();
+    reset();
+    if (rhs) {
+      rhs.operate(_mem::op::copy,
+	      const_cast<void*>(static_cast<const void*>(&rhs.buf)), &buf);
+      operate = rhs.operate;
+    }
+    return *this;
+  }
+  template<typename U, std::size_t S>
+  static_ptr& operator =(static_ptr<U, S>&& rhs)
+    noexcept(std::is_nothrow_move_constructible_v<U>) {
+    create_ward<U, S>();
+    reset();
+    if (rhs) {
+      rhs.operate(_mem::op::move, &rhs.buf, &buf);
+      operate = rhs.operate;
+    }
+    return *this;
+  }
+
+  // In-place construction!
+  //
+  // This is basically what you want, and I didn't include value
+  // construction because in-place construction renders it
+  // unnecessary. Also it doesn't fit the pointer idiom as well.
+  //
+  template<typename T, typename... Args>
+  static_ptr(std::in_place_type_t<T>, Args&& ...args)
+    noexcept(std::is_nothrow_constructible_v<T, Args...>)
+    : operate(&_mem::op_fun<T>){
+    static_assert((!std::is_nothrow_copy_constructible_v<Base> ||
+		   std::is_nothrow_copy_constructible_v<T>) &&
+		  (!std::is_nothrow_move_constructible_v<Base> ||
+		   std::is_nothrow_move_constructible_v<T>),
+		  "If declared type of static_ptr is nothrow "
+		  "move/copy constructible, then any "
+		  "type assigned to it must be as well. "
+		  "You can use reinterpret_pointer_cast "
+		  "to get around this limit, but don't "
+		  "come crying to me when the C++ "
+		  "runtime calls terminate().");
+    create_ward<T, sizeof(T)>();
+    new (&buf) T(std::forward<Args>(args)...);
+  }
+
+  // I occasionally get tempted to make an overload of the assignment
+  // operator that takes a tuple as its right-hand side to provide
+  // arguments.
+  //
+  // Destroy the current value (if any) and construct a T in place from
+  // args. `operate` is set only after the constructor returned, so if
+  // it throws the pointer is left empty instead of claiming to own an
+  // object that was never built.
+  template<typename T, typename... Args>
+  void emplace(Args&& ...args)
+    noexcept(std::is_nothrow_constructible_v<T, Args...>) {
+    create_ward<T, sizeof(T)>();
+    reset();
+    new (&buf) T(std::forward<Args>(args)...);
+    operate = &_mem::op_fun<T>;
+  }
+
+  // Access!
+  Base* get() const noexcept {
+    return operate ? reinterpret_cast<Base*>(&buf) : nullptr;
+  }
+  template<typename U = Base>
+  std::enable_if_t<!std::is_void_v<U>, Base*> operator->() const noexcept {
+    return get();
+  }
+  template<typename U = Base>
+  std::enable_if_t<!std::is_void_v<U>, Base&> operator *() const noexcept {
+    return *get();
+  }
+  operator bool() const noexcept {
+    return !!operate;
+  }
+
+  // Big wall of friendship
+  //
+  // Every *_pointer_cast helper is befriended in both its const-reference
+  // and rvalue-reference form: the helpers read the source's `buf` and
+  // `operate` and write the result's `buf` and `operate` directly.
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> static_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> static_pointer_cast(static_ptr<T, S>&& p);
+
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> dynamic_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> dynamic_pointer_cast(static_ptr<T, S>&& p);
+
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> const_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> const_pointer_cast(static_ptr<T, S>&& p);
+
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> reinterpret_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> reinterpret_pointer_cast(static_ptr<T, S>&& p);
+
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> resize_pointer_cast(const static_ptr<T, S>& p);
+  template<typename U, std::size_t Z, typename T, std::size_t S>
+  friend static_ptr<U, Z> resize_pointer_cast(static_ptr<T, S>&& p);
+};
+
+// These are all modeled after the same ones for shared pointer.
+//
+// Also I'm annoyed that the standard library doesn't have
+// *_pointer_cast overloads for a move-only unique pointer. It's a
+// nice idiom. Having to release and reconstruct is obnoxious.
+//
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> static_pointer_cast(const static_ptr<T, S>& p) {
+  static_assert(Z >= S,
+                "Value too large.");
+  static_ptr<U, Z> result;
+  // The static_cast is what makes an invalid conversion a compile error;
+  // at run time the test only keeps an empty source from being copied.
+  U* const as_target = static_cast<U*>(p.get());
+  if (!as_target) {
+    return result;
+  }
+  p.operate(_mem::op::copy, &p.buf, &result.buf);
+  result.operate = p.operate;
+  return result;
+}
+// Rvalue overload: same checks as the copying form, but the stored value
+// is transferred with the source's move operation.
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> static_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(Z >= S,
+                "Value too large.");
+  static_ptr<U, Z> result;
+  U* const as_target = static_cast<U*>(p.get());
+  if (!as_target) {
+    return result;
+  }
+  p.operate(_mem::op::move, &p.buf, &result.buf);
+  result.operate = p.operate;
+  return result;
+}
+
+// Here the conditional is actually important and ensures we have the
+// same behavior as dynamic_cast.
+//
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> dynamic_pointer_cast(const static_ptr<T, S>& p) {
+  static_assert(Z >= S,
+                "Value too large.");
+  static_ptr<U, Z> result;
+  // A failed dynamic_cast (or an empty source) yields an empty result.
+  U* const as_target = dynamic_cast<U*>(p.get());
+  if (!as_target) {
+    return result;
+  }
+  p.operate(_mem::op::copy, &p.buf, &result.buf);
+  result.operate = p.operate;
+  return result;
+}
+// Rvalue overload: moves the stored value when the dynamic_cast succeeds,
+// otherwise returns an empty pointer and leaves the source untouched.
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> dynamic_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(Z >= S,
+                "Value too large.");
+  static_ptr<U, Z> result;
+  U* const as_target = dynamic_cast<U*>(p.get());
+  if (!as_target) {
+    return result;
+  }
+  p.operate(_mem::op::move, &p.buf, &result.buf);
+  result.operate = p.operate;
+  return result;
+}
+
+// Copying const_cast between static_ptr types; an empty source gives an
+// empty result.
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> const_pointer_cast(const static_ptr<T, S>& p) {
+  static_assert(Z >= S,
+                "Value too large.");
+  static_ptr<U, Z> result;
+  U* const as_target = const_cast<U*>(p.get());
+  if (!as_target) {
+    return result;
+  }
+  p.operate(_mem::op::copy, &p.buf, &result.buf);
+  result.operate = p.operate;
+  return result;
+}
+// Moving const_cast between static_ptr types; an empty source gives an
+// empty result.
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> const_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(Z >= S,
+                "Value too large.");
+  static_ptr<U, Z> result;
+  U* const as_target = const_cast<U*>(p.get());
+  if (!as_target) {
+    return result;
+  }
+  p.operate(_mem::op::move, &p.buf, &result.buf);
+  result.operate = p.operate;
+  return result;
+}
+
+// I'm not sure if anyone will ever use this. I can imagine situations
+// where they might. It works, though!
+//
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> reinterpret_pointer_cast(const static_ptr<T, S>& p) {
+  static_assert(Z >= S,
+                "Value too large.");
+  static_ptr<U, Z> r;
+  // An empty source has a null `operate`; calling through it would be a
+  // null function-pointer call, so an empty source simply yields an empty
+  // result, as the other casts already do.
+  if (p.operate) {
+    p.operate(_mem::op::copy, &p.buf, &r.buf);
+    r.operate = p.operate;
+  }
+  return r;
+}
+// Rvalue overload: moves the stored value into a static_ptr of an
+// unrelated declared type.
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> reinterpret_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(Z >= S,
+                "Value too large.");
+  static_ptr<U, Z> r;
+  // An empty source has a null `operate`; guard the call so an empty
+  // source yields an empty result instead of a null function-pointer call.
+  if (p.operate) {
+    p.operate(_mem::op::move, &p.buf, &r.buf);
+    r.operate = p.operate;
+  }
+  return r;
+}
+
+// This is the only way to move from a bigger static pointer into a
+// smaller static pointer. The size of the total data stored in the
+// pointer is checked at runtime and if the destination size is large
+// enough, we copy it over.
+//
+// I follow cast semantics. Since this is a pointer-like type, it
+// returns a null value rather than throwing.
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> resize_pointer_cast(const static_ptr<T, S>& p) {
+  static_assert(std::is_same_v<U, T>,
+                "resize_pointer_cast only changes size, not type.");
+  static_ptr<U, Z> r;
+  // `operate` is null for an empty source, so it must be tested before it
+  // is asked for the stored size. The copy only happens when the stored
+  // value fits into the Z bytes of the destination.
+  if (p.operate && Z >= p.operate(_mem::op::size, &p.buf, nullptr)) {
+    p.operate(_mem::op::copy, &p.buf, &r.buf);
+    r.operate = p.operate;
+  }
+  return r;
+}
+// Rvalue overload: moves the stored value when it fits into Z bytes,
+// otherwise returns an empty pointer.
+template<typename U, std::size_t Z, typename T, std::size_t S>
+static_ptr<U, Z> resize_pointer_cast(static_ptr<T, S>&& p) {
+  static_assert(std::is_same_v<U, T>,
+                "resize_pointer_cast only changes size, not type.");
+  static_ptr<U, Z> r;
+  // `operate` is null for an empty source, so it must be tested before it
+  // is asked for the stored size.
+  if (p.operate && Z >= p.operate(_mem::op::size, &p.buf, nullptr)) {
+    p.operate(_mem::op::move, &p.buf, &r.buf);
+    r.operate = p.operate;
+  }
+  return r;
+}
+
+// Null test. The pointer is taken by const reference: taking it by value
+// would copy the stored object merely to ask whether one is present.
+template<typename Base, std::size_t Size>
+bool operator ==(const static_ptr<Base, Size>& s, std::nullptr_t) {
+  return !s;
+}
+// Null test with the operands reversed. The pointer is taken by const
+// reference so the stored object is not copied for the comparison.
+template<typename Base, std::size_t Size>
+bool operator ==(std::nullptr_t, const static_ptr<Base, Size>& s) {
+  return !s;
+}
+
+// Since `make_unique` and `make_shared` exist, we should follow their
+// lead.
+//
+// Builds a static_ptr<Base, Size> holding a Derived constructed in place
+// from the forwarded arguments.
+template<typename Base, typename Derived = Base,
+         std::size_t Size = sizeof(Derived), typename... Args>
+static_ptr<Base, Size> make_static(Args&& ...args) {
+  using result_t = static_ptr<Base, Size>;
+  return result_t{ std::in_place_type<Derived>, std::forward<Args>(args)... };
+}
+}
diff --git a/src/common/str_list.cc b/src/common/str_list.cc
new file mode 100644
index 00000000..33a1157d
--- /dev/null
+++ b/src/common/str_list.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2009-2010 Dreamhost
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/str_list.h"
+
+using std::string;
+using std::vector;
+using std::set;
+using std::list;
+using ceph::for_each_substr;
+
+// Replaces the contents of `str_list` with the tokens for_each_substr
+// reports for `str` and `delims`, in the order they are reported.
+void get_str_list(const string& str, const char *delims, list<string>& str_list)
+{
+  str_list.clear();
+  auto append_token = [&str_list] (auto token) {
+    str_list.emplace_back(token.begin(), token.end());
+  };
+  for_each_substr(str, delims, append_token);
+}
+
+// Same as the three-argument form, using the delimiter set ";,= \t".
+void get_str_list(const string& str, list<string>& str_list)
+{
+  get_str_list(str, ";,= \t", str_list);
+}
+
+// Value-returning convenience form of get_str_list.
+list<string> get_str_list(const string& str, const char *delims)
+{
+  list<string> tokens;
+  get_str_list(str, delims, tokens);
+  return tokens;
+}
+
+// Replaces the contents of `str_vec` with the tokens for_each_substr
+// reports for `str` and `delims`, in the order they are reported.
+void get_str_vec(const string& str, const char *delims, vector<string>& str_vec)
+{
+  str_vec.clear();
+  auto append_token = [&str_vec] (auto token) {
+    str_vec.emplace_back(token.begin(), token.end());
+  };
+  for_each_substr(str, delims, append_token);
+}
+
+// Same as the three-argument form, using the delimiter set ";,= \t".
+void get_str_vec(const string& str, vector<string>& str_vec)
+{
+  get_str_vec(str, ";,= \t", str_vec);
+}
+
+// Value-returning convenience form of get_str_vec.
+vector<string> get_str_vec(const string& str, const char *delims)
+{
+  vector<string> tokens;
+  get_str_vec(str, delims, tokens);
+  return tokens;
+}
+
+// Replaces the contents of `str_set` with the tokens for_each_substr
+// reports for `str` and `delims`; duplicates collapse because the target
+// is a set.
+void get_str_set(const string& str, const char *delims, set<string>& str_set)
+{
+  str_set.clear();
+  auto insert_token = [&str_set] (auto token) {
+    str_set.emplace(token.begin(), token.end());
+  };
+  for_each_substr(str, delims, insert_token);
+}
+
+// Same as the three-argument form, using the delimiter set ";,= \t".
+void get_str_set(const string& str, set<string>& str_set)
+{
+  get_str_set(str, ";,= \t", str_set);
+}
+
+// Value-returning convenience form of get_str_set.
+set<string> get_str_set(const string& str, const char *delims)
+{
+  set<string> tokens;
+  get_str_set(str, delims, tokens);
+  return tokens;
+}
diff --git a/src/common/str_map.cc b/src/common/str_map.cc
new file mode 100644
index 00000000..947ad21a
--- /dev/null
+++ b/src/common/str_map.cc
@@ -0,0 +1,159 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+
+#include "include/str_map.h"
+#include "include/str_list.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include "json_spirit/json_spirit.h"
+
+using namespace std;
+
+// Fills *str_map from `str`, which is first parsed as JSON.
+//
+// Returns 0 on success. Returns -EINVAL (after writing a message to `ss`)
+// when the JSON value is not an object, and -EINVAL without a message
+// when parsing fails and `fallback_to_plain` is false. When parsing fails
+// and `fallback_to_plain` is true, `str` is handed to get_str_map() with
+// the delimiters "\t\n " instead.
+int get_json_str_map(
+    const string &str,
+    ostream &ss,
+    map<string,string> *str_map,
+    bool fallback_to_plain)
+{
+  json_spirit::mValue json;
+  try {
+    // JSON is attempted first
+    json_spirit::read_or_throw(str, json);
+
+    if (json.type() != json_spirit::obj_type) {
+      ss << str << " must be a JSON object but is of type "
+         << json.type() << " instead";
+      return -EINVAL;
+    }
+
+    json_spirit::mObject members = json.get_obj();
+    for (auto& member : members) {
+      (*str_map)[member.first] = member.second.get_str();
+    }
+  } catch (json_spirit::Error_position &e) {
+    if (!fallback_to_plain) {
+      return -EINVAL;
+    }
+    // not JSON: treat the input as key=value text
+    get_str_map(str, str_map, "\t\n ");
+  }
+  return 0;
+}
+
+// Copy of `str` with characters for which std::isspace() is true removed
+// from both ends.
+string trim(const string& str) {
+  const auto is_space = [](unsigned char c) {
+    return std::isspace(c);
+  };
+  return boost::algorithm::trim_copy_if(str, is_space);
+}
+
+// Splits `str` with get_str_list() and stores each piece in *str_map.
+//
+// A piece containing '=' becomes key -> value, both trimmed; a piece
+// without '=' is stored untrimmed as a key with an empty value. Always
+// returns 0.
+int get_str_map(
+    const string &str,
+    map<string,string> *str_map,
+    const char *delims)
+{
+  list<string> pairs;
+  get_str_list(str, delims, pairs);
+  for (const string& item : pairs) {
+    const size_t eq_pos = item.find('=');
+    if (eq_pos == string::npos) {
+      (*str_map)[item] = string();
+      continue;
+    }
+    const string key = trim(item.substr(0, eq_pos));
+    const string value = trim(item.substr(eq_pos + 1));
+    (*str_map)[key] = value;
+  }
+  return 0;
+}
+
+// Looks `key` up in `str_map`.
+//
+// Present with a non-empty value: returns the value.
+// Present with an empty value:     returns the key itself.
+// Absent:                          returns *def_val, or an empty string
+//                                  when def_val is null.
+string get_str_map_value(
+    const map<string,string> &str_map,
+    const string &key,
+    const string *def_val)
+{
+  const auto found = str_map.find(key);
+  if (found == str_map.end()) {
+    return def_val ? *def_val : string();
+  }
+  const string &value = found->second;
+  return value.empty() ? found->first : value;
+}
+
+// Returns the value stored under `key`; if `key` is absent and
+// `fallback_key` is non-null, the value stored under *fallback_key; an
+// empty string when neither lookup succeeds.
+string get_str_map_key(
+    const map<string,string> &str_map,
+    const string &key,
+    const string *fallback_key)
+{
+  auto found = str_map.find(key);
+  if (found == str_map.end() && fallback_key != nullptr) {
+    found = str_map.find(*fallback_key);
+  }
+  if (found == str_map.end()) {
+    return string();
+  }
+  return found->second;
+}
+
+// This function's only purpose is to check whether a given map has only
+// ONE key with an empty value (which would mean that 'get_str_map()' read
+// a map in the form of 'VALUE', without any KEY/VALUE pairs) and, in such
+// event, to assign said 'VALUE' to a given 'def_key', such that we end up
+// with a map of the form "m = { 'def_key' : 'VALUE' }" instead of the
+// original "m = { 'VALUE' : '' }".
+// Parses `str` with get_str_map(); if the result is a single entry whose
+// value is empty, that entry is replaced by def_key -> (its former key).
+// Returns whatever get_str_map() returned. `oss` is not written to.
+int get_conf_str_map_helper(
+    const string &str,
+    ostringstream &oss,
+    map<string,string> *m,
+    const string &def_key)
+{
+  const int r = get_str_map(str, m);
+  if (r < 0) {
+    return r;
+  }
+  if (m->size() != 1) {
+    return r;
+  }
+
+  map<string,string>::iterator only = m->begin();
+  if (only->second.empty()) {
+    string s = only->first;
+    m->erase(s);
+    (*m)[def_key] = s;
+  }
+  return r;
+}
diff --git a/src/common/strtol.cc b/src/common/strtol.cc
new file mode 100644
index 00000000..9b437596
--- /dev/null
+++ b/src/common/strtol.cc
@@ -0,0 +1,319 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "strtol.h"
+
+#include <climits>
+#include <limits>
+#include <cmath>
+#include <sstream>
+#include <string_view>
+
+using std::ostringstream;
+
+// Parses the whole of `str` as an integer in the given base.
+//
+// On success *err is cleared and the value is returned. If nothing could
+// be parsed, if characters are left over, or if strtoll() sets errno,
+// *err receives a message and 0 is returned.
+long long strict_strtoll(std::string_view str, int base, std::string *err)
+{
+  // strtoll() needs a NUL-terminated buffer, which a string_view does not
+  // guarantee. Parsing a private copy keeps the scan from running past
+  // the view or picking up digits that lie just beyond it.
+  const std::string buf{str};
+  const char *const begin = buf.c_str();
+  char *endptr = nullptr;
+  errno = 0; /* To distinguish success/failure after call (see man page) */
+  const long long ret = strtoll(begin, &endptr, base);
+  if (endptr == begin || endptr != begin + buf.size()) {
+    *err = (std::string{"Expected option value to be integer, got '"} +
+            buf + "'");
+    return 0;
+  }
+  if (errno) {
+    *err = (std::string{"The option value '"} + buf +
+            "' seems to be invalid");
+    return 0;
+  }
+  *err = "";
+  return ret;
+}
+
+// C-string convenience overload; forwards to the string_view form.
+long long strict_strtoll(const char *str, int base, std::string *err)
+{
+  const std::string_view view(str);
+  return strict_strtoll(view, base, err);
+}
+
+// Like strict_strtoll(), but additionally rejects values outside
+// [INT_MIN, INT_MAX]. Returns 0 with *err set on any failure.
+int strict_strtol(std::string_view str, int base, std::string *err)
+{
+  const long long wide = strict_strtoll(str, base, err);
+  if (!err->empty()) {
+    return 0;
+  }
+  const bool fits_int = (wide >= INT_MIN) && (wide <= INT_MAX);
+  if (fits_int) {
+    return static_cast<int>(wide);
+  }
+  ostringstream errStr;
+  errStr << "The option value '" << str << "' seems to be invalid";
+  *err = errStr.str();
+  return 0;
+}
+
+// C-string convenience overload; forwards to the string_view form.
+int strict_strtol(const char *str, int base, std::string *err)
+{
+  const std::string_view view(str);
+  return strict_strtol(view, base, err);
+}
+
+// Parses `str` as a double.
+//
+// On success *err is cleared and the value is returned. On range error,
+// when nothing could be parsed, or when characters follow the number,
+// *err receives a message and 0 is returned.
+double strict_strtod(std::string_view str, std::string *err)
+{
+  // strtod() needs a NUL-terminated buffer, which a string_view does not
+  // guarantee; parse a private copy so characters beyond the view can
+  // neither be read nor become part of the result.
+  const std::string buf{str};
+  const char *const begin = buf.c_str();
+  char *endptr = nullptr;
+  errno = 0; /* To distinguish success/failure after call (see man page) */
+  const double ret = strtod(begin, &endptr);
+  if (errno == ERANGE) {
+    ostringstream oss;
+    oss << "strict_strtod: floating point overflow or underflow parsing '"
+        << str << "'";
+    *err = oss.str();
+    return 0.0;
+  }
+  // Compare addresses: nothing was consumed if strtod() left endptr at
+  // the start of the buffer.
+  if (endptr == begin) {
+    ostringstream oss;
+    oss << "strict_strtod: expected double, got: '" << str << "'";
+    *err = oss.str();
+    return 0;
+  }
+  if (*endptr != '\0') {
+    ostringstream oss;
+    oss << "strict_strtod: garbage at end of string. got: '" << str << "'";
+    *err = oss.str();
+    return 0;
+  }
+  *err = "";
+  return ret;
+}
+
+// C-string convenience overload; forwards to the string_view form.
+double strict_strtod(const char *str, std::string *err)
+{
+  const std::string_view view(str);
+  return strict_strtod(view, err);
+}
+
+// Parses `str` as a float.
+//
+// On success *err is cleared and the value is returned. On range error,
+// when nothing could be parsed, or when characters follow the number,
+// *err receives a message and 0 is returned.
+float strict_strtof(std::string_view str, std::string *err)
+{
+  // strtof() needs a NUL-terminated buffer, which a string_view does not
+  // guarantee; parse a private copy so characters beyond the view can
+  // neither be read nor become part of the result.
+  const std::string buf{str};
+  const char *const begin = buf.c_str();
+  char *endptr = nullptr;
+  errno = 0; /* To distinguish success/failure after call (see man page) */
+  const float ret = strtof(begin, &endptr);
+  if (errno == ERANGE) {
+    ostringstream oss;
+    oss << "strict_strtof: floating point overflow or underflow parsing '"
+        << str << "'";
+    *err = oss.str();
+    return 0.0;
+  }
+  // Compare addresses: nothing was consumed if strtof() left endptr at
+  // the start of the buffer.
+  if (endptr == begin) {
+    ostringstream oss;
+    oss << "strict_strtof: expected float, got: '" << str << "'";
+    *err = oss.str();
+    return 0;
+  }
+  if (*endptr != '\0') {
+    ostringstream oss;
+    oss << "strict_strtof: garbage at end of string. got: '" << str << "'";
+    *err = oss.str();
+    return 0;
+  }
+  *err = "";
+  return ret;
+}
+
+// C-string convenience overload; forwards to the string_view form.
+float strict_strtof(const char *str, std::string *err)
+{
+  const std::string_view view(str);
+  return strict_strtof(view, err);
+}
+
+// Parses an integer followed by an optional binary unit suffix and
+// returns it as T.
+//
+// The suffix starts at the first character that is not a digit or sign.
+// Its first character selects the scale: K, M, G, T, P, E shift the value
+// left by 10, 20, ..., 60 bits; B leaves it unscaled. A suffix may be at
+// most two characters long and "Bi" is rejected. On any error *err is
+// set and 0 is returned.
+template<typename T>
+T strict_iec_cast(std::string_view str, std::string *err)
+{
+  if (str.empty()) {
+    *err = "strict_iecstrtoll: value not specified";
+    return 0;
+  }
+  // get a view of the unit and of the value
+  std::string_view unit;
+  std::string_view n = str;
+  size_t u = str.find_first_not_of("0123456789-+");
+  int m = 0;
+  // deal with unit prefix if there is one
+  if (u != std::string_view::npos) {
+    n = str.substr(0, u);
+    unit = str.substr(u, str.length() - u);
+    // we accept both old si prefixes as well as the proper iec prefixes
+    // i.e. K, M, ... and Ki, Mi, ...
+    if (unit.back() == 'i') {
+      if (unit.front() == 'B') {
+        *err = "strict_iecstrtoll: illegal prefix \"Bi\"";
+        return 0;
+      }
+    }
+    if (unit.length() > 2) {
+      *err = "strict_iecstrtoll: illegal prefix (length > 2)";
+      return 0;
+    }
+    switch(unit.front()) {
+      case 'K':
+        m = 10;
+        break;
+      case 'M':
+        m = 20;
+        break;
+      case 'G':
+        m = 30;
+        break;
+      case 'T':
+        m = 40;
+        break;
+      case 'P':
+        m = 50;
+        break;
+      case 'E':
+        m = 60;
+        break;
+      case 'B':
+        break;
+      default:
+        *err = "strict_iecstrtoll: unit prefix not recognized";
+        return 0;
+    }
+  }
+
+  long long ll = strict_strtoll(n, 10, err);
+  if (ll < 0 && !std::numeric_limits<T>::is_signed) {
+    *err = "strict_iecstrtoll: value should not be negative";
+    return 0;
+  }
+  if (static_cast<unsigned>(m) >= sizeof(T) * CHAR_BIT) {
+    *err = ("strict_iecstrtoll: the IEC prefix is too large for the designated "
+        "type");
+    return 0;
+  }
+  using promoted_t = typename std::common_type<decltype(ll), T>::type;
+  if (static_cast<promoted_t>(ll) <
+      static_cast<promoted_t>(std::numeric_limits<T>::min()) >> m) {
+    *err = "strict_iecstrtoll: value seems to be too small";
+    return 0;
+  }
+  if (static_cast<promoted_t>(ll) >
+      static_cast<promoted_t>(std::numeric_limits<T>::max()) >> m) {
+    *err = "strict_iecstrtoll: value seems to be too large";
+    return 0;
+  }
+  // Scale by multiplying in the promoted type rather than shifting `ll`:
+  // left-shifting a negative signed value is undefined in C++17. The two
+  // range checks above guarantee the product fits in T.
+  return static_cast<T>(static_cast<promoted_t>(ll) *
+                        (static_cast<promoted_t>(1) << m));
+}
+
+// Explicit instantiations of the string_view overload for the integer
+// types listed here.
+template int strict_iec_cast<int>(std::string_view str, std::string *err);
+template long strict_iec_cast<long>(std::string_view str, std::string *err);
+template long long strict_iec_cast<long long>(std::string_view str, std::string *err);
+template uint64_t strict_iec_cast<uint64_t>(std::string_view str, std::string *err);
+template uint32_t strict_iec_cast<uint32_t>(std::string_view str, std::string *err);
+
+// strict_iec_cast() fixed to a uint64_t result.
+uint64_t strict_iecstrtoll(std::string_view str, std::string *err)
+{
+  const uint64_t value = strict_iec_cast<uint64_t>(str, err);
+  return value;
+}
+
+// C-string convenience overload of strict_iecstrtoll().
+uint64_t strict_iecstrtoll(const char *str, std::string *err)
+{
+  const std::string_view view(str);
+  return strict_iec_cast<uint64_t>(view, err);
+}
+
+// C-string overload; wraps `str` in a string_view and forwards.
+template<typename T>
+T strict_iec_cast(const char *str, std::string *err)
+{
+  const std::string_view view(str);
+  return strict_iec_cast<T>(view, err);
+}
+
+// Explicit instantiations of the C-string overload for the same integer
+// types as the string_view overload.
+template int strict_iec_cast<int>(const char *str, std::string *err);
+template long strict_iec_cast<long>(const char *str, std::string *err);
+template long long strict_iec_cast<long long>(const char *str, std::string *err);
+template uint64_t strict_iec_cast<uint64_t>(const char *str, std::string *err);
+template uint32_t strict_iec_cast<uint32_t>(const char *str, std::string *err);
+
+// Parses an integer followed by an optional decimal unit suffix and
+// returns it as T.
+//
+// A trailing K, M, G, T, P or E multiplies the value by 10^3, 10^6, ...,
+// 10^18. On any error *err is set and 0 is returned.
+template<typename T>
+T strict_si_cast(std::string_view str, std::string *err)
+{
+  if (str.empty()) {
+    *err = "strict_sistrtoll: value not specified";
+    return 0;
+  }
+  std::string_view n = str;
+  int m = 0;
+  // deal with unit prefix if there is one
+  if (str.find_first_not_of("0123456789+-") != std::string_view::npos) {
+    const char &u = str.back();
+    if (u == 'K')
+      m = 3;
+    else if (u == 'M')
+      m = 6;
+    else if (u == 'G')
+      m = 9;
+    else if (u == 'T')
+      m = 12;
+    else if (u == 'P')
+      m = 15;
+    else if (u == 'E')
+      m = 18;
+    else if (u != 'B') {
+      *err = "strict_si_cast: unit prefix not recognized";
+      return 0;
+    }
+
+    if (m >= 3)
+      n = str.substr(0, str.length() -1);
+  }
+
+  long long ll = strict_strtoll(n, 10, err);
+  if (ll < 0 && !std::numeric_limits<T>::is_signed) {
+    *err = "strict_sistrtoll: value should not be negative";
+    return 0;
+  }
+  using promoted_t = typename std::common_type<decltype(ll), T>::type;
+  // Build 10^m with integer arithmetic. Going through pow() would do the
+  // range checks and the final scaling in double, which cannot represent
+  // every 64-bit integer and so can round the result or convert an
+  // out-of-range double back to T. The largest scale, 10^18, fits in 63
+  // bits.
+  promoted_t scale = 1;
+  for (int i = 0; i < m; ++i) {
+    scale *= 10;
+  }
+  if (static_cast<promoted_t>(ll) <
+      static_cast<promoted_t>(std::numeric_limits<T>::min()) / scale) {
+    *err = "strict_sistrtoll: value seems to be too small";
+    return 0;
+  }
+  if (static_cast<promoted_t>(ll) >
+      static_cast<promoted_t>(std::numeric_limits<T>::max()) / scale) {
+    *err = "strict_sistrtoll: value seems to be too large";
+    return 0;
+  }
+  // The two checks above guarantee the product fits in T.
+  return static_cast<T>(static_cast<promoted_t>(ll) * scale);
+}
+
+// Explicit instantiations of the string_view overload for the integer
+// types listed here.
+template int strict_si_cast<int>(std::string_view str, std::string *err);
+template long strict_si_cast<long>(std::string_view str, std::string *err);
+template long long strict_si_cast<long long>(std::string_view str, std::string *err);
+template uint64_t strict_si_cast<uint64_t>(std::string_view str, std::string *err);
+template uint32_t strict_si_cast<uint32_t>(std::string_view str, std::string *err);
+
+// strict_si_cast() fixed to a uint64_t result.
+uint64_t strict_sistrtoll(std::string_view str, std::string *err)
+{
+  const uint64_t value = strict_si_cast<uint64_t>(str, err);
+  return value;
+}
+
+// C-string convenience overload of strict_sistrtoll().
+uint64_t strict_sistrtoll(const char *str, std::string *err)
+{
+  const uint64_t value = strict_si_cast<uint64_t>(str, err);
+  return value;
+}
+
+// C-string overload; wraps `str` in a string_view and forwards.
+template<typename T>
+T strict_si_cast(const char *str, std::string *err)
+{
+  const std::string_view view(str);
+  return strict_si_cast<T>(view, err);
+}
+
+// Explicit instantiations of the C-string overload for the same integer
+// types as the string_view overload.
+template int strict_si_cast<int>(const char *str, std::string *err);
+template long strict_si_cast<long>(const char *str, std::string *err);
+template long long strict_si_cast<long long>(const char *str, std::string *err);
+template uint64_t strict_si_cast<uint64_t>(const char *str, std::string *err);
+template uint32_t strict_si_cast<uint32_t>(const char *str, std::string *err);
diff --git a/src/common/strtol.h b/src/common/strtol.h
new file mode 100644
index 00000000..a7c0cc22
--- /dev/null
+++ b/src/common/strtol.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_STRTOL_H
+#define CEPH_COMMON_STRTOL_H
+
+#include <string>
+#include <type_traits>
+extern "C" {
+#include <stdint.h>
+}
+
+// All parsers below report failure the same way: *err receives a message
+// and 0 is returned. On success the scalar parsers clear *err.
+
+// Parse the whole of `str` as an integer in the given base.
+long long strict_strtoll(const char *str, int base, std::string *err);
+
+// As strict_strtoll(), but the value must also fit in an int.
+int strict_strtol(const char *str, int base, std::string *err);
+
+// Parse the whole of `str` as a double.
+double strict_strtod(const char *str, std::string *err);
+
+// Parse the whole of `str` as a float.
+float strict_strtof(const char *str, std::string *err);
+
+// Parse an integer with an optional binary unit suffix (K, M, G, T, P, E
+// scale by 2^10, 2^20, ..., 2^60; B does not scale) into a uint64_t.
+uint64_t strict_iecstrtoll(const char *str, std::string *err);
+
+// As strict_iecstrtoll(), with the result type chosen by the caller.
+template<typename T>
+T strict_iec_cast(const char *str, std::string *err);
+
+// Parse an integer with an optional decimal unit suffix (K, M, G, T, P, E
+// scale by 10^3, 10^6, ..., 10^18) into a uint64_t.
+uint64_t strict_sistrtoll(const char *str, std::string *err);
+
+// As strict_sistrtoll(), with the result type chosen by the caller.
+template<typename T>
+T strict_si_cast(const char *str, std::string *err);
+
+/* On enter buf points to the end of the buffer, e.g. where the least
+ * significant digit of the input number will be printed. Returns pointer to
+ * where the most significant digit were printed, including zero padding.
+ * Does NOT add zero at the end of buffer, this is responsibility of the caller.
+ */
+template<typename T, const unsigned base = 10, const unsigned width = 1>
+static inline
+char* ritoa(T u, char *buf)
+{
+  static_assert(std::is_unsigned<T>::value, "signed types are not supported");
+  // base 0 would divide by zero and base 1 would never terminate.
+  static_assert(base >= 2, "base must be at least 2");
+  static_assert(base <= 16, "extend character map below to support higher bases");
+  unsigned digits = 0;
+  // Emit digits from least to most significant, walking backwards.
+  while (u) {
+    *--buf = "0123456789abcdef"[u % base];
+    u /= base;
+    digits++;
+  }
+  // Left-pad with '0' up to `width`; with the default width of 1 this is
+  // what prints "0" for an input of zero.
+  while (digits++ < width)
+    *--buf = '0';
+  return buf;
+}
+
+#endif
diff --git a/src/common/subsys.h b/src/common/subsys.h
new file mode 100644
index 00000000..18443a04
--- /dev/null
+++ b/src/common/subsys.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+/**
+ * This header describes the subsystems (each one gets a "--debug-<subsystem>"
+ * log verbosity setting), along with their default verbosities.
+ */
+
+// X-macro list: the including file defines SUBSYS(name, log, gather) and
+// DEFAULT_SUBSYS(log, gather) before including this header and undefines
+// them afterwards. The order of the entries determines the values of the
+// generated ceph_subsys_<name> enumerators.
+DEFAULT_SUBSYS(0, 5)
+SUBSYS(lockdep, 0, 1)
+SUBSYS(context, 0, 1)
+SUBSYS(crush, 1, 1)
+SUBSYS(mds, 1, 5)
+SUBSYS(mds_balancer, 1, 5)
+SUBSYS(mds_locker, 1, 5)
+SUBSYS(mds_log, 1, 5)
+SUBSYS(mds_log_expire, 1, 5)
+SUBSYS(mds_migrator, 1, 5)
+SUBSYS(buffer, 0, 1)
+SUBSYS(timer, 0, 1)
+SUBSYS(filer, 0, 1)
+SUBSYS(striper, 0, 1)
+SUBSYS(objecter, 0, 1)
+SUBSYS(rados, 0, 5)
+SUBSYS(rbd, 0, 5)
+SUBSYS(rbd_mirror, 0, 5)
+SUBSYS(rbd_replay, 0, 5)
+SUBSYS(journaler, 0, 5)
+SUBSYS(objectcacher, 0, 5)
+SUBSYS(client, 0, 5)
+SUBSYS(osd, 1, 5)
+SUBSYS(optracker, 0, 5)
+SUBSYS(objclass, 0, 5)
+SUBSYS(filestore, 1, 3)
+SUBSYS(journal, 1, 3)
+SUBSYS(ms, 0, 0)
+SUBSYS(mon, 1, 5)
+SUBSYS(monc, 0, 10)
+SUBSYS(paxos, 1, 5)
+SUBSYS(tp, 0, 5)
+SUBSYS(auth, 1, 5)
+SUBSYS(crypto, 1, 5)
+SUBSYS(finisher, 1, 1)
+SUBSYS(reserver, 1, 1)
+SUBSYS(heartbeatmap, 1, 5)
+SUBSYS(perfcounter, 1, 5)
+SUBSYS(rgw, 1, 5)                 // log level for the Rados gateway
+SUBSYS(rgw_sync, 1, 5)
+SUBSYS(civetweb, 1, 10)
+SUBSYS(javaclient, 1, 5)
+SUBSYS(asok, 1, 5)
+SUBSYS(throttle, 1, 1)
+SUBSYS(refs, 0, 0)
+SUBSYS(xio, 1, 5)
+SUBSYS(compressor, 1, 5)
+SUBSYS(bluestore, 1, 5)
+SUBSYS(bluefs, 1, 5)
+SUBSYS(bdev, 1, 3)
+SUBSYS(kstore, 1, 5)
+SUBSYS(rocksdb, 4, 5)
+SUBSYS(leveldb, 4, 5)
+SUBSYS(memdb, 4, 5)
+SUBSYS(kinetic, 1, 5)
+SUBSYS(fuse, 1, 5)
+SUBSYS(mgr, 1, 5)
+SUBSYS(mgrc, 1, 5)
+SUBSYS(dpdk, 1, 5)
+SUBSYS(eventtrace, 1, 5)
+SUBSYS(prioritycache, 1, 5)
diff --git a/src/common/subsys_types.h b/src/common/subsys_types.h
new file mode 100644
index 00000000..52171809
--- /dev/null
+++ b/src/common/subsys_types.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SUBSYS_TYPES_H
+#define CEPH_SUBSYS_TYPES_H
+
+#include <algorithm>
+#include <array>
+
+// Subsystem identifiers. `ceph_subsys_` (value 0) stands for the default
+// subsystem; one `ceph_subsys_<name>` enumerator is generated for every
+// SUBSYS() line of common/subsys.h, in file order. DEFAULT_SUBSYS expands
+// to nothing here. `ceph_subsys_max` follows the last generated
+// enumerator and therefore equals the total number of identifiers.
+enum ceph_subsys_id_t {
+  ceph_subsys_,   // default
+#define SUBSYS(name, log, gather) \
+  ceph_subsys_##name,
+#define DEFAULT_SUBSYS(log, gather)
+#include "common/subsys.h"
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+  ceph_subsys_max
+};
+
+// Number of subsystem identifiers, i.e. ceph_subsys_max as a size_t.
+constexpr static std::size_t ceph_subsys_get_num() {
+  constexpr auto count = static_cast<std::size_t>(ceph_subsys_max);
+  return count;
+}
+
+// One row of the subsystem table built from common/subsys.h.
+struct ceph_subsys_item_t {
+  const char* name;       // stringified subsystem name ("none" for the default)
+  uint8_t log_level;      // second-to-last SUBSYS()/DEFAULT_SUBSYS() argument
+  uint8_t gather_level;   // last SUBSYS()/DEFAULT_SUBSYS() argument
+};
+
+// Expands common/subsys.h into an array holding one ceph_subsys_item_t per
+// line of that header, in file order. The DEFAULT_SUBSYS line becomes an
+// item named "none"; every SUBSYS line becomes an item named after its
+// first argument.
+constexpr static std::array<ceph_subsys_item_t, ceph_subsys_get_num()>
+ceph_subsys_get_as_array() {
+#define SUBSYS(name, log, gather) \
+  ceph_subsys_item_t{ #name, log, gather },
+#define DEFAULT_SUBSYS(log, gather) \
+  ceph_subsys_item_t{ "none", log, gather },
+
+  return {
+#include "common/subsys.h"
+  };
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+}
+
+// The larger of the log and gather levels of the table entry at `subidx`.
+constexpr static std::uint8_t
+ceph_subsys_get_max_default_level(const std::size_t subidx) {
+  const ceph_subsys_item_t entry = ceph_subsys_get_as_array()[subidx];
+  if (entry.log_level < entry.gather_level) {
+    return entry.gather_level;
+  }
+  return entry.log_level;
+}
+
+// Compile time-capable version of std::strlen. Resorting to own
+// implementation only because C++17 doesn't mandate constexpr
+// on the standard one.
+constexpr static std::size_t strlen_ct(const char* const s) {
+  // Walk a cursor to the terminating NUL; the distance covered is the
+  // length.
+  const char* cursor = s;
+  while (*cursor != '\0') {
+    ++cursor;
+  }
+  return static_cast<std::size_t>(cursor - s);
+}
+
+// Length of the longest subsystem name: the maximum of strlen_ct() over
+// the stringified name of every SUBSYS() line in common/subsys.h and over
+// "none" for the DEFAULT_SUBSYS line.
+constexpr static std::size_t ceph_subsys_max_name_length() {
+  return std::max({
+#define SUBSYS(name, log, gather) \
+  strlen_ct(#name),
+#define DEFAULT_SUBSYS(log, gather) \
+  strlen_ct("none"),
+#include "common/subsys.h"
+#undef SUBSYS
+#undef DEFAULT_SUBSYS
+  });
+}
+
+#endif // CEPH_SUBSYS_TYPES_H
+
diff --git a/src/common/sync_filesystem.h b/src/common/sync_filesystem.h
new file mode 100644
index 00000000..f457f655
--- /dev/null
+++ b/src/common/sync_filesystem.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_SYNC_FILESYSTEM_H
+#define CEPH_SYNC_FILESYSTEM_H
+
+#include <unistd.h>
+
+#if defined(__linux__)
+#include <sys/ioctl.h>
+#include <syscall.h>
+#include "os/fs/btrfs_ioctl.h"
+#endif
+
+/* Flush the filesystem that `fd` lives on.
+ *
+ * Returns 0 on success, or -errno when the per-filesystem sync fails for
+ * a reason other than ENOSYS. */
+inline int sync_filesystem(int fd)
+{
+  /* Prefer a per-filesystem sync: syncfs() when HAVE_SYS_SYNCFS is
+   * defined, otherwise the raw syscall under whichever number macro is
+   * available. */
+#if defined(HAVE_SYS_SYNCFS) || defined(SYS_syncfs) || defined(__NR_syncfs)
+#  if defined(HAVE_SYS_SYNCFS)
+  const long rc = syncfs(fd);
+#  elif defined(SYS_syncfs)
+  const long rc = syscall(SYS_syncfs, fd);
+#  else
+  const long rc = syscall(__NR_syncfs, fd);
+#  endif
+  if (rc == 0)
+    return 0;
+  if (errno != ENOSYS)
+    return -errno;
+#endif
+
+  /* Either no syncfs variant exists at build time or the call reported
+   * ENOSYS: fall back on sync(), which synchronizes every filesystem on
+   * the computer. */
+  sync();
+  return 0;
+}
+
+#endif
diff --git a/src/common/tracked_int_ptr.hpp b/src/common/tracked_int_ptr.hpp
new file mode 100644
index 00000000..fc54c4cb
--- /dev/null
+++ b/src/common/tracked_int_ptr.hpp
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_TRACKEDINTPTR_H
+#define CEPH_TRACKEDINTPTR_H
+
+
+// Pointer wrapper that pairs a raw T* with an id.
+//
+// Whenever a non-null pointer is adopted, get_with_id(ptr) is called and
+// the value it returns is remembered; when that pointer is released, the
+// same id is handed back through put_with_id(ptr, id). A null pointer
+// always carries id 0.
+template <class T>
+class TrackedIntPtr {
+  T *ptr;
+  uint64_t id;
+public:
+  TrackedIntPtr() : ptr(nullptr), id(0) {}
+  TrackedIntPtr(T *ptr) : ptr(ptr), id(ptr ? get_with_id(ptr) : 0) {}
+  // Copying obtains a fresh id for the same pointee. `ptr` is declared
+  // before `id`, so it is already initialised when `id` is computed.
+  TrackedIntPtr(const TrackedIntPtr &rhs) :
+    ptr(rhs.ptr), id(ptr ? get_with_id(ptr) : 0) {}
+
+  ~TrackedIntPtr() {
+    if (!ptr) {
+      ceph_assert(id == 0);
+      return;
+    }
+    put_with_id(ptr, id);
+  }
+
+  // Exchange pointer and id with `other`; no get/put calls are made.
+  void swap(TrackedIntPtr &other) {
+    T *const my_ptr = ptr;
+    const uint64_t my_id = id;
+    ptr = other.ptr;
+    id = other.id;
+    other.ptr = my_ptr;
+    other.id = my_id;
+  }
+
+  // Copy-and-swap: the temporary takes a new id for rhs's pointee, and
+  // releases this object's previous pointee when it goes out of scope.
+  TrackedIntPtr& operator=(const TrackedIntPtr &rhs) {
+    TrackedIntPtr replacement(rhs.ptr);
+    swap(replacement);
+    return *this;
+  }
+
+  T &operator*() const {
+    return *ptr;
+  }
+  T *operator->() const {
+    return ptr;
+  }
+  T *get() const { return ptr; }
+
+  operator bool() const {
+    return ptr != nullptr;
+  }
+  // Ordering and equality compare the raw pointers only, never the ids.
+  bool operator<(const TrackedIntPtr &lhs) const {
+    return ptr < lhs.ptr;
+  }
+  bool operator==(const TrackedIntPtr &lhs) const {
+    return ptr == lhs.ptr;
+  }
+
+  // Release the current pointee, if any, and become null.
+  void reset() {
+    if (ptr) {
+      put_with_id(ptr, id);
+    }
+    ptr = nullptr;
+    id = 0;
+  }
+};
+
+#endif
diff --git a/src/common/types.cc b/src/common/types.cc
new file mode 100644
index 00000000..36861329
--- /dev/null
+++ b/src/common/types.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ * 
+ */
+#ifndef __CEPH_TYPES_H
+#define __CEPH_TYPES_H
+
+#include <include/types.h>
+
+#ifndef UINT8_MAX
+#define UINT8_MAX (255)
+#endif
+
+const shard_id_t shard_id_t::NO_SHARD(-1);
+
+// Stream a shard id as a number: the id is narrowed to uint8_t and then
+// widened to unsigned so it is not printed as a character.
+ostream &operator<<(ostream &lhs, const shard_id_t &rhs)
+{
+  const uint8_t raw = (uint8_t)rhs.id;
+  lhs << (unsigned)raw;
+  return lhs;
+}
+
+#endif
diff --git a/src/common/url_escape.cc b/src/common/url_escape.cc
new file mode 100644
index 00000000..6580d28c
--- /dev/null
+++ b/src/common/url_escape.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "url_escape.h"
+
+#include <stdexcept>
+#include <sstream>
+
+// Percent-encode a string.
+//
+// Characters for which std::isalnum() is true, plus '-', '.', '_', '~' and
+// '/', are copied through unchanged; every other byte is emitted as "%xx"
+// with two lowercase hex digits.
+std::string url_escape(const std::string& s)
+{
+  std::string out;
+  for (auto c : s) {
+    // The <cctype> classifiers take an int that must be representable as
+    // unsigned char (or be EOF); handing them a negative plain char is
+    // undefined, so convert once up front.
+    const unsigned char uc = static_cast<unsigned char>(c);
+    if (std::isalnum(uc) || c == '-' || c == '.' || c == '_' || c == '~' ||
+	c == '/') {
+      out.push_back(c);
+    } else {
+      char t[4];  // '%' + two hex digits + NUL
+      snprintf(t, sizeof(t), "%%%02x", (int)uc);
+      out.append(t);
+    }
+  }
+  return out;
+}
+
+// Decode a percent-encoded string: each "%xx" (hex digits of either case)
+// becomes the byte it encodes and every other character is copied verbatim.
+// Throws std::runtime_error when a '%' is not followed by two hex digits;
+// the message names the offending position and the whole input.
+std::string url_unescape(const std::string& s)
+{
+  std::string out;
+  const std::string::size_type len = s.size();
+  for (std::string::size_type pos = 0; pos < len; ++pos) {
+    if (s[pos] != '%') {
+      out.push_back(s[pos]);
+      continue;
+    }
+
+    unsigned char v = 0;
+    for (unsigned digit = 0; digit < 2; ++digit) {
+      ++pos;
+      int nibble = -1;  // stays negative if the digit is missing or not hex
+      if (pos < len) {
+	const char h = s[pos];
+	if (h >= '0' && h <= '9') {
+	  nibble = h - '0';
+	} else if (h >= 'a' && h <= 'f') {
+	  nibble = h - 'a' + 10;
+	} else if (h >= 'A' && h <= 'F') {
+	  nibble = h - 'A' + 10;
+	}
+      }
+      if (nibble < 0) {
+	std::ostringstream ss;
+	ss << "invalid escaped string at pos " << pos << " of '"
+	   << s << "'";
+	throw std::runtime_error(ss.str());
+      }
+      v = static_cast<unsigned char>((v << 4) + nibble);
+    }
+    out.push_back(v);
+  }
+  return out;
+}
diff --git a/src/common/url_escape.h b/src/common/url_escape.h
new file mode 100644
index 00000000..3cb539b1
--- /dev/null
+++ b/src/common/url_escape.h
@@ -0,0 +1,9 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
+
+extern std::string url_escape(const std::string& s);
+extern std::string url_unescape(const std::string& s);
diff --git a/src/common/utf8.c b/src/common/utf8.c
new file mode 100644
index 00000000..9b7aaf5f
--- /dev/null
+++ b/src/common/utf8.c
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+
+#include <string.h>
+
+/* Count the consecutive 1 bits found by testing bit 7 of 'c' and shifting
+ * 'c' left until that bit is clear (e.g. 0xE2 -> 3, 0x7f -> 0).
+ */
+static int high_bits_set(int c)
+{
+	int count = 0;
+	for (; (c & 0x80) == 0x80; c <<= 1)
+		++count;
+	return count;
+}
+
+/* Encode a 31-bit UTF8 code point to 'buf'.
+ * Assumes buf is of size MAX_UTF8_SZ
+ * Returns -1 on failure; number of bytes in the encoded value otherwise.
+ */
+int encode_utf8(unsigned long u, unsigned char *buf)
+{
+	/* limit[k] is the largest code point that fits in k + 1 bytes. */
+	static const unsigned long limit[MAX_UTF8_SZ] = {
+		0x0000007ful, 0x000007fful, 0x0000fffful,
+		0x001ffffful, 0x03fffffful, 0x7ffffffful
+	};
+	static const int n_limits = sizeof(limit)/sizeof(limit[0]);
+	int extra = 0;	/* number of continuation bytes needed */
+	int pos;
+
+	while (extra < n_limits && u > limit[extra])
+		++extra;
+	if (extra == n_limits) {
+		/* This code point is too big to encode. */
+		return -1;
+	}
+
+	if (extra == 0) {
+		/* Single byte: stored as-is. */
+		buf[0] = u;
+		return 1;
+	}
+
+	/* Fill the continuation bytes back to front, six payload bits each. */
+	for (pos = extra; pos > 0; --pos) {
+		buf[pos] = 0x80 | (u & 0x3f);
+		u >>= 6;
+	}
+
+	/* Lead byte: (extra + 1) high bits set, then the remaining payload. */
+	buf[0] = (unsigned char)~(0xFF >> (extra + 1)) | u;
+
+	return extra + 1;
+}
+
+/*
+ * Decode a UTF8 character from an array of bytes. Return character code.
+ * Upon error, return INVALID_UTF8_CHAR.
+ *
+ * 'nbytes' must equal the length announced by the lead byte.  The values
+ * 0xFFFE, 0xFFFF and 0xD800..0xDFFF are rejected.
+ */
+unsigned long decode_utf8(unsigned char *buf, int nbytes)
+{
+	unsigned long cp;
+	int lead_bits, k;
+
+	if (nbytes <= 0)
+		return INVALID_UTF8_CHAR;
+
+	/* A lone byte is only valid below 0x80. */
+	if (nbytes == 1)
+		return (buf[0] < 0x80) ? buf[0] : INVALID_UTF8_CHAR;
+
+	/* The run of high bits in the lead byte gives the sequence length. */
+	lead_bits = high_bits_set(buf[0]);
+	if (lead_bits != nbytes)
+		return INVALID_UTF8_CHAR;
+
+	cp = buf[0] & (0xff >> lead_bits);
+	for (k = 1; k < nbytes; ++k) {
+		/* Continuation bytes must look like 10xxxxxx. */
+		if ((buf[k] & 0xc0) != 0x80)
+			return INVALID_UTF8_CHAR;
+		cp = (cp << 6) | (buf[k] & 0x3f);
+	}
+
+	/* Check for invalid code points */
+	if (cp == 0xFFFE || cp == 0xFFFF || (cp >= 0xD800 && cp <= 0xDFFF))
+		return INVALID_UTF8_CHAR;
+
+	return cp;
+}
+
+/* Checks whether the 'len' bytes at 'buf' are valid UTF-8.
+ *
+ * Bytes are gathered into one sequence at a time; when the next character
+ * starts (or the buffer ends) the gathered sequence is decoded, re-encoded
+ * and compared with the original bytes, so sequences that decode to an
+ * invalid code point or are not in their shortest form are rejected.
+ *
+ * Returns 0 if the buffer is valid.  Otherwise returns one plus the offset
+ * at which the problem was detected: the byte following the bad sequence
+ * (possibly 'len'), or the continuation byte that made a sequence too long.
+ */
+int check_utf8(const char *buf, int len)
+{
+	unsigned char u[MAX_UTF8_SZ];
+	int enc_len = 0;
+	int i = 0;
+	while (1) {
+		/* Only read buf[i] while it is inside the buffer; at i == len
+		 * all that is left is to flush the pending sequence.  Going
+		 * through unsigned char keeps c in 0..255 regardless of the
+		 * signedness of plain char. */
+		unsigned int c = (i < len) ? (unsigned char)buf[i] : 0;
+		if (i >= len || c < 0x80 || (c & 0xC0) != 0x80) {
+			/* The start of a new character (or end of input).
+			 * Process what we have in the buffer. */
+			if (enc_len > 0) {
+				int re_encoded_len;
+				unsigned char re_encoded[MAX_UTF8_SZ];
+				unsigned long code = decode_utf8(u, enc_len);
+				if (code == INVALID_UTF8_CHAR)
+					return i + 1;
+				re_encoded_len = encode_utf8(code, re_encoded);
+				/* A different length or different bytes means
+				 * the input was not the canonical encoding. */
+				if (enc_len != re_encoded_len)
+					return i + 1;
+				if (memcmp(u, re_encoded, enc_len) != 0)
+					return i + 1;
+			}
+			enc_len = 0;
+			if (i >= len)
+				break;
+			/* A lead byte opens a new multi-byte sequence. */
+			if (c >= 0x80)
+				u[enc_len++] = c;
+		} else {
+			/* Continuation byte: append unless the sequence is
+			 * already at the maximum length. */
+			if (enc_len == MAX_UTF8_SZ)
+				return i + 1;
+			u[enc_len++] = c;
+		}
+		++i;
+	}
+	return 0;
+}
+
+/* Validate a NUL-terminated string: passes 'buf' and its strlen() to
+ * check_utf8() and returns that function's result.
+ */
+int check_utf8_cstr(const char *buf)
+{
+	return check_utf8(buf, strlen(buf));
+}
+
+/* Returns 1 if 'c' is 0x7f or any non-zero value below 0x20 (which, as
+ * written, also covers negative values); returns 0 otherwise.
+ */
+int is_control_character(int c)
+{
+	if (c == 0x7f)
+		return 1;
+	if (c == 0)
+		return 0;
+	return c < 0x20;
+}
+
+/* Scan the first 'len' bytes of 'buf' for a control character (as defined
+ * by is_control_character()).  Returns 0 if none is found, otherwise one
+ * plus the offset of the first one.
+ */
+int check_for_control_characters(const char *buf, int len)
+{
+	const unsigned char *bytes = (const unsigned char *)buf;
+	int off = 0;
+
+	while (off < len) {
+		if (is_control_character((int)bytes[off]))
+			return off + 1;
+		++off;
+	}
+	return 0;
+}
+
+/* NUL-terminated variant: passes 'buf' and its strlen() to
+ * check_for_control_characters() and returns that function's result.
+ */
+int check_for_control_characters_cstr(const char *buf)
+{
+	return check_for_control_characters(buf, strlen(buf));
+}
diff --git a/src/common/utf8.h b/src/common/utf8.h
new file mode 100644
index 00000000..83efe6fd
--- /dev/null
+++ b/src/common/utf8.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_UTF8_H
+#define CEPH_COMMON_UTF8_H
+
+#define MAX_UTF8_SZ 6
+#define INVALID_UTF8_CHAR 0xfffffffful
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Checks if a buffer is valid UTF-8.
+ * Returns 0 if it is, and one plus the offset of the first invalid byte
+ * if it is not.
+ */
+int check_utf8(const char *buf, int len);
+
+/* Checks if a null-terminated string is valid UTF-8.
+ * Returns 0 if it is, and one plus the offset of the first invalid byte
+ * if it is not.
+ */
+int check_utf8_cstr(const char *buf);
+
+/* Returns true if 'ch' is a control character.
+ * We do count newline as a control character, but not NUL ('\0').
+ */
+int is_control_character(int ch);
+
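+/* Checks if a buffer contains control characters.
+ * Returns 0 if it does not, and one plus the offset of the first control
+ * character if it does.
+ */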
+int check_for_control_characters(const char *buf, int len);
+
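+/* Checks if a null-terminated string contains control characters.
+ * Returns 0 if it does not, and one plus the offset of the first control
+ * character if it does.
+ */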
+int check_for_control_characters_cstr(const char *buf);
+
+/* Encode a 31-bit UTF8 code point to 'buf'.
+ * Assumes buf is of size MAX_UTF8_SZ
+ * Returns -1 on failure; number of bytes in the encoded value otherwise.
+ */
+int encode_utf8(unsigned long u, unsigned char *buf);
+
+/*
+ * Decode a UTF8 character from an array of bytes. Return character code.
+ * Upon error, return INVALID_UTF8_CHAR.
+ */
+unsigned long decode_utf8(unsigned char *buf, int nbytes);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/common/util.cc b/src/common/util.cc
new file mode 100644
index 00000000..f816ff41
--- /dev/null
+++ b/src/common/util.cc
@@ -0,0 +1,364 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Inktank Storage, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <sys/utsname.h>
+#include <fstream>
+#include <boost/algorithm/string.hpp>
+
+#include "include/compat.h"
+#include "include/util.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/version.h"
+
+#ifdef HAVE_SYS_VFS_H
+#include <sys/vfs.h>
+#endif
+
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <sys/param.h>
+#include <sys/mount.h>
+#if defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+#endif
+
+#include <string>
+
+#include <stdio.h>
+
+// Fill 'stats' with the figures statfs() reports for the filesystem that
+// contains 'path': total, used and available bytes, and the available
+// share of the total as a percentage.
+//
+// Returns 0 on success, -EINVAL if 'path' is null, or -errno if statfs()
+// fails.
+int get_fs_stats(ceph_data_stats_t &stats, const char *path)
+{
+  if (!path)
+    return -EINVAL;
+
+  struct statfs stbuf;
+  int err = ::statfs(path, &stbuf);
+  if (err < 0) {
+    return -errno;
+  }
+
+  stats.byte_total = stbuf.f_blocks * stbuf.f_bsize;
+  stats.byte_used = (stbuf.f_blocks - stbuf.f_bfree) * stbuf.f_bsize;
+  stats.byte_avail = stbuf.f_bavail * stbuf.f_bsize;
+  // A filesystem can report zero blocks; dividing by that total would
+  // produce a NaN/inf that cannot be stored meaningfully, so report 0%.
+  if (stats.byte_total != 0) {
+    stats.avail_percent = (((float)stats.byte_avail/stats.byte_total)*100);
+  } else {
+    stats.avail_percent = 0;
+  }
+  return 0;
+}
+
+// Trim leading and trailing whitespace and double quotes from 'value',
+// in place.  Returns a pointer to the first kept character (inside the
+// caller's buffer); the buffer is NUL-terminated right after the last kept
+// character.
+static char* value_sanitize(char *value)
+{
+  // The <ctype.h> classifiers need a value representable as unsigned char;
+  // a plain char holding a byte >= 0x80 may be negative, so convert first.
+  while (isspace((unsigned char)*value) || *value == '"')
+    value++;
+
+  // Walk back from the end by length rather than by pointer so an empty
+  // remainder never forms a pointer before the start of the string.
+  size_t len = strlen(value);
+  while (len > 0 &&
+	 (isspace((unsigned char)value[len - 1]) || value[len - 1] == '"'))
+    len--;
+
+  value[len] = '\0';
+
+  return value;
+}
+
+// If 'buf' begins with 'prefix', store the sanitized remainder of the line
+// in (*pm)[key] and return true; otherwise leave the map alone and return
+// false.  'buf' is modified in place by the sanitizing step.
+static bool value_set(char *buf, const char *prefix,
+		      map<string, string> *pm, const char *key)
+{
+  const size_t prefix_len = strlen(prefix);
+  if (strncmp(buf, prefix, prefix_len) != 0) {
+    return false;
+  }
+
+  char *cleaned = value_sanitize(buf + prefix_len);
+  (*pm)[key] = cleaned;
+  return true;
+}
+
+// Read 'fp' line by line and, for every (key, prefix) pair in 'kvm', store
+// the sanitized text following 'prefix' in (*m)[key] when the line starts
+// with that prefix.  Each line is offered to every prefix in turn.  'cct'
+// is not used here.
+static void file_values_parse(const map<string, string>& kvm, FILE *fp, map<string, string> *m, CephContext *cct) {
+  char line[512];
+  while (fgets(line, sizeof(line) - 1, fp) != NULL) {
+    for (const auto& entry : kvm) {
+      value_set(line, entry.second.c_str(), m, entry.first.c_str());
+    }
+  }
+}
+
+// Collect distribution details into *m under the keys "distro",
+// "distro_description" and "distro_version".
+//
+// Linux: the values come from the ID=, PRETTY_NAME= and VERSION_ID= lines of
+// /etc/os-release; returns false (after logging) if that file cannot be
+// opened.  FreeBSD: the values come from uname().  On every other path the
+// function returns true, including when uname() fails or on platforms where
+// neither branch is compiled in.
+static bool os_release_parse(map<string, string> *m, CephContext *cct)
+{
+#if defined(__linux__)
+  // map key -> line prefix to look for in /etc/os-release
+  static const map<string, string> kvm = {
+    { "distro", "ID=" },
+    { "distro_description", "PRETTY_NAME=" },
+    { "distro_version", "VERSION_ID=" }
+  };
+
+  FILE *fp = fopen("/etc/os-release", "r");
+  if (!fp) {
+    int ret = -errno;
+    lderr(cct) << "os_release_parse - failed to open /etc/os-release: " << cpp_strerror(ret) << dendl;
+    return false;
+  }
+
+  file_values_parse(kvm, fp, m, cct);
+
+  fclose(fp);
+#elif defined(__FreeBSD__)
+  struct utsname u;
+  int r = uname(&u);
+  if (!r) {
+     // insert() leaves any value already stored under these keys untouched
+     m->insert(std::make_pair("distro", u.sysname));
+     m->insert(std::make_pair("distro_description", u.version));
+     m->insert(std::make_pair("distro_version", u.release));
+  }
+#endif
+
+  return true;
+}
+
+// Fill *m with distribution details via os_release_parse() and log an error
+// for a failed parse and for each of "distro" / "distro_description" that
+// is still missing from the map afterwards.
+static void distro_detect(map<string, string> *m, CephContext *cct)
+{
+  const bool parsed = os_release_parse(m, cct);
+  if (!parsed) {
+    lderr(cct) << "distro_detect - /etc/os-release is required" << dendl;
+  }
+
+  const char* const required_keys[] = {"distro", "distro_description"};
+  for (const char* rk : required_keys) {
+    if (m->count(rk) == 0) {
+      lderr(cct) << "distro_detect - can't detect " << rk << dendl;
+    }
+  }
+}
+
+// Read the memory limit from
+// PROCPREFIX "/sys/fs/cgroup/memory/memory.limit_in_bytes" into *limit.
+//
+// Returns 0 on success, -errno if the file cannot be opened, or -EINVAL if
+// no number can be read from its first line.  *limit is written only on
+// success; it is set to 0 when the file holds the "no limit" magic value.
+int get_cgroup_memory_limit(uint64_t *limit)
+{
+  // /sys/fs/cgroup/memory/memory.limit_in_bytes
+
+  // the magic value 9223372036854771712 or 0x7ffffffffffff000
+  // appears to mean no limit.
+  FILE *f = fopen(PROCPREFIX "/sys/fs/cgroup/memory/memory.limit_in_bytes", "r");
+  if (!f) {
+    return -errno;
+  }
+  char buf[100];
+  int ret = -EINVAL;
+  long long value = 0;
+  // Only touch 'value' and *limit once a number has actually been parsed;
+  // a failed sscanf() leaves 'value' without a meaningful result.
+  if (fgets(buf, sizeof(buf), f) && sscanf(buf, "%lld", &value) == 1) {
+    if (value == 0x7ffffffffffff000) {
+      *limit = 0;  // no limit
+    } else {
+      *limit = value;
+    }
+    ret = 0;
+  }
+  fclose(f);
+  return ret;
+}
+
+
+// Populate *m with key/value metadata describing this process's
+// environment: ceph version strings, uname() fields, container hints taken
+// from environment variables, memory and CPU details (sysctl on macOS,
+// PROCPREFIX "/proc" files elsewhere) and distro information via
+// distro_detect().
+void collect_sys_info(map<string, string> *m, CephContext *cct)
+{
+  // version
+  (*m)["ceph_version"] = pretty_version_to_str();
+  (*m)["ceph_version_short"] = ceph_version_to_str();
+  (*m)["ceph_release"] = ceph_release_to_str();
+
+  // kernel info
+  struct utsname u;
+  int r = uname(&u);
+  if (r >= 0) {
+    (*m)["os"] = u.sysname;
+    (*m)["kernel_version"] = u.release;
+    (*m)["kernel_description"] = u.version;
+    (*m)["hostname"] = u.nodename;
+    (*m)["arch"] = u.machine;
+  }
+
+  // but wait, am i in a container?
+  // Any of POD_NAME, CONTAINER_NAME or CONTAINER_IMAGE being set counts.
+  bool in_container = false;
+
+  if (const char *pod_name = getenv("POD_NAME")) {
+    (*m)["pod_name"] = pod_name;
+    in_container = true;
+  }
+  if (const char *container_name = getenv("CONTAINER_NAME")) {
+    (*m)["container_name"] = container_name;
+    in_container = true;
+  }
+  if (const char *container_image = getenv("CONTAINER_IMAGE")) {
+    (*m)["container_image"] = container_image;
+    in_container = true;
+  }
+  if (in_container) {
+    // NODE_NAME, when set, replaces the uname() hostname; the uname() value
+    // is kept under "container_hostname".
+    // NOTE(review): u.nodename is read here even if uname() failed above.
+    if (const char *node_name = getenv("NODE_NAME")) {
+      (*m)["container_hostname"] = u.nodename;
+      (*m)["hostname"] = node_name;
+    }
+    if (const char *ns = getenv("POD_NAMESPACE")) {
+      (*m)["pod_namespace"] = ns;
+    }
+  }
+
+#ifdef __APPLE__
+  // memory
+  // NOTE(review): the raw sysctl values are stored under "_kb" keys without
+  // any conversion -- confirm the units.
+  {
+    uint64_t size;
+    size_t len = sizeof(size);
+    r = sysctlbyname("hw.memsize", &size, &len, NULL, 0);
+    if (r == 0) {
+      (*m)["mem_total_kb"] = std::to_string(size);
+    }
+  }
+  {
+    xsw_usage vmusage;
+    size_t len = sizeof(vmusage);
+    r = sysctlbyname("vm.swapusage", &vmusage, &len, NULL, 0);
+    if (r == 0) {
+      (*m)["mem_swap_kb"] = std::to_string(vmusage.xsu_total);
+    }
+  }
+  // processor
+  {
+    char buf[100];
+    size_t len = sizeof(buf);
+    r = sysctlbyname("machdep.cpu.brand_string", buf, &len, NULL, 0);
+    if (r == 0) {
+      // force termination at the last byte of the reported length
+      buf[len - 1] = '\0';
+      (*m)["cpu"] = buf;
+    }
+  }
+#else
+  // memory
+  if (std::ifstream f{PROCPREFIX "/proc/meminfo"}; !f.fail()) {
+    for (std::string line; std::getline(f, line); ) {
+      // Only lines that split into exactly three fields on ':', tab or
+      // space (with runs of separators compressed) are considered; the
+      // middle field is stored as the value.
+      std::vector<string> parts;
+      boost::split(parts, line, boost::is_any_of(":\t "), boost::token_compress_on);
+      if (parts.size() != 3) {
+	continue;
+      }
+      if (parts[0] == "MemTotal") {
+	(*m)["mem_total_kb"] = parts[1];
+      } else if (parts[0] == "SwapTotal") {
+	(*m)["mem_swap_kb"] = parts[1];
+      }
+    }
+  }
+  // A cgroup limit is recorded only when it was read successfully and is
+  // non-zero (0 is what get_cgroup_memory_limit() reports for "no limit").
+  uint64_t cgroup_limit;
+  if (get_cgroup_memory_limit(&cgroup_limit) == 0 &&
+      cgroup_limit > 0) {
+    (*m)["mem_cgroup_limit"] = std::to_string(cgroup_limit);
+  }
+
+  // processor
+  if (std::ifstream f{PROCPREFIX "/proc/cpuinfo"}; !f.fail()) {
+    for (std::string line; std::getline(f, line); ) {
+      // "key : value" lines only; anything with more or fewer ':' is skipped
+      std::vector<string> parts;
+      boost::split(parts, line, boost::is_any_of(":"));
+      if (parts.size() != 2) {
+	continue;
+      }
+      boost::trim(parts[0]);
+      boost::trim(parts[1]);
+      // the first "model name" entry wins
+      if (parts[0] == "model name") {
+	(*m)["cpu"] = parts[1];
+	break;
+      }
+    }
+  }
+#endif
+  // distro info
+  distro_detect(m, cct);
+}
+
+// Dump 'services' (host name -> list of integer ids) to 'f' as one object
+// section named 'type' that holds an array section per host; every id is
+// emitted with dump_int() under the name 'type'.  'f' must not be null.
+void dump_services(Formatter* f, const map<string, list<int> >& services, const char* type)
+{
+  ceph_assert(f);
+
+  f->open_object_section(type);
+  for (const auto& [hostname, ids] : services) {
+    f->open_array_section(hostname.c_str());
+    for (const int id : ids) {
+      f->dump_int(type, id);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Dump 'services' (host name -> list of names) to 'f' as one object section
+// named 'type' that holds an array section per host; every entry is emitted
+// with dump_string() under the name 'type'.  'f' must not be null.
+void dump_services(Formatter* f, const map<string, list<string> >& services, const char* type)
+{
+  ceph_assert(f);
+
+  f->open_object_section(type);
+  for (const auto& [hostname, names] : services) {
+    f->open_array_section(hostname.c_str());
+    for (const auto& name : names) {
+      f->dump_string(type, name);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Return the contents of 'bl' as a string that is safe to print.
+//
+// If no byte of 'bl' is a control character (per iscntrl()), the bytes are
+// returned unchanged and 'base64' is set to false.  Otherwise the contents
+// are base64-encoded, 'base64' is set to true, and the result is prefixed
+// with "Base64:" when 'show' is true.
+string cleanbin(bufferlist &bl, bool &base64, bool show)
+{
+  bufferlist::iterator it;
+  for (it = bl.begin(); it != bl.end(); ++it) {
+    // iscntrl() requires a value representable as unsigned char; convert
+    // explicitly so bytes >= 0x80 are never passed as negative values
+    // (assumes *it yields a plain, possibly signed, char).
+    if (iscntrl((unsigned char)*it))
+      break;
+  }
+  if (it == bl.end()) {
+    base64 = false;
+    string result(bl.c_str(), bl.length());
+    return result;
+  }
+
+  bufferlist b64;
+  bl.encode_base64(b64);
+  string encoded(b64.c_str(), b64.length());
+  if (show)
+    encoded = "Base64:" + encoded;
+  base64 = true;
+  return encoded;
+}
+
+// String convenience wrapper: returns 'str' unchanged when it holds no
+// control characters, otherwise "Base64:" followed by its base64 encoding.
+string cleanbin(string &str)
+{
+  bufferlist bl;
+  bl.append(str);
+  bool was_encoded;  // required by the bufferlist overload; not reported
+  return cleanbin(bl, was_encoded, true);
+}
+
+// Format a byte count with a binary-scaled unit suffix: the value is
+// divided by 1024 (truncating) until it drops below 1024 or the largest
+// unit is reached, then printed as "<n><unit>B" with unit one of
+// "", k, M, G, T, P, E.
+std::string bytes2str(uint64_t count) {
+  static const char* const units[] = {"", "k", "M", "G", "T", "P", "E"};
+  static const int last_unit = sizeof(units) / sizeof(units[0]) - 1;
+
+  int unit = 0;
+  for (; count >= 1024 && unit < last_unit; ++unit) {
+    count >>= 10;
+  }
+
+  char text[128];
+  snprintf(text, sizeof text, "%" PRIu64 "%sB", count, units[unit]);
+  return std::string(text);
+}
diff --git a/src/common/valgrind.h b/src/common/valgrind.h
new file mode 100644
index 00000000..1faa9cd8
--- /dev/null
+++ b/src/common/valgrind.h
@@ -0,0 +1,19 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_VALGRIND_H
+#define CEPH_VALGRIND_H
+
+#include "acconfig.h"
+
+#if defined(HAVE_VALGRIND_HELGRIND_H) && !defined(NDEBUG)
+  #include <valgrind/helgrind.h>
+#else
+  #define ANNOTATE_HAPPENS_AFTER(x)             (void)0
+  #define ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(x) (void)0
+  #define ANNOTATE_HAPPENS_BEFORE(x)            (void)0
+
+  #define ANNOTATE_BENIGN_RACE_SIZED(address, size, description) (void)0
+#endif
+
+#endif // CEPH_VALGRIND_H
diff --git a/src/common/version.cc b/src/common/version.cc
new file mode 100644
index 00000000..5da48ea3
--- /dev/null
+++ b/src/common/version.cc
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <string.h>
+
+#include "ceph_ver.h"
+#include "common/version.h"
+#include "include/rados.h"
+
+#include <sstream>
+
+#define _STR(x) #x
+#define STRINGIFY(x) _STR(x)
+
+// Returns the build-time CEPH_GIT_NICE_VER string.
+const char *ceph_version_to_str(void)
+{
+  return CEPH_GIT_NICE_VER;
+}
+
+// Returns the name ceph_release_name() gives for CEPH_RELEASE.
+const char *ceph_release_to_str(void)
+{
+  return ceph_release_name(CEPH_RELEASE);
+}
+
+// Returns CEPH_GIT_VER turned into a string literal by STRINGIFY().
+const char *git_version_to_str(void)
+{
+  return STRINGIFY(CEPH_GIT_VER);
+}
+
+// Builds the banner
+//   "ceph version <nice ver> (<git ver>) <release name> (<release type>)"
+// from the build-time version macros.
+std::string const pretty_version_to_str(void)
+{
+  std::string banner("ceph version ");
+  banner += CEPH_GIT_NICE_VER;
+  banner += " (";
+  banner += STRINGIFY(CEPH_GIT_VER);
+  banner += ") ";
+  banner += ceph_release_name(CEPH_RELEASE);
+  banner += " (";
+  banner += CEPH_RELEASE_TYPE;
+  banner += ")";
+  return banner;
+}
+
+// Returns the build-time CEPH_RELEASE number.
+unsigned ceph_release(void)
+{
+  return CEPH_RELEASE;
+}
+
+// Returns the build-time CEPH_RELEASE_TYPE string.
+const char *ceph_release_type(void)
+{
+  return CEPH_RELEASE_TYPE;
+}
diff --git a/src/common/version.h b/src/common/version.h
new file mode 100644
index 00000000..0a84ae44
--- /dev/null
+++ b/src/common/version.h
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_VERSION_H
+#define CEPH_COMMON_VERSION_H
+
+#include <string>
+
+// Return a string describing the Ceph version
+const char *ceph_version_to_str(void);
+
+// Return a string with the Ceph release
+const char *ceph_release_to_str(void);
+
+// Return a string describing the git version
+const char *git_version_to_str(void);
+
+// Return a formatted string describing the ceph and git versions
+std::string const pretty_version_to_str(void);
+
+// Release number (e.g., 12 for luminous); see CEPH_RELEASE_* in include/rados.h
+// and use const char *ceph_release_name(int) to translate to a string.
+unsigned ceph_release(void);
+
+// Release type ("dev", "rc", or "stable")
+const char *ceph_release_type(void);
+
+#endif
diff --git a/src/common/xattr.c b/src/common/xattr.c
new file mode 100644
index 00000000..c7afcf1e
--- /dev/null
+++ b/src/common/xattr.c
@@ -0,0 +1,278 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#if defined(__FreeBSD__)
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/types.h>
+#include <sys/extattr.h>
+#elif defined(__linux__)
+#include <sys/types.h>
+#include <sys/xattr.h>
+#elif defined(__APPLE__)
+#include <errno.h>
+#include <sys/xattr.h>
+#else
+#error "Your system is not supported!"
+#endif
+
+#include "common/xattr.h"
+
+/*
+ * Sets extended attribute on a file.
+ * Returns 0 on success, -1 on failure.
+ *
+ * Dispatches to the platform call selected at compile time.  On FreeBSD the
+ * attribute is written in EXTATTR_NAMESPACE_USER and a positive result from
+ * extattr_set_file() is folded to 0.
+ */
+int
+ceph_os_setxattr(const char *path, const char *name,
+    const void *value, size_t size)
+{
+	int error = -1;
+
+#if defined(__FreeBSD__)
+	error = extattr_set_file(path, EXTATTR_NAMESPACE_USER, name, value,
+	    size);
+	if (error > 0)
+		error = 0;
+#elif defined(__linux__) 
+	error = setxattr(path, name, value, size, 0);
+#elif defined(__APPLE__)
+	error = setxattr(path, name, value, size, 0 /* position */, 0);
+#endif
+
+	return (error);
+}
+
+/*
+ * Sets extended attribute 'name' on the open file 'fd'.
+ * Returns the platform call's result, or -1 if no platform branch applies.
+ * On FreeBSD a positive result from extattr_set_fd() is folded to 0.
+ */
+int
+ceph_os_fsetxattr(int fd, const char *name, const void *value,
+    size_t size)
+{
+	int error = -1;
+
+#if defined(__FreeBSD__)
+	error = extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, name, value, size);
+	if (error > 0)
+		error = 0;
+#elif defined(__linux__)
+	error = fsetxattr(fd, name, value, size, 0);
+#elif defined(__APPLE__)
+	error = fsetxattr(fd, name, value, size, 0, 0 /* no options should be identical to Linux */ );
+#endif
+
+	return (error);
+}
+
+/*
+ * Reads extended attribute 'name' of the file at 'path' into 'value'
+ * (capacity 'size') and returns the platform call's result, or -1.
+ *
+ * FreeBSD: when a non-empty buffer is supplied the attribute length is
+ * queried first; if it exceeds 'size' the call fails with -1 and
+ * errno = ERANGE instead of reading.
+ * macOS: an ENOATTR failure is reported as ENODATA.
+ */
+ssize_t
+ceph_os_getxattr(const char *path, const char *name,
+void *value, size_t size)
+{
+	ssize_t error = -1;
+
+#if defined(__FreeBSD__)
+	if (value == NULL || size == 0) {
+		/* no buffer supplied: pass the call straight through */
+		error = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name, value,
+		    size);
+	} else {
+		/* probe the attribute length before reading into 'value' */
+		error = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name, NULL,
+		    0);
+		if (error > 0) {
+			if (error > size) {
+				errno = ERANGE;
+				error = -1;
+			} else  {
+				error = extattr_get_file(path, EXTATTR_NAMESPACE_USER,
+				    name, value, size);
+			}
+		}
+	}
+#elif defined(__linux__)
+	error = getxattr(path, name, value, size);
+#elif defined(__APPLE__)
+	error = getxattr(path, name, value, size, 0 /* position  */, 0);
+	/* ENOATTR and ENODATA have different values */
+	if (error < 0 && errno == ENOATTR)
+		errno = ENODATA;
+#endif
+
+	return (error);
+}
+
+/*
+ * Reads extended attribute 'name' of the open file 'fd' into 'value'
+ * (capacity 'size') and returns the platform call's result, or -1.
+ *
+ * FreeBSD: when a non-empty buffer is supplied the attribute length is
+ * queried first; if it exceeds 'size' the call fails with -1 and
+ * errno = ERANGE instead of reading.
+ * macOS: an ENOATTR failure is reported as ENODATA.
+ */
+ssize_t
+ceph_os_fgetxattr(int fd, const char *name, void *value,
+    size_t size)
+{
+	ssize_t error = -1;
+
+#if defined(__FreeBSD__)
+	if (value == NULL || size == 0) {
+		/* no buffer supplied: pass the call straight through */
+		error = extattr_get_fd(fd, EXTATTR_NAMESPACE_USER, name, value,
+		    size);
+	} else {
+		/* probe the attribute length before reading into 'value' */
+		error = extattr_get_fd(fd, EXTATTR_NAMESPACE_USER, name, NULL,
+		    0);
+		if (error > 0) {
+			if (error > size) {
+				errno = ERANGE;
+				error = -1;
+			} else  {
+				error = extattr_get_fd(fd, EXTATTR_NAMESPACE_USER,
+				    name, value, size);
+			}
+		}
+	}
+#elif defined(__linux__)
+	error = fgetxattr(fd, name, value, size);
+#elif defined(__APPLE__)
+	error = fgetxattr(fd, name, value, size, 0, 0 /* no options */);
+	/* ENOATTR and ENODATA have different values */
+	if (error < 0 && errno == ENOATTR)
+		errno = ENODATA;
+#endif
+
+	return (error);
+}
+
+/*
+ * Lists the extended attribute names of the file at 'path' into 'list'
+ * (capacity 'size') and returns the platform call's result, or -1.
+ *
+ * FreeBSD: with a non-zero 'size' the names are fetched into a temporary
+ * buffer and rewritten from length-prefixed entries into consecutive
+ * NUL-terminated strings; the return value is then the number of bytes
+ * written to 'list'.  If the temporary buffer cannot be allocated the
+ * result stays -1.
+ */
+ssize_t
+ceph_os_listxattr(const char *path, char *list, size_t size)
+{
+	ssize_t error = -1;
+
+#if defined(__FreeBSD__)
+	/*
+	 * XXX. The format of the list FreeBSD returns differs
+	 * from the Linux ones.  We have to perform the conversion. :-(
+	 */
+	char *newlist, *p, *p1;
+
+	if (size != 0) {
+		newlist = malloc(size);
+		if (newlist != NULL) {
+			error = extattr_list_file(path, EXTATTR_NAMESPACE_USER,
+			    newlist, size);
+			if (error > 0) {
+				p = newlist;	/* read cursor in the raw list */
+				p1 = list;	/* write cursor in the caller's buffer */
+				while ((p - newlist) < error) {
+					/* each entry: one length byte, then the name */
+					uint8_t len = *(uint8_t *)p;
+					p++;
+					/* stop on an entry that runs past the data */
+					if ((p + len - newlist) > error)
+						break;
+					if (len > 0) {
+						bcopy(p, p1, len);
+						p += len;
+						p1 += len;
+						*p1++ = '\0';
+					}
+				}
+				error = p1 - list;
+			}
+			free(newlist);
+		}
+	} else {
+		error = extattr_list_file(path, EXTATTR_NAMESPACE_USER,
+		    list, size);
+	}
+#elif defined(__linux__)
+	error = listxattr(path, list, size);
+#elif defined(__APPLE__)
+	error = listxattr(path, list, size, 0);
+#endif
+
+	return (error);
+}
+
+/*
+ * Lists the extended attribute names of the open file 'fd' into 'list'
+ * (capacity 'size') and returns the platform call's result, or -1.
+ *
+ * FreeBSD: with a non-zero 'size' the names are fetched into a temporary
+ * buffer and rewritten from length-prefixed entries into consecutive
+ * NUL-terminated strings; the return value is then the number of bytes
+ * written to 'list'.  If the temporary buffer cannot be allocated the
+ * result stays -1.
+ */
+ssize_t
+ceph_os_flistxattr(int fd, char *list, size_t size)
+{
+	ssize_t error = -1;
+
+#if defined(__FreeBSD__)
+	/*
+	 * XXX. The format of the list FreeBSD returns differs
+	 * from the Linux ones.  We have to perform the conversion. :-(
+	 */
+	char *newlist, *p, *p1;
+
+	if (size != 0) {
+		newlist = malloc(size);
+		if (newlist != NULL) {
+			error = extattr_list_fd(fd, EXTATTR_NAMESPACE_USER,
+			    newlist, size);
+			if (error > 0) {
+				p = newlist;	/* read cursor in the raw list */
+				p1 = list;	/* write cursor in the caller's buffer */
+				while ((p - newlist) < error) {
+					/* each entry: one length byte, then the name */
+					uint8_t len = *(uint8_t *)p;
+					p++;
+					/* stop on an entry that runs past the data */
+					if ((p + len - newlist) > error)
+						break;
+					if (len > 0) {
+						bcopy(p, p1, len);
+						p += len;
+						p1 += len;
+						*p1++ = '\0';
+					}
+				}
+				error = p1 - list;
+			}
+			free(newlist);
+		}
+	} else {
+		error = extattr_list_fd(fd, EXTATTR_NAMESPACE_USER,
+		    list, size);
+	}
+#elif defined(__linux__)
+	error = flistxattr(fd, list, size);
+#elif defined(__APPLE__)
+	error = flistxattr(fd, list, size, 0);
+#endif
+
+	return (error);
+}
+
+/*
+ * Removes extended attribute 'name' from the file at 'path'.
+ * Returns the platform call's result, or -1 if no platform branch applies.
+ * macOS: an ENOATTR failure is reported as ENODATA.
+ */
+int
+ceph_os_removexattr(const char *path, const char *name)
+{
+	int error = -1;
+
+#if defined(__FreeBSD__)
+	error = extattr_delete_file(path, EXTATTR_NAMESPACE_USER, name);
+#elif defined(__linux__)
+	error = removexattr(path, name);
+#elif defined(__APPLE__)
+	error = removexattr(path, name, 0);
+	/* ENOATTR and ENODATA have different values */
+	if (error < 0 && errno == ENOATTR)
+		errno = ENODATA;
+#endif
+
+	return (error);
+}
+
+/*
+ * Removes extended attribute 'name' from the open file 'fd'.
+ * Returns the platform call's result, or -1 if no platform branch applies.
+ * macOS: an ENOATTR failure is reported as ENODATA.
+ */
+int
+ceph_os_fremovexattr(int fd, const char *name)
+{
+	int error = -1;
+
+#if defined(__FreeBSD__)
+	error = extattr_delete_fd(fd, EXTATTR_NAMESPACE_USER, name);
+#elif defined(__linux__)
+	error = fremovexattr(fd, name);
+#elif defined(__APPLE__)
+	error = fremovexattr(fd, name, 0);
+	/* ENOATTR and ENODATA have different values */
+	if (error < 0 && errno == ENOATTR)
+		errno = ENODATA;
+#endif
+
+	return (error);
+}
diff --git a/src/common/xattr.h b/src/common/xattr.h
new file mode 100644
index 00000000..15b35619
--- /dev/null
+++ b/src/common/xattr.h
@@ -0,0 +1,44 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ */
+
+#ifndef CEPH_EXTATTR_H
+#define CEPH_EXTATTR_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Almost everyone defines ENOATTR, except for Linux,
+// which does #define ENOATTR ENODATA.  It seems that occasionally that
+// isn't defined, though, so let's make sure.
+#ifndef ENOATTR
+# define ENOATTR ENODATA
+#endif
+
+int ceph_os_setxattr(const char *path, const char *name,
+                  const void *value, size_t size);
+int ceph_os_fsetxattr(int fd, const char *name, const void *value,
+                   size_t size);
+ssize_t ceph_os_getxattr(const char *path, const char *name,
+                         void *value, size_t size);
+ssize_t ceph_os_fgetxattr(int fd, const char *name, void *value,
+                          size_t size);
+ssize_t ceph_os_listxattr(const char *path, char *list, size_t size);
+ssize_t ceph_os_flistxattr(int fd, char *list, size_t size);
+int ceph_os_removexattr(const char *path, const char *name);
+int ceph_os_fremovexattr(int fd, const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !CEPH_EXTATTR_H */
diff --git a/src/common/zipkin_trace.h b/src/common/zipkin_trace.h
new file mode 100644
index 00000000..46e44d3a
--- /dev/null
+++ b/src/common/zipkin_trace.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef COMMON_ZIPKIN_TRACE_H
+#define COMMON_ZIPKIN_TRACE_H
+
+#include "acconfig.h"
+#include "include/encoding.h"
+
+#ifdef WITH_BLKIN
+
+#include <ztracer.hpp>
+
+#else // !WITH_BLKIN
+
+// add stubs for noop Trace and Endpoint
+
+// Stand-in for the !WITH_BLKIN build; it has to match the "real" struct.
+struct blkin_trace_info {
+    int64_t trace_id;        // 1st field handled by encode()/decode() below
+    int64_t span_id;         // 2nd field handled by encode()/decode() below
+    int64_t parent_span_id;  // 3rd field handled by encode()/decode() below
+};
+
+namespace ZTracer  // no-op stand-ins, compiled only in the !WITH_BLKIN branch
+{
+static inline int ztrace_init() { return 0; }  // nothing to set up; always 0
+
+class Endpoint {  // stub: every constructor and member has an empty body
+ public:
+  Endpoint(const char *name) {}
+  Endpoint(const char *ip, int port, const char *name) {}
+
+  void copy_ip(const std::string &newip) {}
+  void copy_name(const std::string &newname) {}
+  void copy_address_from(const Endpoint *endpoint) {}
+  void share_address_from(const Endpoint *endpoint) {}
+  void set_port(int p) {}
+};
+
+class Trace {  // stub: reports itself as invalid and ignores all input
+ public:
+  Trace() {}
+  Trace(const char *name, const Endpoint *ep, const Trace *parent = NULL) {}
+  Trace(const char *name, const Endpoint *ep,
+        const blkin_trace_info *i, bool child=false) {}
+
+  bool valid() const { return false; }  // a stub trace is never valid
+  operator bool() const { return false; }  // same answer as valid()
+
+  int init(const char *name, const Endpoint *ep, const Trace *parent = NULL) {
+    return 0;  // arguments are ignored
+  }
+  int init(const char *name, const Endpoint *ep,
+           const blkin_trace_info *i, bool child=false) {
+    return 0;  // arguments are ignored
+  }
+
+  void copy_name(const std::string &newname) {}
+
+  const blkin_trace_info* get_info() const { return NULL; }  // always NULL in the stub
+  void set_info(const blkin_trace_info *i) {}
+
+  void keyval(const char *key, const char *val) const {}
+  void keyval(const char *key, int64_t val) const {}
+  void keyval(const char *key, const char *val, const Endpoint *ep) const {}
+  void keyval(const char *key, int64_t val, const Endpoint *ep) const {}
+
+  void event(const char *event) const {}
+  void event(const char *event, const Endpoint *ep) const {}
+};
+} // namespace ZTracer
+
+#endif // !WITH_BLKIN
+
+static inline void encode(const blkin_trace_info& b, bufferlist& bl)  // sits after the WITH_BLKIN #endif, so built either way
+{
+  encode(b.trace_id, bl);  // field order is the same one decode() reads back
+  encode(b.span_id, bl);
+  encode(b.parent_span_id, bl);
+}
+
+static inline void decode(blkin_trace_info& b, bufferlist::const_iterator& p)  // sits after the WITH_BLKIN #endif, so built either way
+{
+  decode(b.trace_id, p);  // field order is the same one encode() writes
+  decode(b.span_id, p);
+  decode(b.parent_span_id, p);
+}
+
+
+
+#endif // COMMON_ZIPKIN_TRACE_H
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
commit	483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree	e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/common
parent	Initial commit. (diff)
download	ceph-upstream.tar.xz ceph-upstream.zip